diff --git a/Changelog.md b/Changelog.md index 3a26d21..5ae6295 100644 --- a/Changelog.md +++ b/Changelog.md @@ -5,7 +5,7 @@ - Allow to set number of threads from CLI - [#972](https://github.com/qarmin/czkawka/pull/972) - Fix problem with invalid item sorting in bad extensions mode - [#972](https://github.com/qarmin/czkawka/pull/972) - Big refactor/cleaning of code - [#956](https://github.com/qarmin/czkawka/pull/956)/[#970](https://github.com/qarmin/czkawka/pull/970)/[#972](https://github.com/qarmin/czkawka/pull/972) -- Use builtin gtk webp preview loader - [#923](https://github.com/qarmin/czkawka/pull/923) +- Use builtin gtk webp loader for previews - [#923](https://github.com/qarmin/czkawka/pull/923) - Fixed docker build - [#947](https://github.com/qarmin/czkawka/pull/947) - Restore snap builds broken since GTk 4 port - [#965](https://github.com/qarmin/czkawka/pull/947) - Instruction how to build native ARM64 binaries on Mac - [#945](https://github.com/qarmin/czkawka/pull/945)/[#971](https://github.com/qarmin/czkawka/pull/971) diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index 0e969bd..7828dbd 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -622,7 +622,49 @@ impl SimilarImages { Some((hashes_parents, hashes_similarity)) } - fn chunk_hashes(&mut self, all_hashed_images: &HashMap>, all_hashes: &Vec) -> (Vec>, HashSet) { + // Split hashes at 2 parts, base hashes and hashes to compare, 3 argument is set of hashes with multiple images + fn split_hashes(&mut self, all_hashed_images: &HashMap>) -> (Vec, Vec, HashSet) { + // Fast way to check if hash have multiple images - todo is this really needed? + let mut hashes_with_multiple_images: HashSet = Default::default(); + let mut base_hashes = Vec::new(); // Initial hashes + let mut hashes_to_compare = Vec::new(); + if self.use_reference_folders { + let mut files_from_referenced_folders: HashMap> = HashMap::new(); + let mut normal_files: HashMap> = HashMap::new(); + + all_hashed_images.clone().into_iter().for_each(|(hash, vec_file_entry)| { + for file_entry in vec_file_entry { + if is_in_reference_folder(&self.directories.reference_directories, &file_entry.path) { + files_from_referenced_folders.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); + } else { + normal_files.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); + } + } + }); + + for (hash, vec_files) in normal_files { + if vec_files.len() >= 2 { + hashes_with_multiple_images.insert(hash.clone()); + } + hashes_to_compare.push(hash.clone()); + self.bktree.add(hash); + } + + for (hash, vec_files) in files_from_referenced_folders { + if vec_files.len() >= 2 { + hashes_with_multiple_images.insert(hash.clone()); + } + base_hashes.push(hash.clone()); + } + } else { + base_hashes = all_hashed_images.clone().into_keys().collect::>(); + hashes_to_compare = base_hashes.clone(); + } + (base_hashes, hashes_to_compare, hashes_with_multiple_images) + } + + fn chunk_hashes(&mut self, all_hashed_images: &HashMap>) -> (Vec>, HashSet) { + let all_hashes = all_hashed_images.clone().into_keys().collect::>(); let mut hashes_with_multiple_images: HashSet = Default::default(); // Fast way to check if hash have multiple images let mut files_from_referenced_folders: HashMap> = HashMap::new(); let mut normal_files: HashMap> = HashMap::new(); @@ -635,10 +677,9 @@ impl SimilarImages { let mut chunks: Vec>; if self.use_reference_folders { - let reference_directories = self.directories.reference_directories.clone(); all_hashed_images.clone().into_iter().for_each(|(hash, vec_file_entry)| { for file_entry in vec_file_entry { - if reference_directories.iter().any(|e| file_entry.path.starts_with(e)) { + if is_in_reference_folder(&self.directories.reference_directories, &file_entry.path) { files_from_referenced_folders.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); } else { normal_files.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); @@ -661,7 +702,7 @@ impl SimilarImages { chunk_size = initial_hashes.len() / number_of_processors; chunks = if chunk_size > 0 { - initial_hashes.chunks(chunk_size).map(<[std::vec::Vec]>::to_vec).collect::>() + initial_hashes.chunks(chunk_size).map(<[Vec]>::to_vec).collect::>() } else { vec![initial_hashes] }; @@ -784,18 +825,77 @@ impl SimilarImages { debug_check_for_duplicated_things(self.use_reference_folders, &hashes_parents, &hashes_similarity, all_hashed_images, "LATTER"); // Just simple check if all original hashes with multiple entries are available in end results - let original_hashes_at_start = hashes_with_multiple_images.len(); - let original_hashes_in_end_results = hashes_parents - .iter() - .filter(|(parent_hash, _child_number)| hashes_with_multiple_images.contains(*parent_hash)) - .count(); if !self.use_reference_folders { + let original_hashes_at_start = hashes_with_multiple_images.len(); + let original_hashes_in_end_results = hashes_parents + .iter() + .filter(|(parent_hash, _child_number)| hashes_with_multiple_images.contains(*parent_hash)) + .count(); assert_eq!(original_hashes_at_start, original_hashes_in_end_results); } self.collect_hash_compare_result(hashes_parents, hashes_with_multiple_images, all_hashed_images, collected_similar_images, hashes_similarity); } + fn compare_hashes_with_non_zero_tolerance( + &mut self, + all_hashed_images: &HashMap>, + collected_similar_images: &mut HashMap>, + progress_sender: Option<&UnboundedSender>, + ) -> bool { + let (progress_thread_handle, progress_thread_run, atomic_counter, check_was_stopped) = + prepare_thread_handler_common(progress_sender, 2, 2, all_hashed_images.len(), CheckingMethod::None, self.tool_type); + + // Don't use hashes with multiple images in bktree, because they will always be master of group and cannot be find by other hashes + + let (base_hashes, _hashes_to_compare, hashes_with_multiple_images) = self.split_hashes(&all_hashed_images); + base_hashes.into_par_iter(); + + // Check them in chunks, to decrease number of used memory + let base_hashes_chunks = base_hashes.chunks(1000); + for chunk in base_hashes_chunks { + let partial_results = chunk + .into_par_iter() + .map(|hash_to_check| { + if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { + check_was_stopped.store(true, Ordering::Relaxed); + return None; + } + let mut found_items = self + .bktree + .find(hash_to_check, tolerance) + .filter(|(similarity, _hash)| if self.use_reference_folders { true } else { *similarity != 0 }) + .collect::>(); + + found_items.sort_unstable_by_key(|f| f.0); + Some((hash_to_check, found_items)) + }) + .while_some() + .collect::>(); + + if check_was_stopped.load(Ordering::Relaxed) { + send_info_and_wait_for_ending_all_threads(&progress_thread_run, progress_thread_handle); + return false; + } + + self.connect_results(partial_results, &hashes_with_multiple_images, collected_similar_images); + } + true + } + + fn connect_results(&mut self, partial_results: Vec<(&ImHash, Vec<(u32, &ImHash)>)>, hashes_with_multiple_images: &HashSet, collected_similar_images: &mut HashMap>) { + let mut hashes_parents: HashMap = Default::default(); + for (original_hash, vec_compared_hashes) in partial_results{ + for (similarity, compared_hash) in vec_compared_hashes { + if hashes_parents.contains_key(compared_hash) || + { + return; + } + } + } + } + } + fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&UnboundedSender>) -> bool { if self.image_hashes.is_empty() { return true; @@ -808,22 +908,27 @@ impl SimilarImages { let all_hashed_images = mem::take(&mut self.image_hashes); - let all_hashes: Vec<_> = all_hashed_images.clone().into_keys().collect(); - // Checking entries with tolerance 0 is really easy and fast, because only entries with same hashes needs to be checked if tolerance == 0 { - for (hash, vec_file_entry) in all_hashed_images.clone() { + for (hash, vec_file_entry) in all_hashed_images { if vec_file_entry.len() >= 2 { collected_similar_images.insert(hash, vec_file_entry); } } } else { let (progress_thread_handle, progress_thread_run, atomic_counter, check_was_stopped) = - prepare_thread_handler_common(progress_sender, 2, 2, all_hashes.len(), CheckingMethod::None, self.tool_type); + prepare_thread_handler_common(progress_sender, 2, 2, all_hashed_images.len(), CheckingMethod::None, self.tool_type); - // Don't use hashes with multiple images in bktree, because they will always be master of group and cannot be find by other hashes + if !self.compare_hashes_with_non_zero_tolerance(&all_hashed_images, &mut collected_similar_images, progress_sender) { + return false; + } - let (chunks, hashes_with_multiple_images) = self.chunk_hashes(&all_hashed_images, &all_hashes); + send_info_and_wait_for_ending_all_threads(&progress_thread_run, progress_thread_handle); + if check_was_stopped.load(Ordering::Relaxed) { + return false; + } + + let (chunks, hashes_with_multiple_images) = self.chunk_hashes(&all_hashed_images); let parts: Vec<_> = chunks .into_par_iter() @@ -852,11 +957,12 @@ impl SimilarImages { self.verify_duplicated_items(&collected_similar_images); + // Info about hashes is not needed anymore, so we drop this info self.similar_vectors = collected_similar_images.into_values().collect(); self.exclude_items_with_same_size(); - self.check_for_reference_folders(); + self.remove_multiple_records_from_reference_folders(); if self.use_reference_folders { for (_fe, vector) in &self.similar_referenced_vectors { @@ -896,7 +1002,7 @@ impl SimilarImages { } } - fn check_for_reference_folders(&mut self) { + fn remove_multiple_records_from_reference_folders(&mut self) { if self.use_reference_folders { self.similar_referenced_vectors = mem::take(&mut self.similar_vectors) .into_iter() @@ -917,12 +1023,14 @@ impl SimilarImages { #[allow(dead_code)] #[allow(unreachable_code)] #[allow(unused_variables)] + // TODO this probably not works good when reference folders are used pub fn verify_duplicated_items(&self, collected_similar_images: &HashMap>) { #[cfg(not(debug_assertions))] return; // Validating if group contains duplicated results let mut result_hashset: HashSet = Default::default(); let mut found = false; + dbg!(collected_similar_images.len()); for vec_file_entry in collected_similar_images.values() { if vec_file_entry.is_empty() { println!("Empty group");