From c183c52a9d6c88964941dcdb84c594b0e2f515e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= Date: Tue, 2 May 2023 16:15:20 +0200 Subject: [PATCH] Split chunked hashes --- czkawka_core/src/similar_images.rs | 134 +++++++++++++++-------------- 1 file changed, 70 insertions(+), 64 deletions(-) diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index 97a3837..303d503 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -609,12 +609,12 @@ impl SimilarImages { fn compare_hashes( &self, - hashes_to_check: &[&ImHash], + hashes_to_check: &[ImHash], atomic_mode_counter: &Arc, stop_receiver: Option<&Receiver<()>>, check_was_stopped: &AtomicBool, tolerance: u32, - hashes_with_multiple_images: &HashSet<&ImHash>, + hashes_with_multiple_images: &HashSet, all_hashed_images: &HashMap>, ) -> Option<(HashMap, HashMap)> { let mut hashes_parents: HashMap = Default::default(); // Hashes used as parent (hash, children_number_of_hash) @@ -668,6 +668,70 @@ impl SimilarImages { Some((hashes_parents, hashes_similarity)) } + fn chunk_hashes(&mut self, all_hashed_images: &HashMap>, all_hashes: &Vec) -> (Vec>, HashSet) { + let mut hashes_with_multiple_images: HashSet = Default::default(); // Fast way to check if hash have multiple images + let mut files_from_referenced_folders: HashMap> = HashMap::new(); + let mut normal_files: HashMap> = HashMap::new(); + + let number_of_processors = get_number_of_threads(); + let chunk_size; + + let mut initial_hashes: Vec = Vec::new(); + let mut additional_chunk_to_check: Vec = Default::default(); + + let mut chunks: Vec>; + if self.use_reference_folders { + let reference_directories = self.directories.reference_directories.clone(); + all_hashed_images.clone().into_iter().for_each(|(hash, vec_file_entry)| { + for file_entry in vec_file_entry { + if reference_directories.iter().any(|e| file_entry.path.starts_with(e)) { + files_from_referenced_folders.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); + } else { + normal_files.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); + } + } + }); + + for (hash, vec_files) in normal_files { + if vec_files.len() >= 2 { + hashes_with_multiple_images.insert(hash.clone()); + } + self.bktree.add(hash); + } + for (hash, vec_files) in files_from_referenced_folders { + if vec_files.len() >= 2 { + hashes_with_multiple_images.insert(hash.clone()); + } + initial_hashes.push(hash); + } + chunk_size = initial_hashes.len() / number_of_processors; + + chunks = if chunk_size > 0 { + initial_hashes.chunks(chunk_size).map(<[std::vec::Vec]>::to_vec).collect::>() + } else { + vec![initial_hashes] + }; + } else { + for (hash, vec_files) in all_hashed_images { + if vec_files.len() >= 2 { + additional_chunk_to_check.push(hash.clone()); + hashes_with_multiple_images.insert(hash.clone()); + } else { + self.bktree.add(hash.clone()); + } + } + chunk_size = all_hashes.len() / number_of_processors; + chunks = if chunk_size > 0 { + all_hashes.chunks(chunk_size).map(<[Vec]>::to_vec).collect::>() + } else { + vec![all_hashes.clone()] + }; + chunks.push(additional_chunk_to_check); + } + + (chunks, hashes_with_multiple_images) + } + fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender>) -> bool { if self.image_hashes.is_empty() { return true; @@ -682,7 +746,7 @@ impl SimilarImages { let mut all_hashed_images = Default::default(); mem::swap(&mut all_hashed_images, &mut self.image_hashes); - let all_hashes: Vec<_> = all_hashed_images.keys().collect(); + let all_hashes: Vec<_> = all_hashed_images.clone().into_keys().collect(); // Checking entries with tolerance 0 is really easy and fast, because only entries with same hashes needs to be checked if tolerance == 0 { @@ -698,72 +762,14 @@ impl SimilarImages { let progress_thread_handle = self.prepare_thread_handler_similar_images(progress_sender, &progress_thread_run, &atomic_mode_counter, 2, 2, all_hashes.len()); // Don't use hashes with multiple images in bktree, because they will always be master of group and cannot be find by other hashes - let mut hashes_with_multiple_images: HashSet<_> = Default::default(); // Fast way to check if hash have multiple images - let mut files_from_referenced_folders = HashMap::new(); - let mut normal_files = HashMap::new(); - - let number_of_processors = get_number_of_threads(); - let chunk_size; - let mut chunks: Vec<&[&ImHash]>; - - let mut initial_hashes: Vec<&ImHash> = Vec::new(); - let mut additional_chunk_to_check: Vec<&ImHash> = Default::default(); - - if self.use_reference_folders { - let reference_directories = self.directories.reference_directories.clone(); - all_hashed_images.clone().into_iter().for_each(|(hash, vec_file_entry)| { - for file_entry in vec_file_entry { - if reference_directories.iter().any(|e| file_entry.path.starts_with(e)) { - files_from_referenced_folders.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); - } else { - normal_files.entry(hash.clone()).or_insert_with(Vec::new).push(file_entry); - } - } - }); - - for (hash, vec_files) in &normal_files { - if vec_files.len() >= 2 { - hashes_with_multiple_images.insert(hash); - } - self.bktree.add(hash.clone()); - } - for (hash, vec_files) in &files_from_referenced_folders { - if vec_files.len() >= 2 { - hashes_with_multiple_images.insert(hash); - } - initial_hashes.push(hash); - } - chunk_size = initial_hashes.len() / number_of_processors; - - chunks = if chunk_size > 0 { - initial_hashes.chunks(chunk_size).collect::>() - } else { - vec![&initial_hashes] - }; - } else { - for (hash, vec_files) in &all_hashed_images { - if vec_files.len() >= 2 { - additional_chunk_to_check.push(hash); - hashes_with_multiple_images.insert(hash); - } else { - self.bktree.add(hash.clone()); - } - } - chunk_size = all_hashes.len() / number_of_processors; - chunks = if chunk_size > 0 { - all_hashes.chunks(chunk_size).collect::>() - } else { - vec![&all_hashes] - }; - chunks.push(&additional_chunk_to_check); - } + let (chunks, hashes_with_multiple_images) = self.chunk_hashes(&all_hashed_images, &all_hashes); let parts: Vec<_> = chunks .into_par_iter() .map(|hashes_to_check| { self.compare_hashes( - hashes_to_check, + &hashes_to_check, &atomic_mode_counter, stop_receiver, &check_was_stopped, @@ -995,7 +1001,7 @@ impl SimilarImages { fn image_to_check<'a>( hashes_parents: &mut HashMap, hashes_similarity: &mut HashMap, - hashes_with_multiple_images: &HashSet<&'a ImHash>, + hashes_with_multiple_images: &HashSet, hash_to_check: &'a ImHash, compared_hash: &'a ImHash, similarity: u32,