From 7da578fa7fe3391988d75aec9f9432e590d2cd96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com> Date: Tue, 28 Dec 2021 20:20:24 +0100 Subject: [PATCH] Optimize a little image compare algorithm (#528) * Split checking images at 2 functions * Optimize a little image finding * 1.54.0 farawell --- czkawka_core/src/similar_images.rs | 160 ++++++++++++++--------------- 1 file changed, 79 insertions(+), 81 deletions(-) diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index e1f9653..8345258 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, BTreeSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::OpenOptions; use std::fs::{File, Metadata}; use std::io::Write; @@ -47,7 +47,6 @@ const LOOP_DURATION: u32 = 200; //ms #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Serialize, Deserialize)] pub enum Similarity { - None, Similar(u32), } @@ -239,7 +238,11 @@ impl SimilarImages { self.stopped_search = true; return; } - if !self.sort_images(stop_receiver, progress_sender) { + if !self.hash_images(stop_receiver, progress_sender) { + self.stopped_search = true; + return; + } + if !self.find_similar_hashes(stop_receiver, progress_sender) { self.stopped_search = true; return; } @@ -412,7 +415,7 @@ impl SimilarImages { }, hash: Vec::new(), - similarity: Similarity::None, + similarity: Similarity::Similar(0), }; fe_result.push((current_file_name.to_string_lossy().to_string(), fe)); @@ -450,7 +453,7 @@ impl SimilarImages { // - Join already read hashes with hashes which were read from file // - Join all hashes and save it to file - fn sort_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender>) -> bool { + fn hash_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender>) -> bool { let hash_map_modification = SystemTime::now(); let loaded_hash_map; @@ -579,7 +582,6 @@ impl SimilarImages { for (file_entry, buf) in &vec_file_entry { // Only use to comparing, non broken hashes(all 0 or 255 hashes means that algorithm fails to decode them because e.g. contains a log of alpha channel) if !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255)) { - self.bktree.add(buf.clone()); self.image_hashes.entry(buf.clone()).or_insert_with(Vec::::new); self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone()); } @@ -595,18 +597,32 @@ impl SimilarImages { } Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - saving data to files".to_string()); + true + } + + fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender>) -> bool { let hash_map_modification = SystemTime::now(); + let Similarity::Similar(similarity) = self.similarity; - let similarity: u32 = match self.similarity { - Similarity::Similar(k) => k, - _ => panic!(), - }; - + // Results let mut collected_similar_images: BTreeMap, Vec> = Default::default(); - let mut available_hashes = self.image_hashes.clone(); + let mut temp_hashes = Default::default(); + mem::swap(&mut temp_hashes, &mut self.image_hashes); + let mut this_time_check_hashes; - let mut master_of_group: BTreeSet> = Default::default(); // Lista wszystkich głównych hashy, które odpowiadają za porównywanie + let mut master_of_group: HashSet> = Default::default(); // Lista wszystkich głównych hashy, które odpowiadają za porównywanie + + let mut available_hashes: HashMap, Vec> = Default::default(); + for (hash, vec_file_entry) in temp_hashes { + // There exists 2 or more hashes with same hash + if vec_file_entry.len() >= 2 { + collected_similar_images.insert(hash, vec_file_entry); + } else { + self.bktree.add(hash.clone()); + available_hashes.insert(hash, vec_file_entry); + } + } //// PROGRESS THREAD START let progress_thread_run = Arc::new(AtomicBool::new(true)); @@ -636,37 +652,10 @@ impl SimilarImages { thread::spawn(|| {}) }; //// PROGRESS THREAD END + if similarity >= 1 { + for current_similarity in 1..=similarity { + this_time_check_hashes = available_hashes.clone(); - for current_similarity in 0..=similarity { - this_time_check_hashes = available_hashes.clone(); - - if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { - // End thread which send info to gui - progress_thread_run.store(false, Ordering::Relaxed); - progress_thread_handle.join().unwrap(); - return false; - } - - for (hash, vec_file_entry) in &this_time_check_hashes { - atomic_mode_counter.fetch_add(1, Ordering::Relaxed); - - let vector_with_found_similar_hashes = self - .bktree - .find(hash, similarity) - .filter(|r| (r.0 == current_similarity) && !master_of_group.contains(r.1) && available_hashes.contains_key(r.1)) - .collect::>(); - - // Not found any hash with specific distance - if vector_with_found_similar_hashes.is_empty() { - continue; - } - - // This one picture doesn't have similar pictures except self in similarity 0 - if current_similarity == 0 && vector_with_found_similar_hashes.len() == 1 { - continue; - } - - // This shouldn't be executed too much times, so it should be quite fast to check this if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { // End thread which send info to gui progress_thread_run.store(false, Ordering::Relaxed); @@ -674,43 +663,51 @@ impl SimilarImages { return false; } - // Jeśli jeszcze nie dodał, to dodaje teraz grupę główną do już obrobionych - if !master_of_group.contains(hash) { - master_of_group.insert(hash.clone()); - collected_similar_images.insert(hash.clone(), Vec::new()); + for (hash, vec_file_entry) in this_time_check_hashes.into_iter() { + atomic_mode_counter.fetch_add(1, Ordering::Relaxed); - let mut things: Vec = vec_file_entry - .iter() - .map(|fe| FileEntry { - path: fe.path.clone(), - size: fe.size, - dimensions: fe.dimensions.clone(), - modified_date: fe.modified_date, - hash: fe.hash.clone(), - similarity: Similarity::Similar(0), - }) - .collect(); - collected_similar_images.get_mut(hash).unwrap().append(&mut things); - } + // Finds hashes with specific distance to + let vector_with_found_similar_hashes = self + .bktree + .find(&hash, similarity) + .filter(|(similarity, hash)| (*similarity == current_similarity) && !master_of_group.contains(*hash) && available_hashes.contains_key(*hash)) + .collect::>(); - // Since we checked hash, we don't need to check it again - if current_similarity != 0 { - vector_with_found_similar_hashes.iter().for_each(|e| { - let mut things: Vec = available_hashes - .get_mut(e.1) - .unwrap() - .iter() - .map(|fe| FileEntry { - path: fe.path.clone(), - size: fe.size, - dimensions: fe.dimensions.clone(), - modified_date: fe.modified_date, - hash: Vec::new(), - similarity: Similarity::Similar(current_similarity), + // Not found any hash with specific distance + if vector_with_found_similar_hashes.is_empty() { + continue; + } + + // Current checked hash isn't in any group of similarity, so we create one, because found similar images + if !master_of_group.contains(&hash) { + master_of_group.insert(hash.clone()); + collected_similar_images.insert(hash.clone(), Vec::new()); + + let mut things: Vec = vec_file_entry + .into_iter() + .map(|mut fe| { + fe.similarity = Similarity::Similar(0); + fe }) - .collect::>(); - collected_similar_images.get_mut(hash).unwrap().append(&mut things); - available_hashes.remove(e.1); + .collect(); + collected_similar_images.get_mut(&hash).unwrap().append(&mut things); + + // This shouldn't be executed too much times, so it should be quite fast to check this + if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { + // End thread which send info to gui + progress_thread_run.store(false, Ordering::Relaxed); + progress_thread_handle.join().unwrap(); + return false; + } + } + + vector_with_found_similar_hashes.iter().for_each(|(_similarity, other_hash)| { + let mut vec_fe = available_hashes.remove(*other_hash).unwrap(); + for fe in &mut vec_fe { + fe.similarity = Similarity::Similar(current_similarity) + } + + collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe); }); } } @@ -719,7 +716,8 @@ impl SimilarImages { progress_thread_run.store(false, Ordering::Relaxed); progress_thread_handle.join().unwrap(); - self.similar_vectors = collected_similar_images.values().cloned().collect(); + // self.similar_vectors = collected_similar_images.into_values().collect(); // TODO use this in Rust 1.54.0 + self.similar_vectors = collected_similar_images.values().cloned().collect(); // 1.53.0 version if self.exclude_images_with_same_size { let mut new_vector = Default::default(); @@ -1028,9 +1026,9 @@ pub fn get_string_from_similarity(similarity: &Similarity, hash_size: u8) -> Str }; match similarity { - Similarity::None => { - panic!() - } + // Similarity::None => { + // panic!() + // } Similarity::Similar(h) => { // #[cfg(debug_assertions)] // {