1
0
Fork 0
mirror of synced 2024-05-19 20:02:24 +12:00

Optimize a little image compare algorithm (#528)

* Split checking images at 2 functions

* Optimize a little image finding

* 1.54.0 farawell
This commit is contained in:
Rafał Mikrut 2021-12-28 20:20:24 +01:00 committed by GitHub
parent d8700f6e78
commit 7da578fa7f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1,4 +1,4 @@
use std::collections::{BTreeMap, BTreeSet}; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fs::OpenOptions; use std::fs::OpenOptions;
use std::fs::{File, Metadata}; use std::fs::{File, Metadata};
use std::io::Write; use std::io::Write;
@ -47,7 +47,6 @@ const LOOP_DURATION: u32 = 200; //ms
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Serialize, Deserialize)] #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Serialize, Deserialize)]
pub enum Similarity { pub enum Similarity {
None,
Similar(u32), Similar(u32),
} }
@ -239,7 +238,11 @@ impl SimilarImages {
self.stopped_search = true; self.stopped_search = true;
return; return;
} }
if !self.sort_images(stop_receiver, progress_sender) { if !self.hash_images(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
if !self.find_similar_hashes(stop_receiver, progress_sender) {
self.stopped_search = true; self.stopped_search = true;
return; return;
} }
@ -412,7 +415,7 @@ impl SimilarImages {
}, },
hash: Vec::new(), hash: Vec::new(),
similarity: Similarity::None, similarity: Similarity::Similar(0),
}; };
fe_result.push((current_file_name.to_string_lossy().to_string(), fe)); fe_result.push((current_file_name.to_string_lossy().to_string(), fe));
@ -450,7 +453,7 @@ impl SimilarImages {
// - Join already read hashes with hashes which were read from file // - Join already read hashes with hashes which were read from file
// - Join all hashes and save it to file // - Join all hashes and save it to file
fn sort_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool { fn hash_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
let hash_map_modification = SystemTime::now(); let hash_map_modification = SystemTime::now();
let loaded_hash_map; let loaded_hash_map;
@ -579,7 +582,6 @@ impl SimilarImages {
for (file_entry, buf) in &vec_file_entry { for (file_entry, buf) in &vec_file_entry {
// Only use to comparing, non broken hashes(all 0 or 255 hashes means that algorithm fails to decode them because e.g. contains a log of alpha channel) // Only use to comparing, non broken hashes(all 0 or 255 hashes means that algorithm fails to decode them because e.g. contains a log of alpha channel)
if !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255)) { if !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255)) {
self.bktree.add(buf.clone());
self.image_hashes.entry(buf.clone()).or_insert_with(Vec::<FileEntry>::new); self.image_hashes.entry(buf.clone()).or_insert_with(Vec::<FileEntry>::new);
self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone()); self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone());
} }
@ -595,18 +597,32 @@ impl SimilarImages {
} }
Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - saving data to files".to_string()); Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - saving data to files".to_string());
true
}
fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
let hash_map_modification = SystemTime::now(); let hash_map_modification = SystemTime::now();
let Similarity::Similar(similarity) = self.similarity;
let similarity: u32 = match self.similarity { // Results
Similarity::Similar(k) => k,
_ => panic!(),
};
let mut collected_similar_images: BTreeMap<Vec<u8>, Vec<FileEntry>> = Default::default(); let mut collected_similar_images: BTreeMap<Vec<u8>, Vec<FileEntry>> = Default::default();
let mut available_hashes = self.image_hashes.clone(); let mut temp_hashes = Default::default();
mem::swap(&mut temp_hashes, &mut self.image_hashes);
let mut this_time_check_hashes; let mut this_time_check_hashes;
let mut master_of_group: BTreeSet<Vec<u8>> = Default::default(); // Lista wszystkich głównych hashy, które odpowiadają za porównywanie let mut master_of_group: HashSet<Vec<u8>> = Default::default(); // Lista wszystkich głównych hashy, które odpowiadają za porównywanie
let mut available_hashes: HashMap<Vec<u8>, Vec<FileEntry>> = Default::default();
for (hash, vec_file_entry) in temp_hashes {
// There exists 2 or more hashes with same hash
if vec_file_entry.len() >= 2 {
collected_similar_images.insert(hash, vec_file_entry);
} else {
self.bktree.add(hash.clone());
available_hashes.insert(hash, vec_file_entry);
}
}
//// PROGRESS THREAD START //// PROGRESS THREAD START
let progress_thread_run = Arc::new(AtomicBool::new(true)); let progress_thread_run = Arc::new(AtomicBool::new(true));
@ -636,37 +652,10 @@ impl SimilarImages {
thread::spawn(|| {}) thread::spawn(|| {})
}; };
//// PROGRESS THREAD END //// PROGRESS THREAD END
if similarity >= 1 {
for current_similarity in 1..=similarity {
this_time_check_hashes = available_hashes.clone();
for current_similarity in 0..=similarity {
this_time_check_hashes = available_hashes.clone();
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}
for (hash, vec_file_entry) in &this_time_check_hashes {
atomic_mode_counter.fetch_add(1, Ordering::Relaxed);
let vector_with_found_similar_hashes = self
.bktree
.find(hash, similarity)
.filter(|r| (r.0 == current_similarity) && !master_of_group.contains(r.1) && available_hashes.contains_key(r.1))
.collect::<Vec<_>>();
// Not found any hash with specific distance
if vector_with_found_similar_hashes.is_empty() {
continue;
}
// This one picture doesn't have similar pictures except self in similarity 0
if current_similarity == 0 && vector_with_found_similar_hashes.len() == 1 {
continue;
}
// This shouldn't be executed too much times, so it should be quite fast to check this
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End thread which send info to gui // End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed); progress_thread_run.store(false, Ordering::Relaxed);
@ -674,43 +663,51 @@ impl SimilarImages {
return false; return false;
} }
// Jeśli jeszcze nie dodał, to dodaje teraz grupę główną do już obrobionych for (hash, vec_file_entry) in this_time_check_hashes.into_iter() {
if !master_of_group.contains(hash) { atomic_mode_counter.fetch_add(1, Ordering::Relaxed);
master_of_group.insert(hash.clone());
collected_similar_images.insert(hash.clone(), Vec::new());
let mut things: Vec<FileEntry> = vec_file_entry // Finds hashes with specific distance to
.iter() let vector_with_found_similar_hashes = self
.map(|fe| FileEntry { .bktree
path: fe.path.clone(), .find(&hash, similarity)
size: fe.size, .filter(|(similarity, hash)| (*similarity == current_similarity) && !master_of_group.contains(*hash) && available_hashes.contains_key(*hash))
dimensions: fe.dimensions.clone(), .collect::<Vec<_>>();
modified_date: fe.modified_date,
hash: fe.hash.clone(),
similarity: Similarity::Similar(0),
})
.collect();
collected_similar_images.get_mut(hash).unwrap().append(&mut things);
}
// Since we checked hash, we don't need to check it again // Not found any hash with specific distance
if current_similarity != 0 { if vector_with_found_similar_hashes.is_empty() {
vector_with_found_similar_hashes.iter().for_each(|e| { continue;
let mut things: Vec<FileEntry> = available_hashes }
.get_mut(e.1)
.unwrap() // Current checked hash isn't in any group of similarity, so we create one, because found similar images
.iter() if !master_of_group.contains(&hash) {
.map(|fe| FileEntry { master_of_group.insert(hash.clone());
path: fe.path.clone(), collected_similar_images.insert(hash.clone(), Vec::new());
size: fe.size,
dimensions: fe.dimensions.clone(), let mut things: Vec<FileEntry> = vec_file_entry
modified_date: fe.modified_date, .into_iter()
hash: Vec::new(), .map(|mut fe| {
similarity: Similarity::Similar(current_similarity), fe.similarity = Similarity::Similar(0);
fe
}) })
.collect::<Vec<_>>(); .collect();
collected_similar_images.get_mut(hash).unwrap().append(&mut things); collected_similar_images.get_mut(&hash).unwrap().append(&mut things);
available_hashes.remove(e.1);
// This shouldn't be executed too much times, so it should be quite fast to check this
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}
}
vector_with_found_similar_hashes.iter().for_each(|(_similarity, other_hash)| {
let mut vec_fe = available_hashes.remove(*other_hash).unwrap();
for fe in &mut vec_fe {
fe.similarity = Similarity::Similar(current_similarity)
}
collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe);
}); });
} }
} }
@ -719,7 +716,8 @@ impl SimilarImages {
progress_thread_run.store(false, Ordering::Relaxed); progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap(); progress_thread_handle.join().unwrap();
self.similar_vectors = collected_similar_images.values().cloned().collect(); // self.similar_vectors = collected_similar_images.into_values().collect(); // TODO use this in Rust 1.54.0
self.similar_vectors = collected_similar_images.values().cloned().collect(); // 1.53.0 version
if self.exclude_images_with_same_size { if self.exclude_images_with_same_size {
let mut new_vector = Default::default(); let mut new_vector = Default::default();
@ -1028,9 +1026,9 @@ pub fn get_string_from_similarity(similarity: &Similarity, hash_size: u8) -> Str
}; };
match similarity { match similarity {
Similarity::None => { // Similarity::None => {
panic!() // panic!()
} // }
Similarity::Similar(h) => { Similarity::Similar(h) => {
// #[cfg(debug_assertions)] // #[cfg(debug_assertions)]
// { // {