1
0
Fork 0
mirror of synced 2024-05-07 22:12:36 +12:00

Slightly optimize the image comparison algorithm (#528)

* Split image checking into 2 functions

* Slightly optimize image finding

* 1.54.0 farewell
This commit is contained in:
Rafał Mikrut 2021-12-28 20:20:24 +01:00 committed by GitHub
parent d8700f6e78
commit 7da578fa7f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1,4 +1,4 @@
use std::collections::{BTreeMap, BTreeSet};
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fs::OpenOptions;
use std::fs::{File, Metadata};
use std::io::Write;
@ -47,7 +47,6 @@ const LOOP_DURATION: u32 = 200; //ms
/// Similarity value used by the similar-images search.
///
/// In this file it appears both as the configured search setting
/// (`self.similarity`) and as the per-file result stored in
/// `FileEntry::similarity`.
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Serialize, Deserialize)]
pub enum Similarity {
/// No similarity value assigned.
/// NOTE(review): the surrounding change replaces uses of this variant with
/// `Similar(0)` — confirm whether the variant is still present after the commit.
None,
/// Similarity expressed as a `u32` distance; the search stores the distance
/// reported by `bktree.find` here, with `0` used for the group's own hash.
/// NOTE(review): presumably a Hamming distance between image hashes — confirm.
Similar(u32),
}
@ -239,7 +238,11 @@ impl SimilarImages {
self.stopped_search = true;
return;
}
if !self.sort_images(stop_receiver, progress_sender) {
if !self.hash_images(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
if !self.find_similar_hashes(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
@ -412,7 +415,7 @@ impl SimilarImages {
},
hash: Vec::new(),
similarity: Similarity::None,
similarity: Similarity::Similar(0),
};
fe_result.push((current_file_name.to_string_lossy().to_string(), fe));
@ -450,7 +453,7 @@ impl SimilarImages {
// - Join already read hashes with hashes which were read from file
// - Join all hashes and save it to file
fn sort_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
fn hash_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
let hash_map_modification = SystemTime::now();
let loaded_hash_map;
@ -579,7 +582,6 @@ impl SimilarImages {
for (file_entry, buf) in &vec_file_entry {
// Only compare non-broken hashes (a hash of all 0s or all 255s means the algorithm failed to decode the image, e.g. because it contains a lot of alpha channel)
if !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255)) {
self.bktree.add(buf.clone());
self.image_hashes.entry(buf.clone()).or_insert_with(Vec::<FileEntry>::new);
self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone());
}
@ -595,18 +597,32 @@ impl SimilarImages {
}
Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - saving data to files".to_string());
true
}
fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
let hash_map_modification = SystemTime::now();
let Similarity::Similar(similarity) = self.similarity;
let similarity: u32 = match self.similarity {
Similarity::Similar(k) => k,
_ => panic!(),
};
// Results
let mut collected_similar_images: BTreeMap<Vec<u8>, Vec<FileEntry>> = Default::default();
let mut available_hashes = self.image_hashes.clone();
let mut temp_hashes = Default::default();
mem::swap(&mut temp_hashes, &mut self.image_hashes);
let mut this_time_check_hashes;
let mut master_of_group: BTreeSet<Vec<u8>> = Default::default(); // Lista wszystkich głównych hashy, które odpowiadają za porównywanie
let mut master_of_group: HashSet<Vec<u8>> = Default::default(); // Lista wszystkich głównych hashy, które odpowiadają za porównywanie
let mut available_hashes: HashMap<Vec<u8>, Vec<FileEntry>> = Default::default();
for (hash, vec_file_entry) in temp_hashes {
// There are 2 or more file entries with the same hash
if vec_file_entry.len() >= 2 {
collected_similar_images.insert(hash, vec_file_entry);
} else {
self.bktree.add(hash.clone());
available_hashes.insert(hash, vec_file_entry);
}
}
//// PROGRESS THREAD START
let progress_thread_run = Arc::new(AtomicBool::new(true));
@ -636,37 +652,10 @@ impl SimilarImages {
thread::spawn(|| {})
};
//// PROGRESS THREAD END
if similarity >= 1 {
for current_similarity in 1..=similarity {
this_time_check_hashes = available_hashes.clone();
for current_similarity in 0..=similarity {
this_time_check_hashes = available_hashes.clone();
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}
for (hash, vec_file_entry) in &this_time_check_hashes {
atomic_mode_counter.fetch_add(1, Ordering::Relaxed);
let vector_with_found_similar_hashes = self
.bktree
.find(hash, similarity)
.filter(|r| (r.0 == current_similarity) && !master_of_group.contains(r.1) && available_hashes.contains_key(r.1))
.collect::<Vec<_>>();
// Not found any hash with specific distance
if vector_with_found_similar_hashes.is_empty() {
continue;
}
// This one picture doesn't have similar pictures except self in similarity 0
if current_similarity == 0 && vector_with_found_similar_hashes.len() == 1 {
continue;
}
// This shouldn't be executed too many times, so it should be quite fast to check this
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
@ -674,43 +663,51 @@ impl SimilarImages {
return false;
}
// If it has not been added yet, add the main group to the already processed ones now
if !master_of_group.contains(hash) {
master_of_group.insert(hash.clone());
collected_similar_images.insert(hash.clone(), Vec::new());
for (hash, vec_file_entry) in this_time_check_hashes.into_iter() {
atomic_mode_counter.fetch_add(1, Ordering::Relaxed);
let mut things: Vec<FileEntry> = vec_file_entry
.iter()
.map(|fe| FileEntry {
path: fe.path.clone(),
size: fe.size,
dimensions: fe.dimensions.clone(),
modified_date: fe.modified_date,
hash: fe.hash.clone(),
similarity: Similarity::Similar(0),
})
.collect();
collected_similar_images.get_mut(hash).unwrap().append(&mut things);
}
// Find hashes at the specific distance from the currently checked hash
let vector_with_found_similar_hashes = self
.bktree
.find(&hash, similarity)
.filter(|(similarity, hash)| (*similarity == current_similarity) && !master_of_group.contains(*hash) && available_hashes.contains_key(*hash))
.collect::<Vec<_>>();
// Since we checked hash, we don't need to check it again
if current_similarity != 0 {
vector_with_found_similar_hashes.iter().for_each(|e| {
let mut things: Vec<FileEntry> = available_hashes
.get_mut(e.1)
.unwrap()
.iter()
.map(|fe| FileEntry {
path: fe.path.clone(),
size: fe.size,
dimensions: fe.dimensions.clone(),
modified_date: fe.modified_date,
hash: Vec::new(),
similarity: Similarity::Similar(current_similarity),
// Not found any hash with specific distance
if vector_with_found_similar_hashes.is_empty() {
continue;
}
// The currently checked hash isn't in any similarity group yet, so create one, because similar images were found
if !master_of_group.contains(&hash) {
master_of_group.insert(hash.clone());
collected_similar_images.insert(hash.clone(), Vec::new());
let mut things: Vec<FileEntry> = vec_file_entry
.into_iter()
.map(|mut fe| {
fe.similarity = Similarity::Similar(0);
fe
})
.collect::<Vec<_>>();
collected_similar_images.get_mut(hash).unwrap().append(&mut things);
available_hashes.remove(e.1);
.collect();
collected_similar_images.get_mut(&hash).unwrap().append(&mut things);
// This shouldn't be executed too many times, so it should be quite fast to check this
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}
}
vector_with_found_similar_hashes.iter().for_each(|(_similarity, other_hash)| {
let mut vec_fe = available_hashes.remove(*other_hash).unwrap();
for fe in &mut vec_fe {
fe.similarity = Similarity::Similar(current_similarity)
}
collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe);
});
}
}
@ -719,7 +716,8 @@ impl SimilarImages {
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
self.similar_vectors = collected_similar_images.values().cloned().collect();
// self.similar_vectors = collected_similar_images.into_values().collect(); // TODO use this in Rust 1.54.0
self.similar_vectors = collected_similar_images.values().cloned().collect(); // 1.53.0 version
if self.exclude_images_with_same_size {
let mut new_vector = Default::default();
@ -1028,9 +1026,9 @@ pub fn get_string_from_similarity(similarity: &Similarity, hash_size: u8) -> Str
};
match similarity {
Similarity::None => {
panic!()
}
// Similarity::None => {
// panic!()
// }
Similarity::Similar(h) => {
// #[cfg(debug_assertions)]
// {