1
0
Fork 0
mirror of synced 2024-05-17 19:03:08 +12:00

Split into multiple parts

This commit is contained in:
Rafał Mikrut 2023-05-02 16:29:28 +02:00
parent c183c52a9d
commit c3c6042954

View file

@ -732,6 +732,119 @@ impl SimilarImages {
(chunks, hashes_with_multiple_images)
}
fn collect_hash_compare_result(
&self,
hashes_parents: HashMap<ImHash, u32>,
hashes_with_multiple_images: &HashSet<ImHash>,
all_hashed_images: &HashMap<ImHash, Vec<FileEntry>>,
collected_similar_images: &mut HashMap<ImHash, Vec<FileEntry>>,
hashes_similarity: HashMap<ImHash, (ImHash, u32)>,
) {
if self.use_reference_folders {
// This is same step as without reference folders, but also checks if children are inside/outside reference directories, because may happen, that one file is inside reference folder and other outside
// Collecting results to vector
for (parent_hash, child_number) in hashes_parents {
// If hash contains other hasher OR multiple images are available for checked hash
if child_number > 0 || hashes_with_multiple_images.contains(&parent_hash) {
let vec_fe = all_hashed_images
.get(&parent_hash)
.unwrap()
.iter()
.filter(|e| is_in_reference_folder(&self.directories.reference_directories, &e.path))
.cloned()
.collect();
collected_similar_images.insert(parent_hash.clone(), vec_fe);
}
}
for (child_hash, (parent_hash, similarity)) in hashes_similarity {
let mut vec_fe: Vec<_> = all_hashed_images
.get(&child_hash)
.unwrap()
.iter()
.filter(|e| !is_in_reference_folder(&self.directories.reference_directories, &e.path))
.cloned()
.collect();
for mut fe in &mut vec_fe {
fe.similarity = similarity;
}
collected_similar_images.get_mut(&parent_hash).unwrap().append(&mut vec_fe);
}
} else {
// Collecting results to vector
for (parent_hash, child_number) in hashes_parents {
// If hash contains other hasher OR multiple images are available for checked hash
if child_number > 0 || hashes_with_multiple_images.contains(&parent_hash) {
let vec_fe = all_hashed_images.get(&parent_hash).unwrap().clone();
collected_similar_images.insert(parent_hash.clone(), vec_fe);
}
}
for (child_hash, (parent_hash, similarity)) in hashes_similarity {
let mut vec_fe = all_hashed_images.get(&child_hash).unwrap().clone();
for mut fe in &mut vec_fe {
fe.similarity = similarity;
}
collected_similar_images.get_mut(&parent_hash).unwrap().append(&mut vec_fe);
}
}
}
fn check_for_duplicate_hashes(
&self,
parts: Vec<(HashMap<ImHash, u32>, HashMap<ImHash, (ImHash, u32)>)>,
hashes_with_multiple_images: &HashSet<ImHash>,
all_hashed_images: &HashMap<ImHash, Vec<FileEntry>>,
collected_similar_images: &mut HashMap<ImHash, Vec<FileEntry>>,
) {
let mut hashes_parents: HashMap<ImHash, u32> = Default::default();
let mut hashes_similarity: HashMap<ImHash, (ImHash, u32)> = Default::default();
let mut iter = parts.into_iter();
// At start fill arrays with first item
// Normal algorithm would do exactly same thing, but slower, one record after one
if let Some((first_hashes_parents, first_hashes_similarity)) = iter.next() {
hashes_parents = first_hashes_parents;
hashes_similarity = first_hashes_similarity;
}
for (partial_hashes_with_parents, partial_hashes_with_similarity) in iter {
for (parent_hash, _child_number) in partial_hashes_with_parents {
if !hashes_parents.contains_key(&parent_hash) && !hashes_similarity.contains_key(&parent_hash) {
hashes_parents.insert(parent_hash, 0);
}
}
for (hash_to_check, (compared_hash, similarity)) in partial_hashes_with_similarity {
image_to_check(
&mut hashes_parents,
&mut hashes_similarity,
hashes_with_multiple_images,
&hash_to_check,
&compared_hash,
similarity,
);
}
}
#[cfg(debug_assertions)]
if !self.use_reference_folders {
debug_check_for_duplicated_things(&hashes_parents, &hashes_similarity, all_hashed_images, "LATTER");
}
// Just simple check if all original hashes with multiple entries are available in end results
let original_hashes_at_start = hashes_with_multiple_images.len();
let original_hashes_in_end_results = hashes_parents
.iter()
.filter(|(parent_hash, _child_number)| hashes_with_multiple_images.contains(*parent_hash))
.count();
if !self.use_reference_folders {
assert_eq!(original_hashes_at_start, original_hashes_in_end_results);
}
self.collect_hash_compare_result(hashes_parents, hashes_with_multiple_images, all_hashed_images, collected_similar_images, hashes_similarity);
}
fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
if self.image_hashes.is_empty() {
return true;
@ -787,103 +900,89 @@ impl SimilarImages {
return false;
}
{
let mut hashes_parents: HashMap<ImHash, u32> = Default::default();
let mut hashes_similarity: HashMap<ImHash, (ImHash, u32)> = Default::default();
let mut iter = parts.into_iter();
// At start fill arrays with first item
// Normal algorithm would do exactly same thing, but slower, one record after one
if let Some((first_hashes_parents, first_hashes_similarity)) = iter.next() {
hashes_parents = first_hashes_parents;
hashes_similarity = first_hashes_similarity;
}
self.check_for_duplicate_hashes(parts, &hashes_with_multiple_images, &all_hashed_images, &mut collected_similar_images);
}
for (partial_hashes_with_parents, partial_hashes_with_similarity) in iter {
for (parent_hash, _child_number) in partial_hashes_with_parents {
if !hashes_parents.contains_key(&parent_hash) && !hashes_similarity.contains_key(&parent_hash) {
hashes_parents.insert(parent_hash, 0);
}
}
self.verify_duplicated_items(&collected_similar_images);
for (hash_to_check, (compared_hash, similarity)) in partial_hashes_with_similarity {
image_to_check(
&mut hashes_parents,
&mut hashes_similarity,
&hashes_with_multiple_images,
&hash_to_check,
&compared_hash,
similarity,
);
}
}
self.similar_vectors = collected_similar_images.into_values().collect();
#[cfg(debug_assertions)]
if !self.use_reference_folders {
debug_check_for_duplicated_things(&hashes_parents, &hashes_similarity, &all_hashed_images, "LATTER");
}
self.exclude_items_with_same_size();
// Just simple check if all original hashes with multiple entries are available in end results
let original_hashes_at_start = hashes_with_multiple_images.len();
let original_hashes_in_end_results = hashes_parents
.iter()
.filter(|(parent_hash, _child_number)| hashes_with_multiple_images.contains(*parent_hash))
.count();
if !self.use_reference_folders {
assert_eq!(original_hashes_at_start, original_hashes_in_end_results);
}
self.check_for_reference_folders();
if self.use_reference_folders {
// This is same step as without reference folders, but also checks if children are inside/outside reference directories, because may happen, that one file is inside reference folder and other outside
Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - selecting data from HashMap");
// Collecting results to vector
for (parent_hash, child_number) in hashes_parents {
// If hash contains other hasher OR multiple images are available for checked hash
if child_number > 0 || hashes_with_multiple_images.contains(&parent_hash) {
let vec_fe = all_hashed_images
.get(&parent_hash)
.unwrap()
.iter()
.filter(|e| is_in_reference_folder(&self.directories.reference_directories, &e.path))
.cloned()
.collect();
collected_similar_images.insert(parent_hash.clone(), vec_fe);
}
}
for (child_hash, (parent_hash, similarity)) in hashes_similarity {
let mut vec_fe: Vec<_> = all_hashed_images
.get(&child_hash)
.unwrap()
.iter()
.filter(|e| !is_in_reference_folder(&self.directories.reference_directories, &e.path))
.cloned()
.collect();
for mut fe in &mut vec_fe {
fe.similarity = similarity;
}
collected_similar_images.get_mut(&parent_hash).unwrap().append(&mut vec_fe);
}
} else {
// Collecting results to vector
for (parent_hash, child_number) in hashes_parents {
// If hash contains other hasher OR multiple images are available for checked hash
if child_number > 0 || hashes_with_multiple_images.contains(&parent_hash) {
let vec_fe = all_hashed_images.get(&parent_hash).unwrap().clone();
collected_similar_images.insert(parent_hash.clone(), vec_fe);
}
}
for (child_hash, (parent_hash, similarity)) in hashes_similarity {
let mut vec_fe = all_hashed_images.get(&child_hash).unwrap().clone();
for mut fe in &mut vec_fe {
fe.similarity = similarity;
}
collected_similar_images.get_mut(&parent_hash).unwrap().append(&mut vec_fe);
}
}
if self.use_reference_folders {
for (_fe, vector) in &self.similar_referenced_vectors {
self.information.number_of_duplicates += vector.len();
self.information.number_of_groups += 1;
}
} else {
for vector in &self.similar_vectors {
self.information.number_of_duplicates += vector.len() - 1;
self.information.number_of_groups += 1;
}
}
// Clean unused data
self.image_hashes = Default::default();
self.images_to_check = Default::default();
self.bktree = BKTree::new(Hamming);
true
}
fn exclude_items_with_same_size(&mut self) {
if self.exclude_images_with_same_size {
let mut new_vector = Default::default();
mem::swap(&mut self.similar_vectors, &mut new_vector);
for vec_file_entry in new_vector {
let mut bt_sizes: BTreeSet<u64> = Default::default();
let mut vec_values = Vec::new();
for file_entry in vec_file_entry {
if !bt_sizes.contains(&file_entry.size) {
bt_sizes.insert(file_entry.size);
vec_values.push(file_entry);
}
}
if vec_values.len() > 1 {
self.similar_vectors.push(vec_values);
}
}
}
}
fn check_for_reference_folders(&mut self) {
if self.use_reference_folders {
let mut similar_vector = Default::default();
mem::swap(&mut self.similar_vectors, &mut similar_vector);
let reference_directories = self.directories.reference_directories.clone();
self.similar_referenced_vectors = similar_vector
.into_iter()
.filter_map(|vec_file_entry| {
let mut files_from_referenced_folders = Vec::new();
let mut normal_files = Vec::new();
for file_entry in vec_file_entry {
if reference_directories.iter().any(|e| file_entry.path.starts_with(e)) {
files_from_referenced_folders.push(file_entry);
} else {
normal_files.push(file_entry);
}
}
if files_from_referenced_folders.is_empty() || normal_files.is_empty() {
None
} else {
Some((files_from_referenced_folders.pop().unwrap(), normal_files))
}
})
.collect::<Vec<(FileEntry, Vec<FileEntry>)>>();
}
}
#[allow(dead_code)]
pub fn verify_duplicated_items(&self, collected_similar_images: &HashMap<ImHash, Vec<FileEntry>>) {
// Validating if group contains duplicated results
#[cfg(debug_assertions)]
{
@ -912,72 +1011,6 @@ impl SimilarImages {
}
assert!(!found, "Found Invalid entries, verify errors before"); // TODO crashes with empty result with reference folder, verify why
}
self.similar_vectors = collected_similar_images.into_values().collect();
if self.exclude_images_with_same_size {
let mut new_vector = Default::default();
mem::swap(&mut self.similar_vectors, &mut new_vector);
for vec_file_entry in new_vector {
let mut bt_sizes: BTreeSet<u64> = Default::default();
let mut vec_values = Vec::new();
for file_entry in vec_file_entry {
if !bt_sizes.contains(&file_entry.size) {
bt_sizes.insert(file_entry.size);
vec_values.push(file_entry);
}
}
if vec_values.len() > 1 {
self.similar_vectors.push(vec_values);
}
}
}
if self.use_reference_folders {
let mut similar_vector = Default::default();
mem::swap(&mut self.similar_vectors, &mut similar_vector);
let reference_directories = self.directories.reference_directories.clone();
self.similar_referenced_vectors = similar_vector
.into_iter()
.filter_map(|vec_file_entry| {
let mut files_from_referenced_folders = Vec::new();
let mut normal_files = Vec::new();
for file_entry in vec_file_entry {
if reference_directories.iter().any(|e| file_entry.path.starts_with(e)) {
files_from_referenced_folders.push(file_entry);
} else {
normal_files.push(file_entry);
}
}
if files_from_referenced_folders.is_empty() || normal_files.is_empty() {
None
} else {
Some((files_from_referenced_folders.pop().unwrap(), normal_files))
}
})
.collect::<Vec<(FileEntry, Vec<FileEntry>)>>();
}
Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - selecting data from HashMap");
if self.use_reference_folders {
for (_fe, vector) in &self.similar_referenced_vectors {
self.information.number_of_duplicates += vector.len();
self.information.number_of_groups += 1;
}
} else {
for vector in &self.similar_vectors {
self.information.number_of_duplicates += vector.len() - 1;
self.information.number_of_groups += 1;
}
}
// Clean unused data
self.image_hashes = Default::default();
self.images_to_check = Default::default();
self.bktree = BKTree::new(Hamming);
true
}
/// Set included dir which needs to be relative, exists etc.