From b61bd09860ddfc544d3be2d9c67d9cd6b8119078 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mikrut?=
Date: Tue, 28 Mar 2023 20:27:41 +0200
Subject: [PATCH] Partial hashing

---
 czkawka_core/src/duplicate.rs | 237 ++++++++++++++++++----------------
 1 file changed, 127 insertions(+), 110 deletions(-)

diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index c2e65ef..45b2de1 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -459,7 +459,7 @@ impl DuplicateFinder {
             }
         }
 
-        self.check_references_for_size();
+        self.filter_reference_folders_by_size();
         self.calculate_size_stats();
 
         Common::print_time(start_time, SystemTime::now(), "check_files_size");
@@ -490,7 +490,7 @@ impl DuplicateFinder {
     /// This step checks for references, but only when checking by size.
     /// This is needed because reference folders are later matched by hash, not by size.
-    fn check_references_for_size(&mut self) {
+    fn filter_reference_folders_by_size(&mut self) {
         if self.use_reference_folders && self.check_method == CheckingMethod::Size {
             let mut btree_map = Default::default();
             mem::swap(&mut self.files_with_identical_size, &mut btree_map);
@@ -521,7 +521,7 @@ impl DuplicateFinder {
         }
     }
 
-    // TODO Generalize this if possible
+    // TODO Generalize this if possible with different tools
     fn prepare_hash_thread_handler(
         &self,
         progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>,
@@ -556,15 +556,15 @@ impl DuplicateFinder {
         }
     }
 
-    /// The slowest checking type, which must be applied after checking for size
-    fn check_files_hash(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
-        assert_eq!(self.check_method, CheckingMethod::Hash);
-
-        let check_type = Arc::new(self.hash_type);
-
+    fn prehashing(
+        &mut self,
+        stop_receiver: Option<&Receiver<()>>,
+        progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>,
+        pre_checked_map: &mut BTreeMap<u64, Vec<FileEntry>>,
+    ) -> Option<()> {
         let start_time: SystemTime = SystemTime::now();
+        let check_type = self.hash_type;
         let check_was_stopped = AtomicBool::new(false); // Used for breaking from GUI and ending check thread
-        let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
 
         let progress_thread_run = Arc::new(AtomicBool::new(true));
         let atomic_file_counter = Arc::new(AtomicUsize::new(0));
@@ -577,133 +577,150 @@ impl DuplicateFinder {
             self.files_with_identical_size.values().map(Vec::len).sum(),
         );
 
-        ///////////////////////////////////////////////////////////////////////////// PREHASHING START
-        {
-            let loaded_hash_map;
-            let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
-            let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+        let loaded_hash_map;
+        let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+        let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
 
-            // Cache algorithm
-            // - Load data from cache
-            // - Convert from BT<u64, Vec<FileEntry>> to BT<String, FileEntry>
-            // - Save to proper values
-            if self.use_prehash_cache {
-                loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, true) {
-                    Some(t) => t,
-                    None => Default::default(),
-                };
+        // Cache algorithm
+        // - Load data from cache
+        // - Convert from BT<u64, Vec<FileEntry>> to BT<String, FileEntry>
+        // - Save to proper values
+        if self.use_prehash_cache {
+            loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, true) {
+                Some(t) => t,
+                None => Default::default(),
+            };
 
-                let mut loaded_hash_map2: BTreeMap<String, FileEntry> = Default::default();
-                for vec_file_entry in loaded_hash_map.values() {
-                    for file_entry in vec_file_entry {
-                        loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
-                    }
-                }
+            let mut loaded_hash_map2: BTreeMap<String, FileEntry> = Default::default();
+            for vec_file_entry in loaded_hash_map.values() {
+                for file_entry in vec_file_entry {
+                    loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
+                }
+            }
 
-                #[allow(clippy::if_same_then_else)]
-                for vec_file_entry in self.files_with_identical_size.values() {
-                    for file_entry in vec_file_entry {
-                        let name = file_entry.path.to_string_lossy().to_string();
-                        if !loaded_hash_map2.contains_key(&name) {
-                            // If loaded data doesn't contain current file info
-                            non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new).push(file_entry.clone());
-                        } else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date {
-                            // When the size or modification date changed, it is clearly a different file
-                            non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new).push(file_entry.clone());
-                        } else {
-                            // Checking may be omitted when there is already an entry with the same size and modification date
-                            records_already_cached.entry(file_entry.size).or_insert_with(Vec::new).push(file_entry.clone());
-                        }
-                    }
-                }
-            } else {
-                loaded_hash_map = Default::default();
-                mem::swap(&mut self.files_with_identical_size, &mut non_cached_files_to_check);
-            }
+            #[allow(clippy::if_same_then_else)]
+            for vec_file_entry in self.files_with_identical_size.values() {
+                for file_entry in vec_file_entry {
+                    let name = file_entry.path.to_string_lossy().to_string();
+                    if !loaded_hash_map2.contains_key(&name) {
+                        // If loaded data doesn't contain current file info
+                        non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new).push(file_entry.clone());
+                    } else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date {
+                        // When the size or modification date changed, it is clearly a different file
+                        non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new).push(file_entry.clone());
+                    } else {
+                        // Checking may be omitted when there is already an entry with the same size and modification date
+                        records_already_cached.entry(file_entry.size).or_insert_with(Vec::new).push(file_entry.clone());
+                    }
+                }
+            }
+        } else {
+            loaded_hash_map = Default::default();
+            mem::swap(&mut self.files_with_identical_size, &mut non_cached_files_to_check);
+        }
 
-            #[allow(clippy::type_complexity)]
-            let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
-                .par_iter()
-                .map(|(size, vec_file_entry)| {
-                    let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
-                    let mut errors: Vec<String> = Vec::new();
-                    let mut buffer = [0u8; 1024 * 2];
-
-                    atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
-                    for file_entry in vec_file_entry {
-                        if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
-                            check_was_stopped.store(true, Ordering::Relaxed);
-                            return None;
-                        }
-                        match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
-                            Ok(hash_string) => {
-                                hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new).push(file_entry.clone());
-                            }
-                            Err(s) => errors.push(s),
-                        }
-                    }
-                    Some((*size, hashmap_with_hash, errors))
-                })
-                .while_some()
-                .collect();
+        #[allow(clippy::type_complexity)]
+        let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
+            .par_iter()
+            .map(|(size, vec_file_entry)| {
+                let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
+                let mut errors: Vec<String> = Vec::new();
+                let mut buffer = [0u8; 1024 * 2];
+
+                atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
+                for file_entry in vec_file_entry {
+                    if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
+                        check_was_stopped.store(true, Ordering::Relaxed);
+                        return None;
+                    }
+                    match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
+                        Ok(hash_string) => {
+                            hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new).push(file_entry.clone());
+                        }
+                        Err(s) => errors.push(s),
+                    }
+                }
+                Some((*size, hashmap_with_hash, errors))
+            })
+            .while_some()
+            .collect();
 
-            // End thread which sends info to the GUI
-            progress_thread_run.store(false, Ordering::Relaxed);
-            progress_thread_handle.join().unwrap();
+        // End thread which sends info to the GUI
+        progress_thread_run.store(false, Ordering::Relaxed);
+        progress_thread_handle.join().unwrap();
 
-            // Check if user aborted search (only from GUI)
-            if check_was_stopped.load(Ordering::Relaxed) {
-                return false;
-            }
+        // Check if user aborted search (only from GUI)
+        if check_was_stopped.load(Ordering::Relaxed) {
+            return None;
+        }
 
-            // Add data from cache
-            for (size, vec_file_entry) in &records_already_cached {
-                pre_checked_map.entry(*size).or_insert_with(Vec::new).append(&mut vec_file_entry.clone());
-            }
+        // Add data from cache
+        for (size, vec_file_entry) in &records_already_cached {
+            pre_checked_map.entry(*size).or_insert_with(Vec::new).append(&mut vec_file_entry.clone());
+        }
 
-            // Check results
-            for (size, hash_map, errors) in &pre_hash_results {
-                self.text_messages.warnings.append(&mut errors.clone());
-                for vec_file_entry in hash_map.values() {
-                    if vec_file_entry.len() > 1 {
-                        pre_checked_map.entry(*size).or_insert_with(Vec::new).append(&mut vec_file_entry.clone());
-                    }
-                }
-            }
+        // Check results
+        for (size, hash_map, errors) in &pre_hash_results {
+            self.text_messages.warnings.append(&mut errors.clone());
+            for vec_file_entry in hash_map.values() {
+                if vec_file_entry.len() > 1 {
+                    pre_checked_map.entry(*size).or_insert_with(Vec::new).append(&mut vec_file_entry.clone());
+                }
+            }
+        }
 
-            if self.use_prehash_cache {
-                // All results = records already cached + computed results
-                let mut save_cache_to_hashmap: BTreeMap<String, FileEntry> = Default::default();
-
-                for (size, vec_file_entry) in loaded_hash_map {
-                    if size >= self.minimal_prehash_cache_file_size {
-                        for file_entry in vec_file_entry {
-                            save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
-                        }
-                    }
-                }
-
-                for (size, hash_map, _errors) in &pre_hash_results {
-                    if *size >= self.minimal_prehash_cache_file_size {
-                        for vec_file_entry in hash_map.values() {
-                            for file_entry in vec_file_entry {
-                                save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
-                            }
-                        }
-                    }
-                }
-
-                save_hashes_to_file(&save_cache_to_hashmap, &mut self.text_messages, &self.hash_type, true, self.minimal_prehash_cache_file_size);
-            }
-        }
+        if self.use_prehash_cache {
+            // All results = records already cached + computed results
+            let mut save_cache_to_hashmap: BTreeMap<String, FileEntry> = Default::default();
+
+            for (size, vec_file_entry) in loaded_hash_map {
+                if size >= self.minimal_prehash_cache_file_size {
+                    for file_entry in vec_file_entry {
+                        save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
+                    }
+                }
+            }
+
+            for (size, hash_map, _errors) in &pre_hash_results {
+                if *size >= self.minimal_prehash_cache_file_size {
+                    for vec_file_entry in hash_map.values() {
+                        for file_entry in vec_file_entry {
+                            save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
+                        }
+                    }
+                }
+            }
+
+            save_hashes_to_file(&save_cache_to_hashmap, &mut self.text_messages, &self.hash_type, true, self.minimal_prehash_cache_file_size);
+        }
+
+        Common::print_time(start_time, SystemTime::now(), "check_files_hash - prehash");
+
+        Some(())
+    }
+
+    fn full_hashing(&mut self) {}
+
+    /// The slowest checking type, which must be applied after checking for size
+    fn check_files_hash(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
+        assert_eq!(self.check_method, CheckingMethod::Hash);
+
+        let check_type = self.hash_type;
+
+        let check_was_stopped = AtomicBool::new(false); // Used for breaking from GUI and ending check thread
+
+        ///////////////////////////////////////////////////////////////////////////// PREHASHING START
+
+        let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+        let ret = self.prehashing(stop_receiver, progress_sender, &mut pre_checked_map);
+        if ret.is_none() {
+            return false;
+        }
         ///////////////////////////////////////////////////////////////////////////// PREHASHING END
-        Common::print_time(start_time, SystemTime::now(), "check_files_hash - prehash");
-
-        let start_time: SystemTime = SystemTime::now();
-
-        /////////////////////////
-
+        let start_time: SystemTime = SystemTime::now();
         //// PROGRESS THREAD START
         let progress_thread_run = Arc::new(AtomicBool::new(true));
         let atomic_file_counter = Arc::new(AtomicUsize::new(0));
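
The point of the prehash pass introduced above is that only the first couple of kilobytes of each size-duplicate candidate are hashed (note the 2 KiB buffer and the limit argument passed to hash_calculation), so most false candidates are rejected before the expensive full-file hashing that follows. Below is a minimal, self-contained sketch of that idea; it is illustrative only, and the helper name prehash_file, the fixed 2 KiB limit, and the use of the blake3 crate are assumptions made for this example rather than czkawka's actual API.

// Illustrative sketch of partial (prefix) hashing; not czkawka's implementation.
use std::fs::File;
use std::io::Read;
use std::path::Path;

// Mirrors the 2 KiB buffer used in the patch; the exact limit is an assumption here.
const PREHASH_LIMIT: u64 = 1024 * 2;

fn prehash_file(path: &Path) -> Result<String, String> {
    let file = File::open(path).map_err(|e| format!("Unable to open {path:?}: {e}"))?;
    // Read at most PREHASH_LIMIT bytes; shorter files are simply read to the end.
    let mut buffer = Vec::new();
    file.take(PREHASH_LIMIT)
        .read_to_end(&mut buffer)
        .map_err(|e| format!("Unable to read {path:?}: {e}"))?;
    let mut hasher = blake3::Hasher::new();
    hasher.update(&buffer);
    Ok(hasher.finalize().to_hex().to_string())
}

Grouping candidates first by size, then by this cheap prefix hash, and only afterwards by a full-file hash keeps the slow pass limited to files that already agree on their first bytes; files that differ early are dropped after a single small read.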