From 1fd53b854bdacaa23d092bff10448c0cbc6e1d6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com> Date: Wed, 1 Dec 2021 20:09:04 +0100 Subject: [PATCH] Add prehash cache support (#477) --- czkawka_cli/src/commands.rs | 2 +- czkawka_core/src/duplicate.rs | 447 +++++++++++++---------- czkawka_core/src/similar_videos.rs | 2 +- czkawka_gui/src/connect_button_search.rs | 9 +- czkawka_gui/src/connect_button_select.rs | 8 +- czkawka_gui/src/connect_popovers.rs | 54 ++- czkawka_gui/src/gui_settings.rs | 11 +- czkawka_gui/src/help_functions.rs | 2 +- czkawka_gui/src/saving_loading.rs | 49 ++- czkawka_gui/ui/settings.glade | 63 +++- 10 files changed, 427 insertions(+), 220 deletions(-) diff --git a/czkawka_cli/src/commands.rs b/czkawka_cli/src/commands.rs index 544aecb..2011ce3 100644 --- a/czkawka_cli/src/commands.rs +++ b/czkawka_cli/src/commands.rs @@ -22,7 +22,7 @@ pub enum Commands { minimal_file_size: u64, #[structopt(short = "i", long, parse(try_from_str = parse_maximal_file_size), default_value = "18446744073709551615", help = "Maximum size in bytes", long_help = "Maximum size of checked files in bytes, assigning lower value may speed up searching")] maximal_file_size: u64, - #[structopt(short = "c", long, parse(try_from_str = parse_minimal_file_size), default_value = "524288", help = "Minimum cached file size in bytes", long_help = "Minimum size of cached files in bytes, assigning bigger value may speed up will cause that lower amount of files will be cached, but loading of cache will be faster")] + #[structopt(short = "c", long, parse(try_from_str = parse_minimal_file_size), default_value = "257144", help = "Minimum cached file size in bytes", long_help = "Minimum size of cached files in bytes, assigning bigger value may speed up will cause that lower amount of files will be cached, but loading of cache will be faster")] minimal_cached_file_size: u64, #[structopt(flatten)] allowed_extensions: AllowedExtensions, 
diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs index e1db1de..791393b 100644 --- a/czkawka_core/src/duplicate.rs +++ b/czkawka_core/src/duplicate.rs @@ -117,9 +117,6 @@ pub struct Info { pub number_of_duplicated_files_by_name: usize, pub lost_space_by_size: u64, pub lost_space_by_hash: u64, - pub bytes_read_when_hashing: u64, - pub number_of_removed_files: usize, - pub number_of_failed_to_remove_files: usize, pub gained_space: u64, } @@ -149,7 +146,9 @@ pub struct DuplicateFinder { dryrun: bool, stopped_search: bool, use_cache: bool, + use_prehash_cache: bool, minimal_cache_file_size: u64, + minimal_prehash_cache_file_size: u64, delete_outdated_cache: bool, } @@ -174,7 +173,9 @@ impl DuplicateFinder { hash_type: HashType::Blake3, dryrun: false, use_cache: true, - minimal_cache_file_size: 2 * 1024 * 1024, // By default cache only >= 1MB files + use_prehash_cache: true, + minimal_cache_file_size: 1024 * 1024 / 4, // By default cache only >= 256 KB files + minimal_prehash_cache_file_size: 0, delete_outdated_cache: true, } } @@ -229,6 +230,10 @@ impl DuplicateFinder { self.minimal_cache_file_size = minimal_cache_file_size; } + pub fn set_minimal_prehash_cache_file_size(&mut self, minimal_prehash_cache_file_size: u64) { + self.minimal_prehash_cache_file_size = minimal_prehash_cache_file_size; + } + pub const fn get_files_sorted_by_names(&self) -> &BTreeMap> { &self.files_with_identical_names } @@ -237,6 +242,10 @@ impl DuplicateFinder { self.use_cache = use_cache; } + pub fn set_use_prehash_cache(&mut self, use_prehash_cache: bool) { + self.use_prehash_cache = use_prehash_cache; + } + pub const fn get_files_sorted_by_size(&self) -> &BTreeMap> { &self.files_with_identical_size } @@ -659,6 +668,8 @@ impl DuplicateFinder { /// The slowest checking type, which must be applied after checking for size fn check_files_hash(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender>) -> bool { + 
assert_eq!(self.check_method, CheckingMethod::Hash); + let check_type = Arc::new(self.hash_type); let start_time: SystemTime = SystemTime::now(); @@ -699,57 +710,136 @@ impl DuplicateFinder { //// PROGRESS THREAD END - #[allow(clippy::type_complexity)] - let pre_hash_results: Vec<(u64, BTreeMap>, Vec, u64)> = self - .files_with_identical_size - .par_iter() - .map(|(size, vec_file_entry)| { - let mut hashmap_with_hash: BTreeMap> = Default::default(); - let mut errors: Vec = Vec::new(); - let mut bytes_read: u64 = 0; - let mut buffer = [0u8; 1024 * 2]; + ///////////////////////////////////////////////////////////////////////////// PREHASHING START + { + let loaded_hash_map; + let mut records_already_cached: BTreeMap> = Default::default(); + let mut non_cached_files_to_check: BTreeMap> = Default::default(); - atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); - for file_entry in vec_file_entry { - if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { - check_was_breaked.store(true, Ordering::Relaxed); - return None; - } - match hash_calculation(&mut buffer, file_entry, &check_type, 0) { - Ok((hash_string, bytes)) => { - bytes_read += bytes; - hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new); - hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone()); - } - Err(s) => errors.push(s), + // Cache algorithm + // - Load data from cache + // - Convert from BT> to BT + // - Save to proper values + if self.use_prehash_cache { + loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, true) { + Some(t) => t, + None => Default::default(), + }; + + let mut loaded_hash_map2: BTreeMap = Default::default(); + for vec_file_entry in loaded_hash_map.values() { + for file_entry in vec_file_entry { + loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); } } - Some((*size, hashmap_with_hash, errors, 
bytes_read)) - }) - .while_some() - .collect(); - // End thread which send info to gui - progress_thread_run.store(false, Ordering::Relaxed); - progress_thread_handle.join().unwrap(); + #[allow(clippy::if_same_then_else)] + for vec_file_entry in self.files_with_identical_size.values() { + for file_entry in vec_file_entry { + let name = file_entry.path.to_string_lossy().to_string(); + if !loaded_hash_map2.contains_key(&name) { + // If loaded data doesn't contains current image info + non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new); + non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry.clone()); + } else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date { + // When size or modification date of image changed, then it is clear that is different image + non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new); + non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry.clone()); + } else { + // Checking may be omitted when already there is entry with same size and modification date + records_already_cached.entry(file_entry.size).or_insert_with(Vec::new); + records_already_cached.get_mut(&file_entry.size).unwrap().push(file_entry.clone()); + } + } + } + } else { + loaded_hash_map = Default::default(); + mem::swap(&mut self.files_with_identical_size, &mut non_cached_files_to_check); + } - // Check if user aborted search(only from GUI) - if check_was_breaked.load(Ordering::Relaxed) { - return false; - } + #[allow(clippy::type_complexity)] + let pre_hash_results: Vec<(u64, BTreeMap>, Vec)> = non_cached_files_to_check + .par_iter() + .map(|(size, vec_file_entry)| { + let mut hashmap_with_hash: BTreeMap> = Default::default(); + let mut errors: Vec = Vec::new(); + let mut buffer = [0u8; 1024 * 2]; - // Check results - for (size, hash_map, mut errors, bytes_read) in pre_hash_results { - 
self.information.bytes_read_when_hashing += bytes_read; - self.text_messages.warnings.append(&mut errors); - for (_hash, mut vec_file_entry) in hash_map { - if vec_file_entry.len() > 1 { - pre_checked_map.entry(size).or_insert_with(Vec::new); - pre_checked_map.get_mut(&size).unwrap().append(&mut vec_file_entry); + atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); + for file_entry in vec_file_entry { + if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { + check_was_breaked.store(true, Ordering::Relaxed); + return None; + } + match hash_calculation(&mut buffer, file_entry, &check_type, 0) { + Ok(hash_string) => { + hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new); + hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone()); + } + Err(s) => errors.push(s), + } + } + Some((*size, hashmap_with_hash, errors)) + }) + .while_some() + .collect(); + + // End thread which send info to gui + progress_thread_run.store(false, Ordering::Relaxed); + progress_thread_handle.join().unwrap(); + + // Check if user aborted search(only from GUI) + if check_was_breaked.load(Ordering::Relaxed) { + return false; + } + + // Add data from cache + for (size, vec_file_entry) in &records_already_cached { + pre_checked_map.entry(*size).or_insert_with(Vec::new); + pre_checked_map.get_mut(size).unwrap().append(&mut vec_file_entry.clone()); + } + + // Check results + for (size, hash_map, errors) in &pre_hash_results { + self.text_messages.warnings.append(&mut errors.clone()); + for vec_file_entry in hash_map.values() { + if vec_file_entry.len() > 1 { + pre_checked_map.entry(*size).or_insert_with(Vec::new); + pre_checked_map.get_mut(size).unwrap().append(&mut vec_file_entry.clone()); + } } } + + if self.use_prehash_cache { + println!("non cached - {}", non_cached_files_to_check.values().map(|e| e.len()).sum::()); + println!("cached - {}", records_already_cached.values().map(|e| e.len()).sum::()); + // All results 
= records already cached + computed results + let mut save_cache_to_hashmap: BTreeMap = Default::default(); + + for (size, vec_file_entry) in loaded_hash_map { + if size >= self.minimal_prehash_cache_file_size { + for file_entry in vec_file_entry { + save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); + } + } + } + + for (size, hash_map, _errors) in &pre_hash_results { + if *size >= self.minimal_prehash_cache_file_size { + for vec_file_entry in hash_map.values() { + for file_entry in vec_file_entry { + save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); + } + } + } + } + + save_hashes_to_file(&save_cache_to_hashmap, &mut self.text_messages, &self.hash_type, true, self.minimal_prehash_cache_file_size); + } } + ///////////////////////////////////////////////////////////////////////////// PREHASHING END + Common::print_time(start_time, SystemTime::now(), "check_files_hash - prehash".to_string()); let start_time: SystemTime = SystemTime::now(); @@ -766,7 +856,7 @@ impl DuplicateFinder { let progress_send = progress_sender.clone(); let progress_thread_run = progress_thread_run.clone(); let atomic_file_counter = atomic_file_counter.clone(); - let files_to_check = pre_checked_map.iter().map(|e| e.1.len()).sum(); + let files_to_check = pre_checked_map.iter().map(|(_size, vec_file_entry)| vec_file_entry.len()).sum(); let checking_method = self.check_method.clone(); progress_thread_handle = thread::spawn(move || loop { progress_send @@ -789,154 +879,151 @@ impl DuplicateFinder { //// PROGRESS THREAD END - #[allow(clippy::type_complexity)] - let mut full_hash_results: Vec<(u64, BTreeMap>, Vec, u64)>; + ///////////////////////////////////////////////////////////////////////////// HASHING START + { + #[allow(clippy::type_complexity)] + let mut full_hash_results: Vec<(u64, BTreeMap>, Vec)>; - match self.check_method { - CheckingMethod::Hash => { - let loaded_hash_map; + let loaded_hash_map; 
- let mut records_already_cached: BTreeMap> = Default::default(); - let mut non_cached_files_to_check: BTreeMap> = Default::default(); + let mut records_already_cached: BTreeMap> = Default::default(); + let mut non_cached_files_to_check: BTreeMap> = Default::default(); - if self.use_cache { - loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, false) { - Some(t) => t, - None => Default::default(), - }; + if self.use_cache { + loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, false) { + Some(t) => t, + None => Default::default(), + }; - for (size, vec_file_entry) in pre_checked_map { - #[allow(clippy::collapsible_if)] - if !loaded_hash_map.contains_key(&size) { - // If loaded data doesn't contains current info - non_cached_files_to_check.insert(size, vec_file_entry); - } else { - let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap(); + for (size, vec_file_entry) in pre_checked_map { + #[allow(clippy::collapsible_if)] + if !loaded_hash_map.contains_key(&size) { + // If loaded data doesn't contains current info + non_cached_files_to_check.insert(size, vec_file_entry); + } else { + let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap(); + for file_entry in vec_file_entry { + let mut found: bool = false; + for loaded_file_entry in loaded_vec_file_entry { + if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date { + records_already_cached.entry(file_entry.size).or_insert_with(Vec::new); + records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone()); + found = true; + break; + } + } + + if !found { + non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new); + non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry); + } + } + } + } + } else { + loaded_hash_map = Default::default(); + mem::swap(&mut 
pre_checked_map, &mut non_cached_files_to_check); + } + + full_hash_results = non_cached_files_to_check + .par_iter() + .map(|(size, vec_file_entry)| { + let mut hashmap_with_hash: BTreeMap> = Default::default(); + let mut errors: Vec = Vec::new(); + let mut buffer = [0u8; 1024 * 128]; + + atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); + for file_entry in vec_file_entry { + if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { + check_was_breaked.store(true, Ordering::Relaxed); + return None; + } + + match hash_calculation(&mut buffer, file_entry, &check_type, u64::MAX) { + Ok(hash_string) => { + let mut file_entry = file_entry.clone(); + file_entry.hash = hash_string.clone(); + hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new); + hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry); + } + Err(s) => errors.push(s), + } + } + Some((*size, hashmap_with_hash, errors)) + }) + .while_some() + .collect(); + + if self.use_cache { + 'main: for (size, vec_file_entry) in records_already_cached { + // Check if size already exists, if exists we must to change it outside because cannot have mut and non mut reference to full_hash_results + for (full_size, full_hashmap, _errors) in &mut full_hash_results { + if size == *full_size { for file_entry in vec_file_entry { - let mut found: bool = false; - for loaded_file_entry in loaded_vec_file_entry { - if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date { - records_already_cached.entry(file_entry.size).or_insert_with(Vec::new); - records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone()); - found = true; - break; - } - } - - if !found { - non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new); - non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry); - } + 
full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new); + full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry); } + continue 'main; } } - } else { - loaded_hash_map = Default::default(); - mem::swap(&mut pre_checked_map, &mut non_cached_files_to_check); + // Size doesn't exists add results to files + let mut temp_hashmap: BTreeMap> = Default::default(); + for file_entry in vec_file_entry { + temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new); + temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry); + } + full_hash_results.push((size, temp_hashmap, Vec::new())); } - full_hash_results = non_cached_files_to_check - .par_iter() - .map(|(size, vec_file_entry)| { - let mut hashmap_with_hash: BTreeMap> = Default::default(); - let mut errors: Vec = Vec::new(); - let mut bytes_read: u64 = 0; - let mut buffer = [0u8; 1024 * 128]; - - atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); - for file_entry in vec_file_entry { - if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { - check_was_breaked.store(true, Ordering::Relaxed); - return None; - } - - match hash_calculation(&mut buffer, file_entry, &check_type, u64::MAX) { - Ok((hash_string, bytes)) => { - bytes_read += bytes; - let mut file_entry = file_entry.clone(); - file_entry.hash = hash_string.clone(); - hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new); - hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry); - } - Err(s) => errors.push(s), - } - } - Some((*size, hashmap_with_hash, errors, bytes_read)) - }) - .while_some() - .collect(); - - if self.use_cache { - 'main: for (size, vec_file_entry) in records_already_cached { - // Check if size already exists, if exists we must to change it outside because cannot have mut and non mut reference to full_hash_results - for (full_size, full_hashmap, _errors, _bytes_read) in &mut full_hash_results { - if size == *full_size { - for file_entry in 
vec_file_entry { - full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new); - full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry); - } - continue 'main; - } - } - // Size doesn't exists add results to files - let mut temp_hashmap: BTreeMap> = Default::default(); - for file_entry in vec_file_entry { - temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new); - temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry); - } - full_hash_results.push((size, temp_hashmap, Vec::new(), 0)); + // Must save all results to file, old loaded from file with all currently counted results + let mut all_results: BTreeMap = Default::default(); + for (_size, vec_file_entry) in loaded_hash_map { + for file_entry in vec_file_entry { + all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); } - - // Must save all results to file, old loaded from file with all currently counted results - let mut all_results: BTreeMap = Default::default(); - for (_size, vec_file_entry) in loaded_hash_map { + } + for (_size, hashmap, _errors) in &full_hash_results { + for vec_file_entry in hashmap.values() { for file_entry in vec_file_entry { - all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); + all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); } } - for (_size, hashmap, _errors, _bytes_read) in &full_hash_results { - for vec_file_entry in hashmap.values() { - for file_entry in vec_file_entry { - all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); - } - } + } + save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type, false, self.minimal_cache_file_size); + } + + // End thread which send info to gui + progress_thread_run.store(false, Ordering::Relaxed); + progress_thread_handle.join().unwrap(); + + // Check if user aborted search(only from GUI) + if check_was_breaked.load(Ordering::Relaxed) { + return false; + } + + for 
(size, hash_map, mut errors) in full_hash_results { + self.text_messages.warnings.append(&mut errors); + for (_hash, vec_file_entry) in hash_map { + if vec_file_entry.len() > 1 { + self.files_with_identical_hashes.entry(size).or_insert_with(Vec::new); + self.files_with_identical_hashes.get_mut(&size).unwrap().push(vec_file_entry); } - save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type, false, self.minimal_cache_file_size); } } - _ => panic!("What"), - } - // End thread which send info to gui - progress_thread_run.store(false, Ordering::Relaxed); - progress_thread_handle.join().unwrap(); + ///////////////////////// - // Check if user aborted search(only from GUI) - if check_was_breaked.load(Ordering::Relaxed) { - return false; - } - - for (size, hash_map, mut errors, bytes_read) in full_hash_results { - self.information.bytes_read_when_hashing += bytes_read; - self.text_messages.warnings.append(&mut errors); - for (_hash, vec_file_entry) in hash_map { - if vec_file_entry.len() > 1 { - self.files_with_identical_hashes.entry(size).or_insert_with(Vec::new); - self.files_with_identical_hashes.get_mut(&size).unwrap().push(vec_file_entry); + for (size, vector_vectors) in &self.files_with_identical_hashes { + for vector in vector_vectors { + self.information.number_of_duplicated_files_by_hash += vector.len() - 1; + self.information.number_of_groups_by_hash += 1; + self.information.lost_space_by_hash += (vector.len() as u64 - 1) * size; } } } - ///////////////////////// - - for (size, vector_vectors) in &self.files_with_identical_hashes { - for vector in vector_vectors { - self.information.number_of_duplicated_files_by_hash += vector.len() - 1; - self.information.number_of_groups_by_hash += 1; - self.information.lost_space_by_hash += (vector.len() as u64 - 1) * size; - } - } + ///////////////////////////////////////////////////////////////////////////// HASHING END Common::print_time(start_time, SystemTime::now(), "check_files_hash - full 
hash".to_string()); @@ -959,8 +1046,6 @@ impl DuplicateFinder { for vector in self.files_with_identical_names.values() { let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun); self.information.gained_space += tuple.0; - self.information.number_of_removed_files += tuple.1; - self.information.number_of_failed_to_remove_files += tuple.2; } } CheckingMethod::Hash => { @@ -968,8 +1053,6 @@ impl DuplicateFinder { for vector in vector_vectors.iter() { let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun); self.information.gained_space += tuple.0; - self.information.number_of_removed_files += tuple.1; - self.information.number_of_failed_to_remove_files += tuple.2; } } } @@ -977,8 +1060,6 @@ impl DuplicateFinder { for vector in self.files_with_identical_size.values() { let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun); self.information.gained_space += tuple.0; - self.information.number_of_removed_files += tuple.1; - self.information.number_of_failed_to_remove_files += tuple.2; } } CheckingMethod::None => { @@ -1031,13 +1112,6 @@ impl DebugPrint for DuplicateFinder { self.information.gained_space.file_size(options::BINARY).unwrap(), self.information.gained_space ); - println!( - "Bytes read when hashing - {} ({} bytes)", - self.information.bytes_read_when_hashing.file_size(options::BINARY).unwrap(), - self.information.bytes_read_when_hashing - ); - println!("Number of removed files - {}", self.information.number_of_removed_files); - println!("Number of failed to remove files - {}", self.information.number_of_failed_to_remove_files); println!("### Other"); @@ -1363,7 +1437,7 @@ pub trait MyHasher { fn finalize(&self) -> String; } -fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashType, limit: u64) -> Result<(String, u64), String> { +fn hash_calculation(buffer: &mut 
[u8], file_entry: &FileEntry, hash_type: &HashType, limit: u64) -> Result { let mut file_handler = match File::open(&file_entry.path) { Ok(t) => t, Err(e) => return Err(format!("Unable to check hash of file {}, reason {}", file_entry.path.display(), e)), @@ -1384,7 +1458,7 @@ fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashT break; } } - Ok((hasher.finalize(), current_file_read_bytes)) + Ok(hasher.finalize()) } fn get_file_hash_name(type_of_hash: &HashType, is_prehash: bool) -> String { @@ -1560,8 +1634,7 @@ mod tests { file.write_all(b"aa")?; let e = FileEntry { path: src, ..Default::default() }; let r = hash_calculation(&mut buf, &e, &HashType::Blake3, 0).unwrap(); - assert_eq!(2, r.1); - assert!(!r.0.is_empty()); + assert!(!r.is_empty()); Ok(()) } diff --git a/czkawka_core/src/similar_videos.rs b/czkawka_core/src/similar_videos.rs index 3828464..8916b61 100644 --- a/czkawka_core/src/similar_videos.rs +++ b/czkawka_core/src/similar_videos.rs @@ -457,7 +457,7 @@ impl SimilarVideos { hashmap_with_file_entries.insert(file_entry.vhash.src_path().to_string_lossy().to_string(), file_entry.clone()); vector_of_hashes.push(file_entry.vhash.clone()); } else { - self.text_messages.errors.push(file_entry.error.clone()); + self.text_messages.warnings.push(file_entry.error.clone()); } } diff --git a/czkawka_gui/src/connect_button_search.rs b/czkawka_gui/src/connect_button_search.rs index 530600f..b16b367 100644 --- a/czkawka_gui/src/connect_button_search.rs +++ b/czkawka_gui/src/connect_button_search.rs @@ -92,7 +92,9 @@ pub fn connect_button_search( let radio_button_hash_type_xxh3 = gui_data.main_notebook.radio_button_hash_type_xxh3.clone(); let check_button_settings_hide_hard_links = gui_data.settings.check_button_settings_hide_hard_links.clone(); let check_button_settings_use_cache = gui_data.settings.check_button_settings_use_cache.clone(); + let check_button_duplicates_use_prehash_cache = 
gui_data.settings.check_button_duplicates_use_prehash_cache.clone(); let entry_settings_cache_file_minimal_size = gui_data.settings.entry_settings_cache_file_minimal_size.clone(); + let entry_settings_prehash_cache_file_minimal_size = gui_data.settings.entry_settings_prehash_cache_file_minimal_size.clone(); let radio_button_similar_hash_size_4 = gui_data.main_notebook.radio_button_similar_hash_size_4.clone(); let radio_button_similar_hash_size_8 = gui_data.main_notebook.radio_button_similar_hash_size_8.clone(); let radio_button_similar_hash_size_16 = gui_data.main_notebook.radio_button_similar_hash_size_16.clone(); @@ -118,7 +120,7 @@ pub fn connect_button_search( let allowed_extensions = entry_allowed_extensions.text().as_str().to_string(); let hide_hard_links = check_button_settings_hide_hard_links.is_active(); let use_cache = check_button_settings_use_cache.is_active(); - let minimal_cache_file_size = entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(2 * 1024 * 1024); + let minimal_cache_file_size = entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(1024 * 1024 / 4); let show_dialog = Arc::new(AtomicBool::new(true)); @@ -170,6 +172,9 @@ pub fn connect_button_search( panic!("No radio button is pressed"); } + let use_prehash_cache = check_button_duplicates_use_prehash_cache.is_active(); + let minimal_prehash_cache_file_size = entry_settings_prehash_cache_file_minimal_size.text().as_str().parse::().unwrap_or(0); + let delete_outdated_cache = check_button_settings_duplicates_delete_outdated_cache.is_active(); let futures_sender_duplicate_files = futures_sender_duplicate_files.clone(); @@ -184,10 +189,12 @@ pub fn connect_button_search( df.set_minimal_file_size(minimal_file_size); df.set_maximal_file_size(maximal_file_size); df.set_minimal_cache_file_size(minimal_cache_file_size); + df.set_minimal_prehash_cache_file_size(minimal_prehash_cache_file_size); df.set_check_method(check_method); df.set_hash_type(hash_type); 
df.set_ignore_hard_links(hide_hard_links); df.set_use_cache(use_cache); + df.set_use_prehash_cache(use_prehash_cache); df.set_delete_outdated_cache(delete_outdated_cache); df.find_duplicates(Some(&stop_receiver), Some(&futures_sender_duplicate_files)); let _ = glib_stop_sender.send(Message::Duplicates(df)); diff --git a/czkawka_gui/src/connect_button_select.rs b/czkawka_gui/src/connect_button_select.rs index ab2c4c0..920c4ae 100644 --- a/czkawka_gui/src/connect_button_select.rs +++ b/czkawka_gui/src/connect_button_select.rs @@ -10,10 +10,10 @@ use crate::notebook_enums::*; pub fn connect_button_select(gui_data: &GuiData) { let mut hashmap: HashMap> = Default::default(); { - hashmap.insert(NotebookMainEnum::SimilarImages, vec![PopoverTypes::All, PopoverTypes::ImageSize, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]); - hashmap.insert(NotebookMainEnum::SimilarVideos, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]); + hashmap.insert(NotebookMainEnum::SimilarImages, vec![PopoverTypes::All, PopoverTypes::Size, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]); + hashmap.insert(NotebookMainEnum::SimilarVideos, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date, PopoverTypes::Size]); hashmap.insert(NotebookMainEnum::Duplicate, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]); - hashmap.insert(NotebookMainEnum::SameMusic, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]); + hashmap.insert(NotebookMainEnum::SameMusic, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date, PopoverTypes::Size]); hashmap.insert(NotebookMainEnum::EmptyFiles, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom]); hashmap.insert(NotebookMainEnum::EmptyDirectories, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom]); @@ -63,7 +63,7 @@ fn 
show_required_popovers(popovers: &GuiPopovers, current_mode: &NotebookMainEnu buttons_popover_unselect_all.hide(); } - if vec.contains(&PopoverTypes::ImageSize) { + if vec.contains(&PopoverTypes::Size) { buttons_popover_select_all_images_except_biggest.show(); buttons_popover_select_all_images_except_smallest.show(); separator_select_image_size.show(); diff --git a/czkawka_gui/src/connect_popovers.rs b/czkawka_gui/src/connect_popovers.rs index 1d07122..a6d13bd 100644 --- a/czkawka_gui/src/connect_popovers.rs +++ b/czkawka_gui/src/connect_popovers.rs @@ -345,7 +345,7 @@ fn popover_custom_select_unselect(popover: >k::Popover, window_main: &Window, } } -fn popover_all_except_biggest_smallest(popover: >k::Popover, tree_view: >k::TreeView, column_color: i32, column_size_as_bytes: i32, column_dimensions: i32, column_button_selection: u32, except_biggest: bool) { +fn popover_all_except_biggest_smallest(popover: >k::Popover, tree_view: >k::TreeView, column_color: i32, column_size_as_bytes: i32, column_dimensions: Option, column_button_selection: u32, except_biggest: bool) { let model = get_list_store(tree_view); if let Some(iter) = model.iter_first() { @@ -373,22 +373,38 @@ fn popover_all_except_biggest_smallest(popover: >k::Popover, tree_view: >k:: } tree_iter_array.push(iter.clone()); let size_as_bytes = model.value(&iter, column_size_as_bytes).get::().unwrap(); - let dimensions_string = model.value(&iter, column_dimensions).get::().unwrap(); - let dimensions = change_dimension_to_krotka(dimensions_string); - let number_of_pixels = dimensions.0 * dimensions.1; + // If dimension exists, then needs to be checked images + if let Some(column_dimensions) = column_dimensions { + let dimensions_string = model.value(&iter, column_dimensions).get::().unwrap(); - if except_biggest { - if number_of_pixels > number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes > size_as_bytes_min_max) { - number_of_pixels_min_max = number_of_pixels; - 
size_as_bytes_min_max = size_as_bytes; - used_index = Some(current_index); + let dimensions = change_dimension_to_krotka(dimensions_string); + let number_of_pixels = dimensions.0 * dimensions.1; + + if except_biggest { + if number_of_pixels > number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes > size_as_bytes_min_max) { + number_of_pixels_min_max = number_of_pixels; + size_as_bytes_min_max = size_as_bytes; + used_index = Some(current_index); + } + } else { + if number_of_pixels < number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes < size_as_bytes_min_max) { + number_of_pixels_min_max = number_of_pixels; + size_as_bytes_min_max = size_as_bytes; + used_index = Some(current_index); + } } } else { - if number_of_pixels < number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes < size_as_bytes_min_max) { - number_of_pixels_min_max = number_of_pixels; - size_as_bytes_min_max = size_as_bytes; - used_index = Some(current_index); + if except_biggest { + if size_as_bytes > size_as_bytes_min_max { + size_as_bytes_min_max = size_as_bytes; + used_index = Some(current_index); + } + } else { + if size_as_bytes < size_as_bytes_min_max { + size_as_bytes_min_max = size_as_bytes; + used_index = Some(current_index); + } } } @@ -593,9 +609,9 @@ pub fn connect_popovers(gui_data: &GuiData) { popover_all_except_biggest_smallest( &popover_select, tree_view, - nb_object.column_color.expect("AEB can't be used without headers"), - nb_object.column_size_as_bytes.expect("AEB needs size as bytes column"), - nb_object.column_dimensions.expect("AEB needs dimensions column"), + nb_object.column_color.expect("AEBI can't be used without headers"), + nb_object.column_size_as_bytes.expect("AEBI needs size as bytes column"), + nb_object.column_dimensions, nb_object.column_selection as u32, true, ); @@ -613,9 +629,9 @@ pub fn connect_popovers(gui_data: &GuiData) { 
popover_all_except_biggest_smallest( &popover_select, tree_view, - nb_object.column_color.expect("AES can't be used without headers"), - nb_object.column_size_as_bytes.expect("AES needs size as bytes column"), - nb_object.column_dimensions.expect("AES needs dimensions column"), + nb_object.column_color.expect("AESI can't be used without headers"), + nb_object.column_size_as_bytes.expect("AESI needs size as bytes column"), + nb_object.column_dimensions, nb_object.column_selection as u32, false, ); diff --git a/czkawka_gui/src/gui_settings.rs b/czkawka_gui/src/gui_settings.rs index 7892bed..aa2f691 100644 --- a/czkawka_gui/src/gui_settings.rs +++ b/czkawka_gui/src/gui_settings.rs @@ -17,6 +17,8 @@ pub struct GuiSettings { // Duplicates pub check_button_settings_hide_hard_links: gtk::CheckButton, pub entry_settings_cache_file_minimal_size: gtk::Entry, + pub entry_settings_prehash_cache_file_minimal_size: gtk::Entry, + pub check_button_duplicates_use_prehash_cache: gtk::CheckButton, pub check_button_settings_show_preview_duplicates: gtk::CheckButton, pub check_button_settings_duplicates_delete_outdated_cache: gtk::CheckButton, pub button_settings_duplicates_clear_cache: gtk::Button, @@ -71,6 +73,8 @@ impl GuiSettings { let check_button_settings_show_preview_duplicates: gtk::CheckButton = builder.object("check_button_settings_show_preview_duplicates").unwrap(); let check_button_settings_duplicates_delete_outdated_cache: gtk::CheckButton = builder.object("check_button_settings_duplicates_delete_outdated_cache").unwrap(); let button_settings_duplicates_clear_cache: gtk::Button = builder.object("button_settings_duplicates_clear_cache").unwrap(); + let check_button_duplicates_use_prehash_cache: gtk::CheckButton = builder.object("check_button_duplicates_use_prehash_cache").unwrap(); + let entry_settings_prehash_cache_file_minimal_size: gtk::Entry = builder.object("entry_settings_prehash_cache_file_minimal_size").unwrap(); 
check_button_settings_hide_hard_links.set_tooltip_text(Some( "Hides all files except one, if are points to same data(are hardlinked).\n\nE.g. in case where on disk there is 7 files which are hardlinked to specific data and one different file with same data but different inode, then in duplicate finder will be visible only one unique file and one file from hardlinked ones.", @@ -80,7 +84,10 @@ impl GuiSettings { )); check_button_settings_show_preview_duplicates.set_tooltip_text(Some("Shows preview at right side, when selecting image file.")); check_button_settings_duplicates_delete_outdated_cache.set_tooltip_text(Some("Allows to delete outdated cache results which points to non-existent files.\n\nWhen enabled, app make sure when loading records, that all points to valid files and ignore broken ones.\n\nDisabling this option, will help to scan files on external drives, so cache entries about them will not be purged in next scan.\n\nIn case of having hundred of thousands records in cache, it is suggested to enable this option, to speedup cache loading and saving at start and end of scan.")); - button_settings_duplicates_clear_cache.set_tooltip_text(Some("Manually clear cache from outdated entries.\nShould be used only if automatic clearing was disabled.")); + button_settings_duplicates_clear_cache.set_tooltip_text(Some("Manually clear cache from outdated entries.\n\nShould be used only if automatic clearing was disabled.")); + check_button_duplicates_use_prehash_cache.set_tooltip_text(Some( + "Enables caching of the prehash (a hash computed from a small part of the file), which allows earlier elimination of non-duplicated results.\n\nIt is disabled by default because it can cause slowdowns in some situations.\n\nIt is highly recommended when scanning hundreds of thousands or millions of files, because it can speed up the search multiple times.", + )); // Similar Images let check_button_settings_show_preview_similar_images: gtk::CheckButton =
builder.object("check_button_settings_show_preview_similar_images").unwrap(); @@ -126,6 +133,8 @@ impl GuiSettings { check_button_settings_use_trash, check_button_settings_hide_hard_links, entry_settings_cache_file_minimal_size, + entry_settings_prehash_cache_file_minimal_size, + check_button_duplicates_use_prehash_cache, check_button_settings_show_preview_duplicates, check_button_settings_duplicates_delete_outdated_cache, button_settings_duplicates_clear_cache, diff --git a/czkawka_gui/src/help_functions.rs b/czkawka_gui/src/help_functions.rs index b751671..e6bec45 100644 --- a/czkawka_gui/src/help_functions.rs +++ b/czkawka_gui/src/help_functions.rs @@ -33,7 +33,7 @@ pub const KEY_END: u32 = 110; #[derive(Eq, PartialEq)] pub enum PopoverTypes { All, - ImageSize, + Size, Reverse, Custom, Date, diff --git a/czkawka_gui/src/saving_loading.rs b/czkawka_gui/src/saving_loading.rs index b862997..3adb398 100644 --- a/czkawka_gui/src/saving_loading.rs +++ b/czkawka_gui/src/saving_loading.rs @@ -141,7 +141,7 @@ pub fn save_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb //// minimal cache file size data_to_save.push("--cache_minimal_file_size:".to_string()); let entry_settings_cache_file_minimal_size = settings.entry_settings_cache_file_minimal_size.clone(); - data_to_save.push(entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(2 * 1024 * 1024).to_string()); + data_to_save.push(entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(1024 * 1024 / 4).to_string()); //// Duplicates, delete outdated entries to trash data_to_save.push("--delete_outdated_entries_duplicates:".to_string()); @@ -157,6 +157,16 @@ pub fn save_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb data_to_save.push("--delete_outdated_entries_similar_videos:".to_string()); let check_button_settings_similar_videos_delete_outdated_cache = settings.check_button_settings_similar_videos_delete_outdated_cache.clone(); 
data_to_save.push(check_button_settings_similar_videos_delete_outdated_cache.is_active().to_string()); + + //// Use prehash cache system + data_to_save.push("--use_prehash_cache:".to_string()); + let check_button_duplicates_use_prehash_cache = settings.check_button_duplicates_use_prehash_cache.clone(); + data_to_save.push(check_button_duplicates_use_prehash_cache.is_active().to_string()); + + //// minimal prehash cache file size + data_to_save.push("--cache_prehash_minimal_file_size:".to_string()); + let entry_settings_prehash_cache_file_minimal_size = settings.entry_settings_prehash_cache_file_minimal_size.clone(); + data_to_save.push(entry_settings_prehash_cache_file_minimal_size.text().as_str().parse::().unwrap_or(0).to_string()); } // Creating/Opening config file @@ -213,6 +223,8 @@ enum TypeOfLoadedData { DeleteCacheDuplicates, DeleteCacheSimilarImages, DeleteCacheSimilarVideos, + UsePrehashCache, + CachePrehashMinimalSize, } pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNotebook, settings: &GuiSettings, text_view_errors: &TextView, scrolled_window_errors: &ScrolledWindow) { @@ -264,6 +276,8 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb let mut delete_outdated_cache_dupliactes: bool = true; let mut delete_outdated_cache_similar_images: bool = true; let mut delete_outdated_cache_similar_videos: bool = false; + let mut use_prehash_cache: bool = false; + let mut cache_prehash_minimal_size: u64 = 0; let mut current_type = TypeOfLoadedData::None; for (line_number, line) in loaded_data.replace("\r\n", "\n").split('\n').enumerate() { @@ -307,6 +321,10 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb current_type = TypeOfLoadedData::DeleteCacheSimilarVideos; } else if line.starts_with("--delete_outdated_entries_similar_images") { current_type = TypeOfLoadedData::DeleteCacheSimilarImages; + } else if line.starts_with("--use_prehash_cache") { + current_type = 
TypeOfLoadedData::UsePrehashCache; + } else if line.starts_with("--cache_prehash_minimal_file_size") { + current_type = TypeOfLoadedData::CachePrehashMinimalSize; } else if line.starts_with("--") { current_type = TypeOfLoadedData::None; add_text_to_text_view( @@ -512,6 +530,29 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb ); } } + TypeOfLoadedData::UsePrehashCache => { + let line = line.to_lowercase(); + if line == "1" || line == "true" { + use_prehash_cache = true; + } else if line == "0" || line == "false" { + use_prehash_cache = false; + } else { + add_text_to_text_view( + &text_view_errors, + format!("Found invalid data in line {} \"{}\" isn't proper value(0/1/true/false) when loading file {:?}", line_number, line, config_file).as_str(), + ); + } + } + TypeOfLoadedData::CachePrehashMinimalSize => { + if let Ok(number) = line.parse::() { + cache_prehash_minimal_size = number; + } else { + add_text_to_text_view( + &text_view_errors, + format!("Found invalid data in line {} \"{}\" isn't proper value(u64) when loading file {:?}", line_number, line, config_file).as_str(), + ); + } + } } } } @@ -566,8 +607,10 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb } settings.check_button_settings_hide_hard_links.set_active(hide_hard_links); settings.check_button_settings_use_cache.set_active(use_cache); + settings.check_button_duplicates_use_prehash_cache.set_active(use_prehash_cache); settings.check_button_settings_use_trash.set_active(use_trash); settings.entry_settings_cache_file_minimal_size.set_text(cache_minimal_size.to_string().as_str()); + settings.entry_settings_prehash_cache_file_minimal_size.set_text(cache_prehash_minimal_size.to_string().as_str()); } else { settings.check_button_settings_load_at_start.set_active(false); } @@ -650,10 +693,12 @@ pub fn reset_configuration(manual_clearing: bool, upper_notebook: &GuiUpperNoteb settings.check_button_settings_hide_hard_links.set_active(true); 
settings.check_button_settings_use_cache.set_active(true); settings.check_button_settings_use_trash.set_active(false); - settings.entry_settings_cache_file_minimal_size.set_text("524288"); + settings.entry_settings_cache_file_minimal_size.set_text("257144"); settings.check_button_settings_similar_videos_delete_outdated_cache.set_active(false); settings.check_button_settings_similar_images_delete_outdated_cache.set_active(true); settings.check_button_settings_duplicates_delete_outdated_cache.set_active(true); + settings.check_button_duplicates_use_prehash_cache.set_active(false); + settings.entry_settings_prehash_cache_file_minimal_size.set_text("0"); } if manual_clearing { add_text_to_text_view(&text_view_errors, "Current configuration was cleared."); diff --git a/czkawka_gui/ui/settings.glade b/czkawka_gui/ui/settings.glade index 3df4dd7..bcda5a3 100644 --- a/czkawka_gui/ui/settings.glade +++ b/czkawka_gui/ui/settings.glade @@ -301,7 +301,7 @@ Author: Rafał Mikrut True False - Minimal cached file size in bytes + Minimal size of files in bytes saved to cache True @@ -314,7 +314,7 @@ Author: Rafał Mikrut True True 15 - 524288 + 257144 False number @@ -332,6 +332,20 @@ Author: Rafał Mikrut 3 + + + Use prehash cache + True + True + False + True + + + False + True + 4 + + Remove outdated results from duplicates cache @@ -343,7 +357,50 @@ Author: Rafał Mikrut False False end - 4 + 5 + + + + + True + False + 4 + 4 + 4 + 4 + + + True + False + Minimal size of files in bytes saved to prehash cache + + + True + True + 0 + + + + + True + True + 15 + 1 + False + number + + + False + False + end + 1 + + + + + False + True + 6