From 338352f38450ff3c1153be1556cddccd0942417d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mikrut?=
Date: Sat, 26 Sep 2020 20:46:35 +0200
Subject: [PATCH] Added duplicate checking by hashing the first 1MB of each file

---
 czkawka_cli/src/main.rs          |  4 +++-
 czkawka_core/src/big_file.rs     |  9 +++------
 czkawka_core/src/duplicate.rs    | 20 +++++++++++++++-----
 czkawka_core/src/empty_folder.rs | 10 +++++-----
 czkawka_gui/src/main.rs          |  4 ++--
 5 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/czkawka_cli/src/main.rs b/czkawka_cli/src/main.rs
index 68cf9f6..109f03f 100644
--- a/czkawka_cli/src/main.rs
+++ b/czkawka_cli/src/main.rs
@@ -109,6 +109,8 @@ fn main() {
                 df.set_check_method(duplicate::CheckingMethod::Size);
             } else if argument_name == "hash" {
                 df.set_check_method(duplicate::CheckingMethod::Hash);
+            } else if argument_name == "hashmb" {
+                df.set_check_method(duplicate::CheckingMethod::HashMB);
             } else {
                 println!("-l can only have values hash or size");
                 process::exit(1);
@@ -272,7 +274,7 @@ Usage of Czkawka:
   -f file_to_save - saves results to file
   -s min_size - minimum size of checked files in bytes, assigning bigger value may speed up searching.
   -x allowed_extension - list of checked extension, e.g. "jpg,mp4" will allow to check "book.jpg" and "car.mp4" but not roman.png. There are also helpful macros which allow to easy use a typcal extension like IMAGE("jpg,kra,gif,png,bmp,tiff,webp,hdr,svg") or TEXT("txt,doc,docx,odt,rtf")
-  -l type_of_search - allows to use fastest which takes into account only size, and more accurate which check if file contnet is same(hashes).
+  -l type_of_search - selects the checking method: the fastest compares only file size (SIZE); a more accurate one hashes only the first 1MB of each file (HASHMB); the most accurate, and slowest, hashes entire files (HASH).
   -delete - delete found files, by default remove all except the most oldest one, it can take arguments: aen(All except newest one), aeo(All except oldest one), on(Only one newest), oo(Only one oldest)
 
     Usage example:
diff --git a/czkawka_core/src/big_file.rs b/czkawka_core/src/big_file.rs
index 25ea587..23f3c29 100644
--- a/czkawka_core/src/big_file.rs
+++ b/czkawka_core/src/big_file.rs
@@ -87,7 +87,6 @@ impl BigFile {
         &self.information
     }
 
-
     pub fn set_recursive_search(&mut self, recursive_search: bool) {
         self.recursive_search = recursive_search;
     }
@@ -208,14 +207,14 @@ impl BigFile {
                     Ok(t) => t,
                     Err(_) => {
                         self.text_messages.warnings.push("Unable to get creation date from file ".to_string() + current_file_name.as_str());
-                        continue
+                        continue;
                     } // Permissions Denied
                 },
                 modified_date: match metadata.modified() {
                     Ok(t) => t,
                     Err(_) => {
                         self.text_messages.warnings.push("Unable to get modification date from file ".to_string() + current_file_name.as_str());
-                        continue
+                        continue;
                     } // Permissions Denied
                 },
             };
@@ -234,8 +233,6 @@ impl BigFile {
             }
         }
 
-
-
         // Extract n biggest files to new TreeMap
         let mut new_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
@@ -376,7 +373,7 @@ impl PrintResults for BigFile {
         for (size, vector) in self.big_files.iter().rev() {
             // TODO Align all to same width
             for entry in vector {
-                println!("{} ({}) - {}", size.file_size(options::BINARY).unwrap(), size, entry.path);
+                println!("{} ({} bytes) - {}", size.file_size(options::BINARY).unwrap(), size, entry.path);
             }
         }
         Common::print_time(start_time, SystemTime::now(), "print_duplicated_entries".to_string());
diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index 1fcb3ce..96dea7d 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -12,11 +12,14 @@ use crate::common_items::ExcludedItems;
 use crate::common_messages::Messages;
 use crate::common_traits::*;
 
+const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // 1MB
+
 #[derive(PartialEq, Eq, Clone, Debug)]
 pub enum CheckingMethod {
     None,
     Size,
     Hash,
+    HashMB,
 }
@@ -116,7 +119,7 @@ impl DuplicateFinder {
     pub fn find_duplicates(&mut self) {
         self.directories.optimize_directories(self.recursive_search, &mut self.text_messages);
         self.check_files_size();
-        if self.check_method == CheckingMethod::Hash {
+        if self.check_method == CheckingMethod::Hash || self.check_method == CheckingMethod::HashMB {
             self.check_files_hash();
         }
         self.delete_files();
@@ -354,6 +357,7 @@ impl DuplicateFinder {
                 let mut hasher: blake3::Hasher = blake3::Hasher::new();
                 let mut buffer = [0u8; 16384];
+                let mut read_bytes: u64 = 0;
                 loop {
                     let n = match file_handler.read(&mut buffer) {
                         Ok(t) => t,
@@ -366,8 +370,14 @@
                     if n == 0 {
                         break;
                     }
+
+                    read_bytes += n as u64;
                     self.information.bytes_read_when_hashing += n as u64;
                     hasher.update(&buffer[..n]);
+
+                    if self.check_method == CheckingMethod::HashMB && read_bytes >= HASH_MB_LIMIT_BYTES {
+                        break;
+                    }
                 }
                 if !error_reading_file {
                     let hash_string: String = hasher.finalize().to_hex().to_string();
@@ -400,8 +410,8 @@ impl DuplicateFinder {
         let start_time: SystemTime = SystemTime::now();
 
         match self.check_method {
-            CheckingMethod::Hash => {
-                for (_size, vector_vectors) in &self.files_with_identical_hashes {
+            CheckingMethod::Hash | CheckingMethod::HashMB => {
+                for vector_vectors in self.files_with_identical_hashes.values() {
                     for vector in vector_vectors.iter() {
                         let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages.warnings);
                         self.information.gained_space += tuple.0;
@@ -411,7 +421,7 @@ impl DuplicateFinder {
                 }
             }
             CheckingMethod::Size => {
-                for (_size, vector) in &self.files_with_identical_size {
+                for vector in self.files_with_identical_size.values() {
                     let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages.warnings);
                     self.information.gained_space += tuple.0;
                     self.information.number_of_removed_files += tuple.1;
@@ -570,7 +580,7 @@ impl PrintResults for DuplicateFinder {
         let mut number_of_groups: u64 = 0;
 
         match self.check_method {
-            CheckingMethod::Hash => {
+            CheckingMethod::Hash | CheckingMethod::HashMB => {
                 for (_size, vector) in self.files_with_identical_hashes.iter() {
                     for j in vector {
                         number_of_files += j.len() as u64;
diff --git a/czkawka_core/src/empty_folder.rs b/czkawka_core/src/empty_folder.rs
index 501e714..4379dc5 100644
--- a/czkawka_core/src/empty_folder.rs
+++ b/czkawka_core/src/empty_folder.rs
@@ -97,7 +97,7 @@ impl EmptyFolder {
     fn optimize_folders(&mut self) {
         let mut new_directory_folders: BTreeMap<String, FolderEntry> = Default::default();
 
-        for (name,folder_entry) in &self.empty_folder_list {
+        for (name, folder_entry) in &self.empty_folder_list {
             match &folder_entry.parent_path {
                 Some(t) => {
                     if !self.empty_folder_list.contains_key(t) {
@@ -212,7 +212,7 @@ impl EmptyFolder {
         // Now we check if checked folders are really empty, and if are, then
         if initial_checking {
             // We need to set empty folder list
-            for (name,folder_entry) in folders_checked {
+            for (name, folder_entry) in folders_checked {
                 if folder_entry.is_empty != FolderEmptiness::No {
                     self.empty_folder_list.insert(name, folder_entry);
                 }
@@ -220,7 +220,7 @@ impl EmptyFolder {
         } else {
             // We need to check if parent of folder isn't also empty, because we wan't to delete only parent with two empty folders except this folders and at the end parent folder
             let mut new_folders_list: BTreeMap<String, FolderEntry> = Default::default();
-            for (name,folder_entry) in folders_checked {
+            for (name, folder_entry) in folders_checked {
                 if folder_entry.is_empty != FolderEmptiness::No && self.empty_folder_list.contains_key(&name) {
                     new_folders_list.insert(name, folder_entry);
                 }
@@ -235,10 +235,10 @@ impl EmptyFolder {
     fn delete_empty_folders(&mut self) {
         let start_time: SystemTime = SystemTime::now();
         // Folders may be deleted or require too big privileges
-        for (name,_folder_entry) in &self.empty_folder_list {
+        for name in self.empty_folder_list.keys() {
             match fs::remove_dir_all(name) {
                 Ok(_) => (),
-                Err(_) => self.text_messages.warnings.push(format!("Failed to remove folder {}",name)),
+                Err(_) => self.text_messages.warnings.push(format!("Failed to remove folder {}", name)),
             };
         }
diff --git a/czkawka_gui/src/main.rs b/czkawka_gui/src/main.rs
index 929af76..70dac96 100644
--- a/czkawka_gui/src/main.rs
+++ b/czkawka_gui/src/main.rs
@@ -279,7 +279,7 @@ fn main() {
         let duplicates_group: usize;
 
         match check_method {
-            CheckingMethod::Hash => {
+            CheckingMethod::Hash | CheckingMethod::HashMB => {
                 duplicates_number = information.number_of_duplicated_files_by_hash;
                 duplicates_size = information.lost_space_by_hash;
                 duplicates_group = information.number_of_groups_by_hash;
@@ -315,7 +315,7 @@ fn main() {
         let col_indices = [0, 1, 2, 3];
 
         match check_method {
-            CheckingMethod::Hash => {
+            CheckingMethod::Hash | CheckingMethod::HashMB => {
                 let btreemap = df.get_files_sorted_by_hash();
                 for (size, vectors_vector) in btreemap.iter().rev() {
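
For reference, here is the essence of the new HashMB mode pulled out of the diff into a minimal self-contained sketch: hash at most the first 1MB of a file with the same blake3 crate the patch uses, and return the hex digest. The function name partial_hash and the io::Result-based error handling are illustrative choices for this sketch, not czkawka API; the 16KB buffer and 1MB cutoff mirror the constants in the patch above.

// Requires the blake3 crate (the same dependency czkawka already uses).
use std::fs::File;
use std::io::Read;

const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // same 1MB cutoff as the patch

// Hash roughly the first 1MB of the file at `path` and return the hex digest.
// As in the patch, the limit is checked after each read, so if a short read
// lands just under the boundary the next read can push the hashed total
// slightly past 1MB.
fn partial_hash(path: &str) -> std::io::Result<String> {
    let mut file = File::open(path)?;
    let mut hasher = blake3::Hasher::new();
    let mut buffer = [0u8; 16384]; // same 16KB read buffer as the patch
    let mut read_bytes: u64 = 0;
    loop {
        let n = file.read(&mut buffer)?;
        if n == 0 {
            break; // EOF before reaching the limit
        }
        read_bytes += n as u64;
        hasher.update(&buffer[..n]);
        if read_bytes >= HASH_MB_LIMIT_BYTES {
            break; // 1MB hashed: enough for a HashMB-style prefilter
        }
    }
    Ok(hasher.finalize().to_hex().to_string())
}

Files whose first-1MB hashes differ cannot be byte-identical, so HashMB cheaply rules out most same-size non-duplicates without reading whole files; two files can still share their first 1MB yet differ later, which is why the full-file Hash method remains available as the more accurate (and slower) option.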