diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs index 1cc4b39..d018319 100644 --- a/czkawka_core/src/duplicate.rs +++ b/czkawka_core/src/duplicate.rs @@ -33,7 +33,7 @@ pub enum DeleteMethod { OneNewest, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct FileEntry { pub path: PathBuf, pub size: u64, @@ -51,7 +51,10 @@ pub struct Info { pub number_of_duplicated_files_by_size: usize, pub number_of_groups_by_hash: usize, pub number_of_duplicated_files_by_hash: usize, + pub number_of_duplicated_files_after_pre_hash: usize, + pub number_of_groups_after_pre_hash: usize, pub lost_space_by_size: u64, + pub lost_space_after_pre_hash: u64, pub lost_space_by_hash: u64, pub bytes_read_when_hashing: u64, pub number_of_removed_files: usize, @@ -208,21 +211,21 @@ impl DuplicateFinder { Ok(t) => t, Err(_) => { self.text_messages.warnings.push(format!("Cannot read entry in dir {}", current_folder.display())); - continue; + continue 'dir; } //Permissions denied }; let metadata: Metadata = match entry_data.metadata() { Ok(t) => t, Err(_) => { self.text_messages.warnings.push(format!("Cannot read metadata in dir {}", current_folder.display())); - continue; + continue 'dir; } //Permissions denied }; if metadata.is_dir() { self.information.number_of_checked_folders += 1; if !self.recursive_search { - continue; + continue 'dir; } let next_folder = current_folder.join(entry_data.file_name()); @@ -239,7 +242,7 @@ impl DuplicateFinder { // let mut have_valid_extension: bool; let file_name_lowercase: String = match entry_data.file_name().into_string() { Ok(t) => t, - Err(_) => continue, + Err(_) => continue 'dir, } .to_lowercase(); @@ -273,7 +276,7 @@ impl DuplicateFinder { }, Err(_) => { self.text_messages.warnings.push(format!("Unable to get modification date from file {}", current_file_name.display())); - continue; + continue 'dir; } // Permissions Denied }, }; @@ -318,10 +321,58 @@ impl DuplicateFinder { let start_time: SystemTime = SystemTime::now(); let 
mut file_handler: File; let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>>; + let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default(); + // 1 step - check only small part of file hash for (size, vector) in &self.files_with_identical_size { hashmap_with_hash = Default::default(); + for file_entry in vector { + if rx.is_some() && rx.unwrap().try_recv().is_ok() { + return false; + } + file_handler = match File::open(&file_entry.path) { + Ok(t) => t, + Err(_) => { + self.text_messages.warnings.push(format!("Unable to check hash of file {}", file_entry.path.display())); + continue; + } + }; + + let mut hasher: blake3::Hasher = blake3::Hasher::new(); + let mut buffer = [0u8; 1024 * 2]; + let n = match file_handler.read(&mut buffer) { + Ok(t) => t, + Err(_) => { + self.text_messages.warnings.push(format!("Error happened when checking hash of file {}", file_entry.path.display())); + continue; + } + }; + + self.information.bytes_read_when_hashing += n as u64; + hasher.update(&buffer[..n]); + + let hash_string: String = hasher.finalize().to_hex().to_string(); + hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new); + hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned()); + } + for (_string, mut vector) in hashmap_with_hash { + if vector.len() > 1 { + pre_checked_map.entry(*size).or_insert_with(Vec::new); + pre_checked_map.get_mut(size).unwrap().append(&mut vector); + } + } + } + for (size, vector) in pre_checked_map.iter() { + self.information.number_of_duplicated_files_after_pre_hash += vector.len() - 1; + self.information.number_of_groups_after_pre_hash += 1; + self.information.lost_space_after_pre_hash += (vector.len() as u64 - 1) * size; + } + + // 2 step - Check full file hash + for (size, vector) in &pre_checked_map { + hashmap_with_hash = Default::default(); + for file_entry in vector { if rx.is_some() && rx.unwrap().try_recv().is_ok() { return false; @@ -337,7 +388,7 @@ impl DuplicateFinder { let mut error_reading_file: bool =
false; let mut hasher: blake3::Hasher = blake3::Hasher::new(); - let mut buffer = [0u8; 16384]; + let mut buffer = [0u8; 32 * 1024]; let mut read_bytes: u64 = 0; loop { let n = match file_handler.read(&mut buffer) { @@ -448,11 +499,20 @@ impl DebugPrint for DuplicateFinder { "Number of duplicated files by size(in groups) - {} ({})", self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size ); + println!( + "Number of duplicated files after pre hash(in groups) - {} ({})", + self.information.number_of_duplicated_files_after_pre_hash, self.information.number_of_groups_after_pre_hash + ); println!( "Number of duplicated files by hash(in groups) - {} ({})", self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash ); println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size); + println!( + "Lost space after pre hash - {} ({} bytes)", + self.information.lost_space_after_pre_hash.file_size(options::BINARY).unwrap(), + self.information.lost_space_after_pre_hash + ); println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash); println!( "Gained space by removing duplicated entries - {} ({} bytes)", diff --git a/czkawka_gui/src/main.rs b/czkawka_gui/src/main.rs index bb559b5..636f9e0 100644 --- a/czkawka_gui/src/main.rs +++ b/czkawka_gui/src/main.rs @@ -1583,7 +1583,7 @@ fn main() { for (size, vectors_vector) in btreemap.iter().rev() { for vector in vectors_vector { let values: [&dyn ToValue; 6] = [ - &(vector.len().to_string() + " x " + size.to_string().as_str()), + &(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)), &(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as 
u64)), &"".to_string(), // No text in 3 column &(0), // Not used here @@ -1611,7 +1611,7 @@ fn main() { for (size, vector) in btreemap.iter().rev() { let values: [&dyn ToValue; 6] = [ - &(vector.len().to_string() + " x " + size.to_string().as_str()), + &(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)), &(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)), &"".to_string(), // No text in 3 column &(0), // Not used here