1
0
Fork 0
mirror of synced 2024-04-27 17:22:13 +12:00

Add pre hash check (#83)

This commit is contained in:
Rafał Mikrut 2020-10-24 00:56:39 -04:00 committed by GitHub
parent 8ecde0fc9a
commit d996c3c46b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 69 additions and 9 deletions

View file

@@ -33,7 +33,7 @@ pub enum DeleteMethod {
OneNewest,
}
#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct FileEntry {
pub path: PathBuf,
pub size: u64,
@@ -51,7 +51,10 @@ pub struct Info {
pub number_of_duplicated_files_by_size: usize,
pub number_of_groups_by_hash: usize,
pub number_of_duplicated_files_by_hash: usize,
pub number_of_duplicated_files_after_pre_hash: usize,
pub number_of_groups_after_pre_hash: usize,
pub lost_space_by_size: u64,
pub lost_space_after_pre_hash: u64,
pub lost_space_by_hash: u64,
pub bytes_read_when_hashing: u64,
pub number_of_removed_files: usize,
@@ -208,21 +211,21 @@ impl DuplicateFinder {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Cannot read entry in dir {}", current_folder.display()));
continue;
continue 'dir;
} //Permissions denied
};
let metadata: Metadata = match entry_data.metadata() {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Cannot read metadata in dir {}", current_folder.display()));
continue;
continue 'dir;
} //Permissions denied
};
if metadata.is_dir() {
self.information.number_of_checked_folders += 1;
if !self.recursive_search {
continue;
continue 'dir;
}
let next_folder = current_folder.join(entry_data.file_name());
@@ -239,7 +242,7 @@ impl DuplicateFinder {
// let mut have_valid_extension: bool;
let file_name_lowercase: String = match entry_data.file_name().into_string() {
Ok(t) => t,
Err(_) => continue,
Err(_) => continue 'dir,
}
.to_lowercase();
@@ -273,7 +276,7 @@ impl DuplicateFinder {
},
Err(_) => {
self.text_messages.warnings.push(format!("Unable to get modification date from file {}", current_file_name.display()));
continue;
continue 'dir;
} // Permissions Denied
},
};
@@ -318,10 +321,58 @@ impl DuplicateFinder {
let start_time: SystemTime = SystemTime::now();
let mut file_handler: File;
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>>;
let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
// 1 step - check only small part of file hash
for (size, vector) in &self.files_with_identical_size {
hashmap_with_hash = Default::default();
for file_entry in vector {
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
return false;
}
file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Unable to check hash of file {}", file_entry.path.display()));
continue;
}
};
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 1024 * 2];
let n = match file_handler.read(&mut buffer) {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
continue;
}
};
self.information.bytes_read_when_hashing += n as u64;
hasher.update(&buffer[..n]);
let hash_string: String = hasher.finalize().to_hex().to_string();
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
}
for (_string, mut vector) in hashmap_with_hash {
if vector.len() > 1 {
pre_checked_map.entry(*size).or_insert_with(Vec::new);
pre_checked_map.get_mut(size).unwrap().append(&mut vector);
}
}
}
for (size, vector) in pre_checked_map.iter() {
self.information.number_of_duplicated_files_after_pre_hash += vector.len() - 1;
self.information.number_of_groups_after_pre_hash += 1;
self.information.lost_space_after_pre_hash += (vector.len() as u64 - 1) * size;
}
// 2 step - Check full file hash
for (size, vector) in &pre_checked_map {
hashmap_with_hash = Default::default();
for file_entry in vector {
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
return false;
@@ -337,7 +388,7 @@ impl DuplicateFinder {
let mut error_reading_file: bool = false;
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 16384];
let mut buffer = [0u8; 32 * 1024];
let mut read_bytes: u64 = 0;
loop {
let n = match file_handler.read(&mut buffer) {
@@ -448,11 +499,20 @@ impl DebugPrint for DuplicateFinder {
"Number of duplicated files by size(in groups) - {} ({})",
self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size
);
println!(
"Number of duplicated files after pre hash(in groups) - {} ({})",
self.information.number_of_duplicated_files_after_pre_hash, self.information.number_of_groups_after_pre_hash
);
println!(
"Number of duplicated files by hash(in groups) - {} ({})",
self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash
);
println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size);
println!(
"Lost space after pre hash - {} ({} bytes)",
self.information.lost_space_after_pre_hash.file_size(options::BINARY).unwrap(),
self.information.lost_space_after_pre_hash
);
println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash);
println!(
"Gained space by removing duplicated entries - {} ({} bytes)",

View file

@@ -1583,7 +1583,7 @@ fn main() {
for (size, vectors_vector) in btreemap.iter().rev() {
for vector in vectors_vector {
let values: [&dyn ToValue; 6] = [
&(vector.len().to_string() + " x " + size.to_string().as_str()),
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
&"".to_string(), // No text in 3 column
&(0), // Not used here
@@ -1611,7 +1611,7 @@ fn main() {
for (size, vector) in btreemap.iter().rev() {
let values: [&dyn ToValue; 6] = [
&(vector.len().to_string() + " x " + size.to_string().as_str()),
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
&"".to_string(), // No text in 3 column
&(0), // Not used here