From 7ec665ab7a8b72d3e49a3edc55c6a651b6e0a1b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com>
Date: Thu, 14 Jan 2021 12:17:15 +0100
Subject: [PATCH] Add cache for duplicate (#205)

---
 README.md                          |   9 +-
 czkawka_core/src/duplicate.rs      | 342 ++++++++++++++++++++++++-----
 czkawka_core/src/similar_images.rs |  26 +--
 instructions/Instruction.md        |   8 +-
 4 files changed, 311 insertions(+), 74 deletions(-)

diff --git a/README.md b/README.md
index 9b4f794..8e13d5f 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,11 @@
 - Written in memory safe Rust
 - Amazingly fast - due using more or less advanced algorithms and multithreading support
 - Free, Open Source without ads
-- Multiplatform - works on Linux, Windows and macOS
+- Multiplatform - works on Linux, Windows and macOS
+- Cache support - second and further scans should be a lot faster than the first one
 - CLI frontend, very fast to automate tasks
-- GUI GTK frontend - uses modern GTK 3 and looks similar to FSlint
-- Light/Dark theme match the appearance of the system(Linux only)
-- Saving results to a file - allows reading entries found by the tool easily
+- GUI frontend - uses modern GTK 3 and looks similar to FSlint
 - Rich search option - allows setting absolute included and excluded directories, set of allowed file extensions or excluded items with * wildcard
-- Image previews to get quick view at the compared photos
 - Multiple tools to use:
   - Duplicates - Finds duplicates basing on file name, size, hash, first 1 MB of hash
   - Empty Folders - Finds empty folders with the help of advanced algorithm
@@ -228,6 +226,7 @@ So still is a big room for improvements.
 | Non stripped binaries   |      | X        |      |
 | Redundant whitespace    |      | X        |      |
 | Multiple languages(po)  |      | X        | X    |
+| Cache support           | X    |          | X    |
 | Project Activity        | High | Very Low | High |
 
 ## Contributions
diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index eea4314..9479ab7 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -1,9 +1,9 @@
 use crossbeam_channel::Receiver;
 use humansize::{file_size_opts as options, FileSize};
 use std::collections::{BTreeMap, HashMap};
-use std::fs::{File, Metadata};
+use std::fs::{File, Metadata, OpenOptions};
 use std::io::prelude::*;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use std::{fs, thread};
 
@@ -13,14 +13,17 @@ use crate::common_extensions::Extensions;
 use crate::common_items::ExcludedItems;
 use crate::common_messages::Messages;
 use crate::common_traits::*;
+use directories_next::ProjectDirs;
 use rayon::prelude::*;
-use std::io::BufWriter;
+use std::io::{BufReader, BufWriter};
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Arc;
 use std::thread::sleep;
 
 const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // 1MB
 
+const CACHE_FILE_NAME: &str = "cache_duplicates.txt";
+
 #[derive(Debug)]
 pub struct ProgressData {
     pub checking_method: CheckingMethod,
@@ -39,7 +42,7 @@ pub enum CheckingMethod {
     HashMB,
 }
 
-#[derive(PartialEq, Eq, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug, Copy)]
 pub enum HashType {
     Blake3,
 }
@@ -58,6 +61,7 @@ pub struct FileEntry {
     pub path: PathBuf,
     pub size: u64,
     pub modified_date: u64,
+    pub hash: String,
 }
 
 /// Info struck with helpful information's about results
@@ -349,6 +353,7 @@ impl DuplicateFinder {
                             continue 'dir;
                         } // Permissions Denied
                     },
+                    hash: "".to_string(),
                 };
 
                 // Adding files to BTreeMap
@@ -520,6 +525,7 @@ impl DuplicateFinder {
continue 'dir; } // Permissions Denied }, + hash: "".to_string(), }; // Adding files to BTreeMap @@ -631,8 +637,8 @@ impl DuplicateFinder { hasher.update(&buffer[..n]); let hash_string: String = hasher.finalize().to_hex().to_string(); - hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new); - hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned()); + hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new); + hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone()); } Some((*size, hashmap_with_hash, errors, bytes_read)) }) @@ -700,60 +706,191 @@ impl DuplicateFinder { //// PROGRESS THREAD END #[allow(clippy::type_complexity)] - let full_hash_results: Vec<(u64, HashMap>, Vec, u64)> = pre_checked_map - .par_iter() - .map(|(size, vec_file_entry)| { - let mut hashmap_with_hash: HashMap> = Default::default(); - let mut errors: Vec = Vec::new(); - let mut file_handler: File; - let mut bytes_read: u64 = 0; - atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); - 'fe: for file_entry in vec_file_entry { - if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { - check_was_breaked.store(true, Ordering::Relaxed); - return None; - } - file_handler = match File::open(&file_entry.path) { - Ok(t) => t, - Err(_) => { - errors.push(format!("Unable to check hash of file {}", file_entry.path.display())); - continue 'fe; - } - }; + let mut full_hash_results: Vec<(u64, HashMap>, Vec, u64)>; - let mut hasher: blake3::Hasher = blake3::Hasher::new(); - let mut buffer = [0u8; 1024 * 32]; - let mut current_file_read_bytes: u64 = 0; - - loop { - let n = match file_handler.read(&mut buffer) { - Ok(t) => t, - Err(_) => { - errors.push(format!("Error happened when checking hash of file {}", file_entry.path.display())); - continue 'fe; + match self.check_method { + CheckingMethod::HashMB => { + full_hash_results = pre_checked_map + .par_iter() + .map(|(size, vec_file_entry)| { + let mut hashmap_with_hash: HashMap> = Default::default(); + let mut errors: Vec = Vec::new(); + let mut file_handler: File; + let mut bytes_read: u64 = 0; + atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); + 'fe: for file_entry in vec_file_entry { + if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { + check_was_breaked.store(true, Ordering::Relaxed); + return None; } - }; - if n == 0 { - break; + file_handler = match File::open(&file_entry.path) { + Ok(t) => t, + Err(_) => { + errors.push(format!("Unable to check hash of file {}", file_entry.path.display())); + continue 'fe; + } + }; + + let mut hasher: blake3::Hasher = blake3::Hasher::new(); + let mut buffer = [0u8; 1024 * 128]; + let mut current_file_read_bytes: u64 = 0; + + loop { + let n = match file_handler.read(&mut buffer) { + Ok(t) => t, + Err(_) => { + errors.push(format!("Error happened when checking hash of file {}", file_entry.path.display())); + continue 'fe; + } + }; + if n == 0 { + break; + } + + current_file_read_bytes += n as u64; + bytes_read += n as u64; + hasher.update(&buffer[..n]); + + if current_file_read_bytes >= HASH_MB_LIMIT_BYTES { + break; + } + } + + let hash_string: String = hasher.finalize().to_hex().to_string(); + hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new); + hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned()); } + Some((*size, hashmap_with_hash, errors, bytes_read)) + }) + .while_some() + .collect(); + } + CheckingMethod::Hash => { + 
let loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, &self.hash_type) { + Some(t) => t, + None => Default::default(), + }; - current_file_read_bytes += n as u64; - bytes_read += n as u64; - hasher.update(&buffer[..n]); + let mut records_already_cached: HashMap> = Default::default(); + let mut non_cached_files_to_check: HashMap> = Default::default(); + for (size, vec_file_entry) in pre_checked_map { + #[allow(clippy::collapsible_if)] + if !loaded_hash_map.contains_key(&size) { + // If loaded data doesn't contains current info + non_cached_files_to_check.insert(size, vec_file_entry); + } else { + let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap(); - if self.check_method == CheckingMethod::HashMB && current_file_read_bytes >= HASH_MB_LIMIT_BYTES { - break; + for file_entry in vec_file_entry { + let mut found: bool = false; + for loaded_file_entry in loaded_vec_file_entry { + if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date { + records_already_cached.entry(file_entry.size).or_insert_with(Vec::new); + records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone()); + found = true; + break; + } + } + + if !found { + non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new); + non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry); + } } } - - let hash_string: String = hasher.finalize().to_hex().to_string(); - hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new); - hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned()); } - Some((*size, hashmap_with_hash, errors, bytes_read)) - }) - .while_some() - .collect(); + + full_hash_results = non_cached_files_to_check + .par_iter() + .map(|(size, vec_file_entry)| { + let mut hashmap_with_hash: HashMap> = Default::default(); + let mut errors: Vec = Vec::new(); + let mut file_handler: File; + let mut bytes_read: u64 = 0; + atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed); + 'fe: for file_entry in vec_file_entry { + if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { + check_was_breaked.store(true, Ordering::Relaxed); + return None; + } + file_handler = match File::open(&file_entry.path) { + Ok(t) => t, + Err(_) => { + errors.push(format!("Unable to check hash of file {}", file_entry.path.display())); + continue 'fe; + } + }; + + let mut hasher: blake3::Hasher = blake3::Hasher::new(); + let mut buffer = [0u8; 1024 * 128]; + + loop { + let n = match file_handler.read(&mut buffer) { + Ok(t) => t, + Err(_) => { + errors.push(format!("Error happened when checking hash of file {}", file_entry.path.display())); + continue 'fe; + } + }; + if n == 0 { + break; + } + + bytes_read += n as u64; + hasher.update(&buffer[..n]); + } + + let hash_string: String = hasher.finalize().to_hex().to_string(); + let mut file_entry = file_entry.clone(); + file_entry.hash = hash_string.clone(); + hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new); + hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry); + } + Some((*size, hashmap_with_hash, errors, bytes_read)) + }) + .while_some() + .collect(); + + // Size, Vec + + 'main: for (size, vec_file_entry) in records_already_cached { + // Check if size already exists, if exists we must to change it outside because cannot have mut and non mut reference to full_hash_results + for (full_size, full_hashmap, _errors, _bytes_read) in &mut full_hash_results { 
+                        if size == *full_size {
+                            for file_entry in vec_file_entry {
+                                full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
+                                full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
+                            }
+                            continue 'main;
+                        }
+                    }
+                    // Size doesn't exists add results to files
+                    let mut temp_hashmap: HashMap<String, Vec<FileEntry>> = Default::default();
+                    for file_entry in vec_file_entry {
+                        temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
+                        temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
+                    }
+                    full_hash_results.push((size, temp_hashmap, Vec::new(), 0));
+                }
+
+                // Must save all results to file, old loaded from file with all currently counted results
+                let mut all_results: HashMap<String, FileEntry> = Default::default();
+                for (_size, vec_file_entry) in loaded_hash_map {
+                    for file_entry in vec_file_entry {
+                        all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
+                    }
+                }
+                for (_size, hashmap, _errors, _bytes_read) in &full_hash_results {
+                    for vec_file_entry in hashmap.values() {
+                        for file_entry in vec_file_entry {
+                            all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
+                        }
+                    }
+                }
+                save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type);
+            }
+            _ => panic!("What"),
+        }
 
         // End thread which send info to gui
         progress_thread_run.store(false, Ordering::Relaxed);
@@ -1169,3 +1306,104 @@ fn delete_files(vector: &[FileEntry], delete_method: &DeleteMethod, warnings: &m
     };
     (gained_space, removed_files, failed_to_remove_files)
 }
+
+fn save_hashes_to_file(hashmap: &HashMap<String, FileEntry>, text_messages: &mut Messages, type_of_hash: &HashType) {
+    println!("Trying to save {} files", hashmap.len());
+    if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
+        let cache_dir = PathBuf::from(proj_dirs.cache_dir());
+        if cache_dir.exists() {
+            if !cache_dir.is_dir() {
+                text_messages.messages.push(format!("Config dir {} is a file!", cache_dir.display()));
+                return;
+            }
+        } else if fs::create_dir_all(&cache_dir).is_err() {
+            text_messages.messages.push(format!("Cannot create config dir {}", cache_dir.display()));
+            return;
+        }
+        let cache_file = cache_dir.join(CACHE_FILE_NAME.replace(".", format!("_{:?}.", type_of_hash).as_str()));
+        let file_handler = match OpenOptions::new().truncate(true).write(true).create(true).open(&cache_file) {
+            Ok(t) => t,
+            Err(_) => {
+                text_messages.messages.push(format!("Cannot create or open cache file {}", cache_file.display()));
+                return;
+            }
+        };
+        let mut writer = BufWriter::new(file_handler);
+
+        for file_entry in hashmap.values() {
+            // Only cache bigger than 5MB files
+            if file_entry.size > 5 * 1024 * 1024 {
+                let string: String = format!("{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.modified_date, file_entry.hash);
+
+                if writeln!(writer, "{}", string).is_err() {
+                    text_messages.messages.push(format!("Failed to save some data to cache file {}", cache_file.display()));
+                    return;
+                };
+            }
+        }
+    }
+}
+
+fn load_hashes_from_file(text_messages: &mut Messages, type_of_hash: &HashType) -> Option<BTreeMap<u64, Vec<FileEntry>>> {
+    if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
+        let cache_dir = PathBuf::from(proj_dirs.cache_dir());
+        let cache_file = cache_dir.join(CACHE_FILE_NAME.replace(".", format!("_{:?}.", type_of_hash).as_str()));
+        let file_handler = match OpenOptions::new().read(true).open(&cache_file) {
+            Ok(t) => t,
+            Err(_) => {
+                // text_messages.messages.push(format!("Cannot find or open cache file {}", cache_file.display())); // This shouldn't be write to output
+                return None;
+            }
+        };
+
+        let reader = BufReader::new(file_handler);
+
+        let mut hashmap_loaded_entries: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+
+        // Read the file line by line using the lines() iterator from std::io::BufRead.
+        for (index, line) in reader.lines().enumerate() {
+            let line = match line {
+                Ok(t) => t,
+                Err(_) => {
+                    text_messages.warnings.push(format!("Failed to load line number {} from cache file {}", index + 1, cache_file.display()));
+                    return None;
+                }
+            };
+            let uuu = line.split("//").collect::<Vec<&str>>();
+            if uuu.len() != 4 {
+                text_messages
+                    .warnings
+                    .push(format!("Found invalid data(too much or too low amount of data) in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
+                continue;
+            }
+            // Don't load cache data if destination file not exists
+            if Path::new(uuu[0]).exists() {
+                let file_entry = FileEntry {
+                    path: PathBuf::from(uuu[0]),
+                    size: match uuu[1].parse::<u64>() {
+                        Ok(t) => t,
+                        Err(_) => {
+                            text_messages.warnings.push(format!("Found invalid size value in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
+                            continue;
+                        }
+                    },
+                    modified_date: match uuu[2].parse::<u64>() {
+                        Ok(t) => t,
+                        Err(_) => {
+                            text_messages.warnings.push(format!("Found invalid modified date value in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
+                            continue;
+                        }
+                    },
+                    hash: uuu[3].to_string(),
+                };
+                hashmap_loaded_entries.entry(file_entry.size).or_insert_with(Vec::new);
+                hashmap_loaded_entries.get_mut(&file_entry.size).unwrap().push(file_entry);
+            }
+        }
+
+        return Some(hashmap_loaded_entries);
+    }
+
+    text_messages.messages.push("Cannot find or open system config dir to save cache file".to_string());
+    None
+}
diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs
index 4d1f8e3..0c2f8ea 100644
--- a/czkawka_core/src/similar_images.rs
+++ b/czkawka_core/src/similar_images.rs
@@ -326,20 +326,20 @@ impl SimilarImages {
             None => Default::default(),
         };
 
-        let mut hashes_already_counted: HashMap<String, FileEntry> = Default::default();
-        let mut hashes_to_check: HashMap<String, FileEntry> = Default::default();
+        let mut records_already_cached: HashMap<String, FileEntry> = Default::default();
+        let mut non_cached_files_to_check: HashMap<String, FileEntry> = Default::default();
         for (name, file_entry) in &self.images_to_check {
             #[allow(clippy::collapsible_if)]
             if !loaded_hash_map.contains_key(name) {
                 // If loaded data doesn't contains current image info
-                hashes_to_check.insert(name.clone(), file_entry.clone());
+                non_cached_files_to_check.insert(name.clone(), file_entry.clone());
             } else {
                 if file_entry.size != loaded_hash_map.get(name).unwrap().size || file_entry.modified_date != loaded_hash_map.get(name).unwrap().modified_date {
                     // When size or modification date of image changed, then it is clear that is different image
-                    hashes_to_check.insert(name.clone(), file_entry.clone());
+                    non_cached_files_to_check.insert(name.clone(), file_entry.clone());
                 } else {
                     // Checking may be omitted when already there is entry with same size and modification date
-                    hashes_already_counted.insert(name.clone(), loaded_hash_map.get(name).unwrap().clone());
+                    records_already_cached.insert(name.clone(), loaded_hash_map.get(name).unwrap().clone());
                 }
             }
         }
@@ -358,7 +358,7 @@ impl SimilarImages {
             let mut progress_send = progress_sender.clone();
             let progress_thread_run = progress_thread_run.clone();
             let atomic_file_counter = atomic_file_counter.clone();
-            let images_to_check = hashes_to_check.len();
+            let images_to_check = non_cached_files_to_check.len();
             progress_thread_handle = thread::spawn(move ||
loop {
                 progress_send
                     .try_send(ProgressData {
@@ -377,7 +377,7 @@ impl SimilarImages {
             progress_thread_handle = thread::spawn(|| {});
         }
         //// PROGRESS THREAD END
-        let mut vec_file_entry: Vec<(FileEntry, Node)> = hashes_to_check
+        let mut vec_file_entry: Vec<(FileEntry, Node)> = non_cached_files_to_check
             .par_iter()
             .map(|file_entry| {
                 atomic_file_counter.fetch_add(1, Ordering::Relaxed);
@@ -416,7 +416,7 @@ impl SimilarImages {
         let hash_map_modification = SystemTime::now();
 
         // Just connect loaded results with already calculated hashes
-        for (_name, file_entry) in hashes_already_counted {
+        for (_name, file_entry) in records_already_cached {
             vec_file_entry.push((file_entry.clone(), file_entry.hash));
         }
 
@@ -457,15 +457,15 @@ impl SimilarImages {
         // Maybe also add here progress report
 
         let mut new_vector: Vec<Vec<FileEntry>> = Vec::new();
-        let mut hashes_to_check = self.image_hashes.clone();
+        let mut non_cached_files_to_check = self.image_hashes.clone();
         for (hash, vec_file_entry) in &self.image_hashes {
             if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
                 return false;
             }
-            if !hashes_to_check.contains_key(hash) {
+            if !non_cached_files_to_check.contains_key(hash) {
                 continue;
             }
-            hashes_to_check.remove(hash);
+            non_cached_files_to_check.remove(hash);
 
             let vector_with_found_similar_hashes = self.bktree.find(hash, similarity).collect::<Vec<_>>();
             if vector_with_found_similar_hashes.len() == 1 && vec_file_entry.len() == 1 {
@@ -493,7 +493,7 @@ impl SimilarImages {
                 panic!("I'm not sure if same hash can have distance > 0");
             }
 
-            if let Some(vec_file_entry) = hashes_to_check.get(*similar_hash) {
+            if let Some(vec_file_entry) = non_cached_files_to_check.get(*similar_hash) {
                 vector_of_similar_images.append(
                     &mut (vec_file_entry
                         .iter()
@@ -515,7 +515,7 @@ impl SimilarImages {
                         })
                         .collect::<Vec<_>>()),
                 );
-                hashes_to_check.remove(*similar_hash);
+                non_cached_files_to_check.remove(*similar_hash);
             }
         }
         if vector_of_similar_images.len() > 1 {
diff --git a/instructions/Instruction.md b/instructions/Instruction.md
index a924f74..903cacc 100644
--- a/instructions/Instruction.md
+++ b/instructions/Instruction.md
@@ -137,17 +137,17 @@ Only some image extensions are supported, because I rely on image crate. Also so
 
 ## Config/Cache files
 For now Czkawka store only 2 files on disk:
 - `czkawka_gui_config.txt` - stores configuration of GUI which may be loaded at startup
-- `cache_similar_image.txt` - stores cache data and hashes which may be used later without needing to compute image hash again - DO NOT TRY TO EDIT THIS FILE MANUALLY! - editing this file may cause app crashes.
+- `cache_similar_image.txt` - stores cache data and hashes which may be used later without needing to compute image hash again - editing this file may cause app crashes.
 - `cache_broken_files.txt` - stores cache data of broken files
+- `cache_duplicates_Blake3.txt` - stores cache data of duplicated files; to avoid a big performance hit when saving/loading this file, only files bigger than 5 MB that were already fully hashed are stored. Similar files with `Blake3` replaced by e.g. `SHA256` may appear when support for new hashes is introduced in Czkawka.
 
-
-First file is located in this path
+Config files are located in this path
 Linux - `/home/username/.config/czkawka`
 Mac - `/Users/username/Library/Application Support/pl.Qarmin.Czkawka`
 Windows - `C:\Users\Username\AppData\Roaming\Qarmin\Czkawka\config`
 
-Second with cache here:
+Cache should be here:
 Linux - `/home/username/.cache/czkawka`
 Mac - `/Users/Username/Library/Caches/pl.Qarmin.Czkawka`
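
For readers who want to inspect or post-process the duplicate cache introduced by this patch, the following is a minimal standalone sketch (not part of the patch) of parsing one line of `cache_duplicates_Blake3.txt`. It follows the `path//size//modified_date//hash` layout that `save_hashes_to_file` writes above; the `CachedEntry` struct, the `parse_cache_line` helper and the example line are illustrative names invented here, not Czkawka APIs.

    use std::path::PathBuf;

    #[derive(Debug)]
    struct CachedEntry {
        path: PathBuf,
        size: u64,
        modified_date: u64,
        hash: String,
    }

    // Parse one `path//size//modified_date//hash` cache line; returns None for malformed
    // lines, which the patch's load_hashes_from_file warns about and skips.
    fn parse_cache_line(line: &str) -> Option<CachedEntry> {
        let parts: Vec<&str> = line.split("//").collect();
        if parts.len() != 4 {
            return None;
        }
        Some(CachedEntry {
            path: PathBuf::from(parts[0]),
            size: parts[1].parse().ok()?,
            modified_date: parts[2].parse().ok()?,
            hash: parts[3].to_string(),
        })
    }

    fn main() {
        // Hypothetical entry; the patch only writes entries for files bigger than 5 MB.
        let line = "/home/user/video.mkv//734003200//1610622000//abcdef0123456789";
        println!("{:?}", parse_cache_line(line));
    }

Note that, exactly as in the patch's own parser, a path that itself contains the `//` separator would split into extra fields and be treated as a malformed line.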