diff --git a/czkawka_core/src/common_dir_traversal.rs b/czkawka_core/src/common_dir_traversal.rs
index f547f68..fe2414d 100644
--- a/czkawka_core/src/common_dir_traversal.rs
+++ b/czkawka_core/src/common_dir_traversal.rs
@@ -64,7 +64,6 @@ pub struct FileEntry {
     pub path: PathBuf,
     pub size: u64,
     pub modified_date: u64,
-    pub hash: String,
 }
 
 impl ResultEntry for FileEntry {
@@ -468,7 +467,6 @@ fn process_file_in_file_mode(
             size: metadata.len(),
             modified_date: get_modified_time(&metadata, warnings, &current_file_name, false),
             path: current_file_name,
-            hash: String::new(),
         };
 
         fe_result.push(fe);
@@ -543,7 +541,6 @@ fn process_symlink_in_symlink_mode(
             size: metadata.len(),
             modified_date: get_modified_time(&metadata, warnings, &current_file_name, false),
             path: current_file_name,
-            hash: String::new(),
         };
 
         fe_result.push(fe);
diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index f88b2ee..3b36964 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -6,7 +6,7 @@ use std::io::prelude::*;
 use std::io::{self, Error, ErrorKind};
 #[cfg(target_family = "unix")]
 use std::os::unix::fs::MetadataExt;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::atomic::Ordering;
 use std::{fs, mem};
 
@@ -15,6 +15,7 @@ use fun_time::fun_time;
 use humansize::{format_size, BINARY};
 use log::debug;
 use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
 use xxhash_rust::xxh3::Xxh3;
 
 use crate::common::{check_if_stop_received, delete_files_custom, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads};
@@ -44,6 +45,36 @@ impl HashType {
     }
 }
 
+#[derive(Clone, Serialize, Deserialize, Debug, Default)]
+pub struct DuplicateEntry {
+    pub path: PathBuf,
+    pub modified_date: u64,
+    pub size: u64,
+    pub hash: String,
+}
+impl ResultEntry for DuplicateEntry {
+    fn get_path(&self) -> &Path {
+        &self.path
+    }
+    fn get_modified_date(&self) -> u64 {
+        self.modified_date
+    }
+    fn get_size(&self) -> u64 {
+        self.size
+    }
+}
+
+impl FileEntry {
+    fn into_duplicate_entry(self) -> DuplicateEntry {
+        DuplicateEntry {
+            size: self.size,
+            path: self.path,
+            modified_date: self.modified_date,
+            hash: String::new(),
+        }
+    }
+}
+
 #[derive(Default)]
 pub struct Info {
     pub number_of_groups_by_size: usize,
@@ -62,21 +93,21 @@ pub struct DuplicateFinder {
     common_data: CommonToolData,
     information: Info,
     // File Size, File Entry
-    files_with_identical_names: BTreeMap<String, Vec<FileEntry>>,
+    files_with_identical_names: BTreeMap<String, Vec<DuplicateEntry>>,
     // File (Size, Name), File Entry
-    files_with_identical_size_names: BTreeMap<(u64, String), Vec<FileEntry>>,
+    files_with_identical_size_names: BTreeMap<(u64, String), Vec<DuplicateEntry>>,
     // File Size, File Entry
-    files_with_identical_size: BTreeMap<u64, Vec<FileEntry>>,
+    files_with_identical_size: BTreeMap<u64, Vec<DuplicateEntry>>,
     // File Size, next grouped by file size, next grouped by hash
-    files_with_identical_hashes: BTreeMap<u64, Vec<Vec<FileEntry>>>,
+    files_with_identical_hashes: BTreeMap<u64, Vec<Vec<DuplicateEntry>>>,
     // File Size, File Entry
-    files_with_identical_names_referenced: BTreeMap<String, (FileEntry, Vec<FileEntry>)>,
+    files_with_identical_names_referenced: BTreeMap<String, (DuplicateEntry, Vec<DuplicateEntry>)>,
     // File (Size, Name), File Entry
-    files_with_identical_size_names_referenced: BTreeMap<(u64, String), (FileEntry, Vec<FileEntry>)>,
+    files_with_identical_size_names_referenced: BTreeMap<(u64, String), (DuplicateEntry, Vec<DuplicateEntry>)>,
     // File Size, File Entry
-    files_with_identical_size_referenced: BTreeMap<u64, (FileEntry, Vec<FileEntry>)>,
+    files_with_identical_size_referenced: BTreeMap<u64, (DuplicateEntry, Vec<DuplicateEntry>)>,
     // File Size, next grouped by file size, next grouped by hash
-    files_with_identical_hashes_referenced: BTreeMap<u64, Vec<(FileEntry, Vec<FileEntry>)>>,
+    files_with_identical_hashes_referenced: BTreeMap<u64, Vec<(DuplicateEntry, Vec<DuplicateEntry>)>>,
     check_method: CheckingMethod,
     hash_type: HashType,
     ignore_hard_links: bool,
@@ -171,7 +202,16 @@ impl DuplicateFinder {
         self.common_data.text_messages.warnings.extend(warnings);
 
         // Create new BTreeMap without single size entries(files have not duplicates)
-        self.files_with_identical_names = grouped_file_entries.into_iter().filter(|(_name, vector)| vector.len() > 1).collect();
+        self.files_with_identical_names = grouped_file_entries
+            .into_iter()
+            .filter_map(|(name, vector)| {
+                if vector.len() > 1 {
+                    Some((name, vector.into_iter().map(FileEntry::into_duplicate_entry).collect()))
+                } else {
+                    None
+                }
+            })
+            .collect();
 
         // Reference - only use in size, because later hash will be counted differently
         if self.common_data.use_reference_folders {
@@ -188,7 +228,7 @@ impl DuplicateFinder {
                         Some((files_from_referenced_folders.pop().unwrap(), normal_files))
                     }
                 })
-                .collect::<Vec<(FileEntry, Vec<FileEntry>)>>();
+                .collect::<Vec<(DuplicateEntry, Vec<DuplicateEntry>)>>();
             for (fe, vec_fe) in vec {
                 self.files_with_identical_names_referenced.insert(fe.path.to_string_lossy().to_string(), (fe, vec_fe));
             }
@@ -236,7 +276,16 @@ impl DuplicateFinder {
             DirTraversalResult::SuccessFiles { grouped_file_entries, warnings } => {
                 self.common_data.text_messages.warnings.extend(warnings);
 
-                self.files_with_identical_size_names = grouped_file_entries.into_iter().filter(|(_name, vector)| vector.len() > 1).collect();
+                self.files_with_identical_size_names = grouped_file_entries
+                    .into_iter()
+                    .filter_map(|(size_name, vector)| {
+                        if vector.len() > 1 {
+                            Some((size_name, vector.into_iter().map(FileEntry::into_duplicate_entry).collect()))
+                        } else {
+                            None
+                        }
+                    })
+                    .collect();
 
                 // Reference - only use in size, because later hash will be counted differently
                 if self.common_data.use_reference_folders {
@@ -253,7 +302,7 @@ impl DuplicateFinder {
                         Some((files_from_referenced_folders.pop().unwrap(), normal_files))
                     }
                 })
-                .collect::<Vec<(FileEntry, Vec<FileEntry>)>>();
+                .collect::<Vec<(DuplicateEntry, Vec<DuplicateEntry>)>>();
             for (fe, vec_fe) in vec {
                 self.files_with_identical_size_names_referenced
                     .insert((fe.size, fe.path.to_string_lossy().to_string()), (fe, vec_fe));
             }
@@ -313,7 +362,8 @@ impl DuplicateFinder {
                 let vector = if self.ignore_hard_links { filter_hard_links(&vec) } else { vec };
 
                 if vector.len() > 1 {
-                    self.files_with_identical_size.insert(size, vector);
+                    self.files_with_identical_size
+                        .insert(size, vector.into_iter().map(FileEntry::into_duplicate_entry).collect());
                 }
             }
 
@@ -367,7 +417,7 @@ impl DuplicateFinder {
                         Some((files_from_referenced_folders.pop().unwrap(), normal_files))
                     }
                 })
-                .collect::<Vec<(FileEntry, Vec<FileEntry>)>>();
+                .collect::<Vec<(DuplicateEntry, Vec<DuplicateEntry>)>>();
             for (fe, vec_fe) in vec {
                 self.files_with_identical_size_referenced.insert(fe.size, (fe, vec_fe));
             }
@@ -375,17 +425,17 @@ impl DuplicateFinder {
     }
 
     #[fun_time(message = "prehash_load_cache_at_start", level = "debug")]
-    fn prehash_load_cache_at_start(&mut self) -> (BTreeMap<u64, Vec<FileEntry>>, BTreeMap<u64, Vec<FileEntry>>, BTreeMap<u64, Vec<FileEntry>>) {
+    fn prehash_load_cache_at_start(&mut self) -> (BTreeMap<u64, Vec<DuplicateEntry>>, BTreeMap<u64, Vec<DuplicateEntry>>, BTreeMap<u64, Vec<DuplicateEntry>>) {
         // Cache algorithm
         // - Load data from cache
-        // - Convert from BT<u64, Vec<FileEntry>> to BT<String, FileEntry>
+        // - Convert from BT<u64, Vec<DuplicateEntry>> to BT<String, DuplicateEntry>
         // - Save to proper values
         let loaded_hash_map;
 
-        let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
-        let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+        let mut records_already_cached: BTreeMap<u64, Vec<DuplicateEntry>> = Default::default();
+        let mut non_cached_files_to_check: BTreeMap<u64, Vec<DuplicateEntry>> = Default::default();
 
         if self.use_prehash_cache {
-            let (messages, loaded_items) = load_cache_from_file_generalized_by_size::<FileEntry>(
+            let (messages, loaded_items) = load_cache_from_file_generalized_by_size::<DuplicateEntry>(
                 &get_duplicate_cache_file(&self.hash_type, true),
                 self.get_delete_outdated_cache(),
                 &self.files_with_identical_size,
@@ -397,7 +447,7 @@ impl DuplicateFinder {
             for (size, mut vec_file_entry) in mem::take(&mut self.files_with_identical_size) {
                 if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
                     // TODO maybe hashmap is not needed when using < 4 elements
-                    let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                    let mut cached_path_entries: HashMap<&Path, DuplicateEntry> = HashMap::new();
                     for file_entry in cached_vec_file_entry {
                         cached_path_entries.insert(&file_entry.path, file_entry.clone());
                     }
@@ -428,10 +478,14 @@ impl DuplicateFinder {
     }
 
     #[fun_time(message = "prehash_save_cache_at_exit", level = "debug")]
-    fn prehash_save_cache_at_exit(&mut self, loaded_hash_map: BTreeMap<u64, Vec<FileEntry>>, pre_hash_results: &Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)>) {
+    fn prehash_save_cache_at_exit(
+        &mut self,
+        loaded_hash_map: BTreeMap<u64, Vec<DuplicateEntry>>,
+        pre_hash_results: &Vec<(u64, BTreeMap<String, Vec<DuplicateEntry>>, Vec<String>)>,
+    ) {
         if self.use_prehash_cache {
             // All results = records already cached + computed results
-            let mut save_cache_to_hashmap: BTreeMap<String, FileEntry> = Default::default();
+            let mut save_cache_to_hashmap: BTreeMap<String, DuplicateEntry> = Default::default();
 
             for (size, vec_file_entry) in loaded_hash_map {
                 if size >= self.minimal_prehash_cache_file_size {
@@ -466,7 +520,7 @@ impl DuplicateFinder {
         &mut self,
         stop_receiver: Option<&Receiver<()>>,
         progress_sender: Option<&Sender<ProgressData>>,
-        pre_checked_map: &mut BTreeMap<u64, Vec<FileEntry>>,
+        pre_checked_map: &mut BTreeMap<u64, Vec<DuplicateEntry>>,
     ) -> Option<()> {
         let check_type = self.hash_type;
         let (progress_thread_handle, progress_thread_run, _atomic_counter, _check_was_stopped) =
@@ -489,10 +543,10 @@ impl DuplicateFinder {
         debug!("Starting calculating prehash");
         #[allow(clippy::type_complexity)]
-        let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
+        let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<DuplicateEntry>>, Vec<String>)> = non_cached_files_to_check
             .into_par_iter()
             .map(|(size, vec_file_entry)| {
-                let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
+                let mut hashmap_with_hash: BTreeMap<String, Vec<DuplicateEntry>> = Default::default();
                 let mut errors: Vec<String> = Vec::new();
                 let mut buffer = [0u8; 1024 * 32];
@@ -552,16 +606,16 @@ impl DuplicateFinder {
     #[fun_time(message = "full_hashing_load_cache_at_start", level = "debug")]
     fn full_hashing_load_cache_at_start(
         &mut self,
-        mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>>,
-    ) -> (BTreeMap<u64, Vec<FileEntry>>, BTreeMap<u64, Vec<FileEntry>>, BTreeMap<u64, Vec<FileEntry>>) {
+        mut pre_checked_map: BTreeMap<u64, Vec<DuplicateEntry>>,
+    ) -> (BTreeMap<u64, Vec<DuplicateEntry>>, BTreeMap<u64, Vec<DuplicateEntry>>, BTreeMap<u64, Vec<DuplicateEntry>>) {
         let loaded_hash_map;
-        let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
-        let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+        let mut records_already_cached: BTreeMap<u64, Vec<DuplicateEntry>> = Default::default();
+        let mut non_cached_files_to_check: BTreeMap<u64, Vec<DuplicateEntry>> = Default::default();
 
         if self.common_data.use_cache {
             debug!("full_hashing_load_cache_at_start - using cache");
             let (messages, loaded_items) =
-                load_cache_from_file_generalized_by_size::<FileEntry>(&get_duplicate_cache_file(&self.hash_type, false), self.get_delete_outdated_cache(), &pre_checked_map);
+                load_cache_from_file_generalized_by_size::<DuplicateEntry>(&get_duplicate_cache_file(&self.hash_type, false), self.get_delete_outdated_cache(), &pre_checked_map);
             self.get_text_messages_mut().extend_with_another_messages(messages);
             loaded_hash_map = loaded_items.unwrap_or_default();
@@ -569,7 +623,7 @@ impl DuplicateFinder {
             for (size, mut vec_file_entry) in pre_checked_map {
                 if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
                     // TODO maybe hashmap is not needed when using < 4 elements
-                    let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                    let mut cached_path_entries: HashMap<&Path, DuplicateEntry> = HashMap::new();
                     for file_entry in cached_vec_file_entry {
                         cached_path_entries.insert(&file_entry.path, file_entry.clone());
                     }
@@ -603,9 +657,9 @@ impl DuplicateFinder {
     #[fun_time(message = "full_hashing_save_cache_at_exit", level = "debug")]
     fn full_hashing_save_cache_at_exit(
         &mut self,
-        records_already_cached: BTreeMap<u64, Vec<FileEntry>>,
-        full_hash_results: &mut Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)>,
-        loaded_hash_map: BTreeMap<u64, Vec<FileEntry>>,
+        records_already_cached: BTreeMap<u64, Vec<DuplicateEntry>>,
+        full_hash_results: &mut Vec<(u64, BTreeMap<String, Vec<DuplicateEntry>>, Vec<String>)>,
+        loaded_hash_map: BTreeMap<u64, Vec<DuplicateEntry>>,
     ) {
         if !self.common_data.use_cache {
             return;
@@ -621,7 +675,7 @@ impl DuplicateFinder {
             }
         }
         // Size doesn't exists add results to files
-        let mut temp_hashmap: BTreeMap<String, Vec<FileEntry>> = Default::default();
+        let mut temp_hashmap: BTreeMap<String, Vec<DuplicateEntry>> = Default::default();
         for file_entry in vec_file_entry {
             temp_hashmap.entry(file_entry.hash.clone()).or_default().push(file_entry);
         }
@@ -629,7 +683,7 @@ impl DuplicateFinder {
         }
 
         // Must save all results to file, old loaded from file with all currently counted results
-        let mut all_results: BTreeMap<String, FileEntry> = Default::default();
+        let mut all_results: BTreeMap<String, DuplicateEntry> = Default::default();
         for (_size, vec_file_entry) in loaded_hash_map {
             for file_entry in vec_file_entry {
                 all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
@@ -653,7 +707,12 @@ impl DuplicateFinder {
     }
 
     #[fun_time(message = "full_hashing", level = "debug")]
-    fn full_hashing(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&Sender<ProgressData>>, pre_checked_map: BTreeMap<u64, Vec<FileEntry>>) -> Option<()> {
+    fn full_hashing(
+        &mut self,
+        stop_receiver: Option<&Receiver<()>>,
+        progress_sender: Option<&Sender<ProgressData>>,
+        pre_checked_map: BTreeMap<u64, Vec<DuplicateEntry>>,
+    ) -> Option<()> {
         let (progress_thread_handle, progress_thread_run, _atomic_counter, _check_was_stopped) =
             prepare_thread_handler_common(progress_sender, 4, MAX_STAGE, 0, self.check_method, self.common_data.tool_type);
@@ -675,10 +734,10 @@ impl DuplicateFinder {
         let check_type = self.hash_type;
 
         debug!("Starting full hashing of {} files", non_cached_files_to_check.values().map(Vec::len).sum::<usize>());
-        let mut full_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
+        let mut full_hash_results: Vec<(u64, BTreeMap<String, Vec<DuplicateEntry>>, Vec<String>)> = non_cached_files_to_check
             .into_par_iter()
             .map(|(size, vec_file_entry)| {
-                let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
+                let mut hashmap_with_hash: BTreeMap<String, Vec<DuplicateEntry>> = Default::default();
                 let mut errors: Vec<String> = Vec::new();
                 let mut buffer = [0u8; 1024 * 16];
@@ -748,7 +807,7 @@ impl DuplicateFinder {
                         Some(all_results_with_same_size)
                     }
                 })
-                .collect::<Vec<Vec<(FileEntry, Vec<FileEntry>)>>>();
+                .collect::<Vec<Vec<(DuplicateEntry, Vec<DuplicateEntry>)>>>();
             for vec_of_vec in vec {
                 self.files_with_identical_hashes_referenced.insert(vec_of_vec[0].0.size, vec_of_vec);
             }
@@ -777,7 +836,7 @@ impl DuplicateFinder {
     fn check_files_hash(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&Sender<ProgressData>>) -> bool {
         assert_eq!(self.check_method, CheckingMethod::Hash);
 
-        let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
+        let mut pre_checked_map: BTreeMap<u64, Vec<DuplicateEntry>> = Default::default();
         let ret = self.prehashing(stop_receiver, progress_sender, &mut pre_checked_map);
         if ret.is_none() {
             return false;
@@ -813,7 +872,7 @@ impl DuplicateFinder {
             }
             CheckingMethod::Hash => {
                 for vec_files in self.files_with_identical_hashes.values() {
-                    let vev: Vec<&Vec<FileEntry>> = vec_files.iter().collect::<Vec<_>>();
+                    let vev: Vec<&Vec<DuplicateEntry>> = vec_files.iter().collect::<Vec<_>>();
                     delete_files_custom(&vev, &self.common_data.delete_method, &mut self.common_data.text_messages, self.common_data.dry_run);
                 }
             }
@@ -843,7 +902,7 @@ impl DuplicateFinder {
         self.minimal_prehash_cache_file_size = minimal_prehash_cache_file_size;
     }
 
-    pub const fn get_files_sorted_by_names(&self) -> &BTreeMap<String, Vec<FileEntry>> {
+    pub const fn get_files_sorted_by_names(&self) -> &BTreeMap<String, Vec<DuplicateEntry>> {
         &self.files_with_identical_names
     }
 
@@ -851,15 +910,15 @@ impl DuplicateFinder {
         self.use_prehash_cache = use_prehash_cache;
     }
 
-    pub const fn get_files_sorted_by_size(&self) -> &BTreeMap<u64, Vec<FileEntry>> {
+    pub const fn get_files_sorted_by_size(&self) -> &BTreeMap<u64, Vec<DuplicateEntry>> {
         &self.files_with_identical_size
     }
 
-    pub const fn get_files_sorted_by_size_name(&self) -> &BTreeMap<(u64, String), Vec<FileEntry>> {
+    pub const fn get_files_sorted_by_size_name(&self) -> &BTreeMap<(u64, String), Vec<DuplicateEntry>> {
         &self.files_with_identical_size_names
     }
 
-    pub const fn get_files_sorted_by_hash(&self) -> &BTreeMap<u64, Vec<Vec<FileEntry>>> {
+    pub const fn get_files_sorted_by_hash(&self) -> &BTreeMap<u64, Vec<Vec<DuplicateEntry>>> {
         &self.files_with_identical_hashes
     }
 
@@ -887,19 +946,19 @@ impl DuplicateFinder {
         self.common_data.use_reference_folders
     }
 
-    pub fn get_files_with_identical_hashes_referenced(&self) -> &BTreeMap<u64, Vec<(FileEntry, Vec<FileEntry>)>> {
+    pub fn get_files_with_identical_hashes_referenced(&self) -> &BTreeMap<u64, Vec<(DuplicateEntry, Vec<DuplicateEntry>)>> {
         &self.files_with_identical_hashes_referenced
     }
 
-    pub fn get_files_with_identical_name_referenced(&self) -> &BTreeMap<String, (FileEntry, Vec<FileEntry>)> {
+    pub fn get_files_with_identical_name_referenced(&self) -> &BTreeMap<String, (DuplicateEntry, Vec<DuplicateEntry>)> {
         &self.files_with_identical_names_referenced
     }
 
-    pub fn get_files_with_identical_size_referenced(&self) -> &BTreeMap<u64, (FileEntry, Vec<FileEntry>)> {
+    pub fn get_files_with_identical_size_referenced(&self) -> &BTreeMap<u64, (DuplicateEntry, Vec<DuplicateEntry>)> {
         &self.files_with_identical_size_referenced
     }
 
-    pub fn get_files_with_identical_size_names_referenced(&self) -> &BTreeMap<(u64, String), (FileEntry, Vec<FileEntry>)> {
+    pub fn get_files_with_identical_size_names_referenced(&self) -> &BTreeMap<(u64, String), (DuplicateEntry, Vec<DuplicateEntry>)> {
         &self.files_with_identical_size_names_referenced
     }
 }
@@ -1195,7 +1254,7 @@ pub trait MyHasher {
     fn finalize(&self) -> String;
 }
 
-fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashType, limit: u64) -> Result<String, String> {
+fn hash_calculation(buffer: &mut [u8], file_entry: &DuplicateEntry, hash_type: &HashType, limit: u64) -> Result<String, String> {
     let mut file_handler = match File::open(&file_entry.path) {
         Ok(t) => t,
         Err(e) => return Err(format!("Unable to check hash of file {:?}, reason {e}", file_entry.path)),
@@ -1355,7 +1414,7 @@ mod tests {
         let src = dir.path().join("a");
         let mut file = File::create(&src)?;
         file.write_all(b"aa")?;
-        let e = FileEntry { path: src, ..Default::default() };
+        let e = DuplicateEntry { path: src, ..Default::default() };
         let r = hash_calculation(&mut buf, &e, &HashType::Blake3, 0).unwrap();
         assert!(!r.is_empty());
         Ok(())
@@ -1368,7 +1427,7 @@ mod tests {
         let src = dir.path().join("a");
         let mut file = File::create(&src)?;
         file.write_all(b"aa")?;
-        let e = FileEntry { path: src, ..Default::default() };
+        let e = DuplicateEntry { path: src, ..Default::default() };
         let r1 = hash_calculation(&mut buf, &e, &HashType::Blake3, 1).unwrap();
         let r2 = hash_calculation(&mut buf, &e, &HashType::Blake3, 2).unwrap();
         let r3 = hash_calculation(&mut buf, &e, &HashType::Blake3, u64::MAX).unwrap();
@@ -1382,7 +1441,7 @@ mod tests {
         let dir = tempfile::Builder::new().tempdir()?;
         let mut buf = [0u8; 1 << 10];
         let src = dir.path().join("a");
-        let e = FileEntry { path: src, ..Default::default() };
+        let e = DuplicateEntry { path: src, ..Default::default() };
         let r = hash_calculation(&mut buf, &e, &HashType::Blake3, 0).unwrap_err();
         assert!(!r.is_empty());
         Ok(())
diff --git a/czkawka_gui/src/compute_results.rs b/czkawka_gui/src/compute_results.rs
index dbba231..70851d5 100644
--- a/czkawka_gui/src/compute_results.rs
+++ b/czkawka_gui/src/compute_results.rs
@@ -13,7 +13,7 @@ use czkawka_core::bad_extensions::BadExtensions;
 use czkawka_core::big_file::BigFile;
 use czkawka_core::broken_files::BrokenFiles;
 use czkawka_core::common::{split_path, split_path_compare};
-use czkawka_core::common_dir_traversal::{CheckingMethod, FileEntry};
+use czkawka_core::common_dir_traversal::CheckingMethod;
 use czkawka_core::common_tool::CommonData;
 use czkawka_core::common_traits::ResultEntry;
 use czkawka_core::duplicate::DuplicateFinder;
@@ -23,8 +23,7 @@
 use czkawka_core::invalid_symlinks::InvalidSymlinks;
 use czkawka_core::localizer_core::generate_translation_hashmap;
 use czkawka_core::same_music::{MusicSimilarity, SameMusic};
 use czkawka_core::similar_images;
-use czkawka_core::similar_images::ImagesEntry;
-use czkawka_core::similar_images::SimilarImages;
+use czkawka_core::similar_images::{ImagesEntry, SimilarImages};
 use czkawka_core::similar_videos::SimilarVideos;
 use czkawka_core::temporary::Temporary;
@@ -1327,13 +1326,16 @@ fn computer_duplicate_finder(
     }
 }
 
-fn vector_sort_unstable_entry_by_path(vector: &[FileEntry]) -> Vec<FileEntry> {
+fn vector_sort_unstable_entry_by_path<T>(vector: &[T]) -> Vec<T>
+where
+    T: ResultEntry + Clone,
+{
     if vector.len() >= 2 {
-        let mut vector = vector.to_owned();
-        vector.sort_unstable_by(|a, b| split_path_compare(a.path.as_path(), b.path.as_path()));
+        let mut vector = vector.to_vec();
+        vector.sort_unstable_by(|a, b| split_path_compare(a.get_path(), b.get_path()));
         vector
     } else {
-        vector.to_owned()
+        vector.to_vec()
     }
 }
diff --git a/krokiet/src/connect_scan.rs b/krokiet/src/connect_scan.rs
index ccae959..b3620a9 100644
--- a/krokiet/src/connect_scan.rs
+++ b/krokiet/src/connect_scan.rs
@@ -7,8 +7,7 @@ use czkawka_core::common_dir_traversal::{FileEntry, ProgressData};
 use czkawka_core::common_tool::CommonData;
 use czkawka_core::common_traits::ResultEntry;
 use czkawka_core::empty_files::EmptyFiles;
-use czkawka_core::empty_folder::EmptyFolder;
-use czkawka_core::empty_folder::FolderEntry;
+use czkawka_core::empty_folder::{EmptyFolder, FolderEntry};
 use czkawka_core::similar_images;
 use czkawka_core::similar_images::SimilarImages;
 use humansize::{format_size, BINARY};
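
Note on the pattern: the core move in this patch is giving the duplicate finder its own `DuplicateEntry` type (the only entry that needs a `hash` field) while keeping every per-tool entry behind the shared `ResultEntry` trait, so helpers like `vector_sort_unstable_entry_by_path` can be generic instead of hard-coding `FileEntry`. The sketch below is a minimal, self-contained illustration of that approach, not czkawka's actual code: the trait is trimmed to a single method, `sort_by_path` is a hypothetical stand-in for the generic helper, and plain `Path` ordering replaces czkawka's `split_path_compare`.

```rust
use std::path::{Path, PathBuf};

// Simplified stand-in for czkawka's ResultEntry trait.
trait ResultEntry {
    fn get_path(&self) -> &Path;
}

// Tool-specific entry: only the duplicate finder pays for a hash field.
#[derive(Clone, Default)]
struct DuplicateEntry {
    path: PathBuf,
    #[allow(dead_code)] // filled in during the prehash/full-hash stages
    hash: String,
}

impl ResultEntry for DuplicateEntry {
    fn get_path(&self) -> &Path {
        &self.path
    }
}

// Generic helper in the spirit of vector_sort_unstable_entry_by_path:
// any entry type implementing ResultEntry can be sorted by its path.
fn sort_by_path<T: ResultEntry + Clone>(entries: &[T]) -> Vec<T> {
    let mut entries = entries.to_vec();
    entries.sort_unstable_by(|a, b| a.get_path().cmp(b.get_path()));
    entries
}

fn main() {
    let files = vec![
        DuplicateEntry { path: PathBuf::from("/b.txt"), ..Default::default() },
        DuplicateEntry { path: PathBuf::from("/a.txt"), ..Default::default() },
    ];
    let sorted = sort_by_path(&files);
    assert_eq!(sorted[0].get_path(), Path::new("/a.txt"));
}
```

The payoff mirrors the diff: dropping `hash` from the shared `FileEntry` stops every other tool from carrying a field only duplicate finding uses, while the trait bound keeps GUI code working unchanged across all per-tool entry types.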