From e50d9306837aca4412a655810425c57212d108fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com>
Date: Fri, 13 Oct 2023 08:33:35 +0200
Subject: [PATCH] Fix problem with improper loading of cached results in duplicate mode (#1086)

* Fix problem with improper loading of cached results in duplicate mode

* Dbg
---
 Changelog.md                       | 12 +++++++++--
 README.md                          |  2 +-
 czkawka_core/src/duplicate.rs      | 34 ++++++++++++++++++------------
 czkawka_core/src/similar_images.rs |  3 +--
 4 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index b883061..7ebc7de 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,9 +1,17 @@
 ## Version 6.1.0 - ?
-- BREAKING CHANGE - Changed cache saving method, deduplicated, optimized and simplified procedure - [#1072](https://github.com/qarmin/czkawka/pull/1072)
+- BREAKING CHANGE - Changed cache saving method, deduplicated, optimized and simplified procedure (all files need to be hashed again) - [#1072](https://github.com/qarmin/czkawka/pull/1072)
 - Remove up to 170ms of delay after ending scan - [#1070](https://github.com/qarmin/czkawka/pull/1070)
 - Added logger with useful info when debugging app (level can be adjusted via e.g. `RUST_LOG=debug` env) - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070)
-- Core code cleanup - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070)
+- Core code cleanup - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070), [#1082](https://github.com/qarmin/czkawka/pull/1082)
 - Updated list of bad extensions and support for finding invalid jar files - [#1070](https://github.com/qarmin/czkawka/pull/1070)
+- More default excluded items on Windows (like pagefile) - [#1074](https://github.com/qarmin/czkawka/pull/1074)
+- Unified printing/saving method to files/terminal and fixed some differences/bugs - [#1082](https://github.com/qarmin/czkawka/pull/1082)
+- Uses fun_time library to print how long functions take - [#1082](https://github.com/qarmin/czkawka/pull/1082)
+- Added exporting results into json file format - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Added new test/regression suite for CI - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Added ability to use relative paths - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Fixed stability problem that could remove an invalid file in CLI - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Fixed problem with invalid cache loading - [#0000]
 - Fix Windows gui crashes by using gtk 4.6 instead 4.8 or 4.10 - [#992](https://github.com/qarmin/czkawka/pull/992)
 - Fixed printing info about duplicated music files - [#1016](https://github.com/qarmin/czkawka/pull/1016)
 - Fixed printing info about duplicated video files - [#1017](https://github.com/qarmin/czkawka/pull/1017)
diff --git a/README.md b/README.md
index 8e247fe..71080b9 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
  - Temporary Files - Finds temporary files
  - Similar Images - Finds images which are not exactly the same (different resolution, watermarks)
  - Similar Videos - Looks for visually similar videos
- - Same Music - Searches for music with the same artist, album etc.
+ - Same Music - Searches for similar music by tags or by reading content and comparing it
  - Invalid Symbolic Links - Shows symbolic links which point to non-existent files/directories
  - Broken Files - Finds files that are invalid or corrupted
  - Bad Extensions - Lists files whose content not match with their extension
diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index cf14bd7..96a34f4 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::collections::{BTreeMap, HashSet};
 use std::fmt::Debug;
 use std::fs::File;
@@ -424,11 +425,14 @@ impl DuplicateFinder {
         debug!("prehash_load_cache_at_start - started diff between loaded and prechecked files");
         for (size, mut vec_file_entry) in mem::take(&mut self.files_with_identical_size) {
             if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
-                // TODO maybe hashset is not needed when using < 4 elements
-                let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
+                // TODO maybe hashmap is not needed when using < 4 elements
+                let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                for file_entry in cached_vec_file_entry {
+                    cached_path_entries.insert(&file_entry.path, file_entry.clone());
+                }
                 for file_entry in vec_file_entry {
-                    if cached_path_entries.contains(&file_entry.path) {
-                        records_already_cached.entry(size).or_default().push(file_entry);
+                    if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+                        records_already_cached.entry(size).or_default().push(cached_file_entry);
                     } else {
                         non_cached_files_to_check.entry(size).or_default().push(file_entry);
                     }
@@ -508,7 +512,7 @@ impl DuplicateFinder {
         debug!("Starting calculating prehash");
         #[allow(clippy::type_complexity)]
         let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
-            .par_iter()
+            .into_par_iter()
             .map(|(size, vec_file_entry)| {
                 let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
                 let mut errors: Vec<String> = Vec::new();
@@ -519,15 +523,16 @@ impl DuplicateFinder {
                     check_was_stopped.store(true, Ordering::Relaxed);
                     return None;
                 }
-                for file_entry in vec_file_entry {
-                    match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
+                for mut file_entry in vec_file_entry {
+                    match hash_calculation(&mut buffer, &file_entry, &check_type, 0) {
                         Ok(hash_string) => {
-                            hashmap_with_hash.entry(hash_string.clone()).or_default().push(file_entry.clone());
+                            file_entry.hash = hash_string.clone();
+                            hashmap_with_hash.entry(hash_string.clone()).or_default().push(file_entry);
                         }
                         Err(s) => errors.push(s),
                     }
                 }
-                Some((*size, hashmap_with_hash, errors))
+                Some((size, hashmap_with_hash, errors))
             })
             .while_some()
             .collect();
@@ -581,11 +586,14 @@ impl DuplicateFinder {
         debug!("full_hashing_load_cache_at_start - started diff between loaded and prechecked files");
         for (size, mut vec_file_entry) in pre_checked_map {
             if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
-                // TODO maybe hashset is not needed when using < 4 elements
-                let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
+                // TODO maybe hashmap is not needed when using < 4 elements
+                let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                for file_entry in cached_vec_file_entry {
+                    cached_path_entries.insert(&file_entry.path, file_entry.clone());
+                }
                 for file_entry in vec_file_entry {
-                    if cached_path_entries.contains(&file_entry.path) {
-                        records_already_cached.entry(size).or_default().push(file_entry);
+                    if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+                        records_already_cached.entry(size).or_default().push(cached_file_entry);
                     } else {
                         non_cached_files_to_check.entry(size).or_default().push(file_entry);
                     }
diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs
index c080172..c797522 100644
--- a/czkawka_core/src/similar_images.rs
+++ b/czkawka_core/src/similar_images.rs
@@ -783,7 +783,7 @@ impl SimilarImages {
         // Validating if group contains duplicated results
         let mut result_hashset: HashSet<String> = Default::default();
         let mut found = false;
-        // dbg!(collected_similar_images.len());
+
         for vec_file_entry in collected_similar_images.values() {
             if vec_file_entry.is_empty() {
                 println!("Empty group");
@@ -1338,7 +1338,6 @@ mod tests {
 
         similar_images.find_similar_hashes(None, None);
         let res = similar_images.get_similar_images();
-        // dbg!(&res);
         assert!(res.is_empty());
     }
 }
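
The core of the fix above: when diffing freshly scanned files against the cache, the old code kept the scanned FileEntry (whose hash field is still empty) whenever its path was found in the cache, so the hash stored in the cache was effectively discarded. The patch keeps the cached entry instead, taking it out of a path-keyed HashMap. Below is a minimal, standalone sketch of that pattern; the trimmed-down FileEntry and the helper name split_cached_and_non_cached are illustrative only, not czkawka's actual API.

use std::collections::HashMap;
use std::path::{Path, PathBuf};

// Trimmed-down stand-in for czkawka's FileEntry; the real struct has more fields.
#[derive(Clone, Debug)]
struct FileEntry {
    path: PathBuf,
    hash: String, // empty until the file has actually been hashed
}

// Hypothetical helper demonstrating the pattern from the patch: split scanned
// files into "already cached" (reusing the cached entry, hash included) and
// "still needs hashing".
fn split_cached_and_non_cached(scanned: Vec<FileEntry>, cached: &[FileEntry]) -> (Vec<FileEntry>, Vec<FileEntry>) {
    // Index the cache by path so each cached entry can be claimed once.
    let mut cached_by_path: HashMap<&Path, FileEntry> = HashMap::new();
    for entry in cached {
        cached_by_path.insert(&entry.path, entry.clone());
    }

    let mut already_cached = Vec::new();
    let mut to_check = Vec::new();
    for entry in scanned {
        // Keep the *cached* entry, which carries the hash. The pre-fix code
        // pushed the scanned entry here, whose hash field was still empty.
        if let Some(cached_entry) = cached_by_path.remove(entry.path.as_path()) {
            already_cached.push(cached_entry);
        } else {
            to_check.push(entry);
        }
    }
    (already_cached, to_check)
}

fn main() {
    let cached = vec![FileEntry { path: PathBuf::from("/tmp/a.bin"), hash: "abc123".into() }];
    let scanned = vec![
        FileEntry { path: PathBuf::from("/tmp/a.bin"), hash: String::new() },
        FileEntry { path: PathBuf::from("/tmp/b.bin"), hash: String::new() },
    ];

    let (already_cached, to_check) = split_cached_and_non_cached(scanned, &cached);
    assert_eq!(already_cached[0].hash, "abc123"); // hash recovered from the cache
    assert_eq!(to_check.len(), 1); // only b.bin still needs hashing
    println!("cached: {already_cached:?}\nto hash: {to_check:?}");
}

Using remove() rather than a plain lookup also means a cached entry can be claimed at most once, so repeated paths in the scan are deduplicated as a side effect.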