
Fix problem with improper loading of cached results in duplicate mode (#1086)

* Fix problem with improper loading of cached results in duplicate mode

* Dbg
Rafał Mikrut 2023-10-13 08:33:35 +02:00 committed by GitHub
parent 0462324607
commit e50d930683
4 changed files with 33 additions and 18 deletions

Changelog.md

@@ -1,9 +1,17 @@
 ## Version 6.1.0 - ?
-- BREAKING CHANGE - Changed cache saving method, deduplicated, optimized and simplified procedure - [#1072](https://github.com/qarmin/czkawka/pull/1072)
+- BREAKING CHANGE - Changed cache saving method, deduplicated, optimized and simplified procedure (all files need to be hashed again) - [#1072](https://github.com/qarmin/czkawka/pull/1072)
 - Remove up to 170ms of delay after ending scan - [#1070](https://github.com/qarmin/czkawka/pull/1070)
 - Added logger with useful info when debugging app (level can be adjusted via e.g. `RUST_LOG=debug` env) - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070)
-- Core code cleanup - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070)
+- Core code cleanup - [#1072](https://github.com/qarmin/czkawka/pull/1072), [#1070](https://github.com/qarmin/czkawka/pull/1070), [#1082](https://github.com/qarmin/czkawka/pull/1082)
 - Updated list of bad extensions and support for finding invalid jar files - [#1070](https://github.com/qarmin/czkawka/pull/1070)
 - More default excluded items on Windows (like pagefile) - [#1074](https://github.com/qarmin/czkawka/pull/1074)
+- Unified printing/saving method to files/terminal and fixed some differences/bugs - [#1082](https://github.com/qarmin/czkawka/pull/1082)
+- Uses fun_time library to print how long functions take - [#1082](https://github.com/qarmin/czkawka/pull/1082)
+- Added exporting results into JSON file format - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Added new test/regression suite for CI - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Added ability to use relative paths - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Fixed stability problem that could remove a wrong file in CLI - [#1083](https://github.com/qarmin/czkawka/pull/1083)
+- Fixed problem with invalid cache loading - [#0000]
 - Fix Windows GUI crashes by using GTK 4.6 instead of 4.8 or 4.10 - [#992](https://github.com/qarmin/czkawka/pull/992)
 - Fixed printing info about duplicated music files - [#1016](https://github.com/qarmin/czkawka/pull/1016)
 - Fixed printing info about duplicated video files - [#1017](https://github.com/qarmin/czkawka/pull/1017)

README.md

@@ -20,7 +20,7 @@
 - Temporary Files - Finds temporary files
 - Similar Images - Finds images which are not exactly the same (different resolution, watermarks)
 - Similar Videos - Looks for visually similar videos
-- Same Music - Searches for music with the same artist, album etc.
+- Same Music - Searches for similar music by tags or by reading content and comparing it
 - Invalid Symbolic Links - Shows symbolic links which point to non-existent files/directories
 - Broken Files - Finds files that are invalid or corrupted
 - Bad Extensions - Lists files whose content does not match their extension

czkawka_core/src/duplicate.rs

@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::collections::{BTreeMap, HashSet};
 use std::fmt::Debug;
 use std::fs::File;
@@ -424,11 +425,14 @@ impl DuplicateFinder {
         debug!("prehash_load_cache_at_start - started diff between loaded and prechecked files");
         for (size, mut vec_file_entry) in mem::take(&mut self.files_with_identical_size) {
             if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
-                // TODO maybe hashset is not needed when using < 4 elements
-                let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
+                // TODO maybe hashmap is not needed when using < 4 elements
+                let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                for file_entry in cached_vec_file_entry {
+                    cached_path_entries.insert(&file_entry.path, file_entry.clone());
+                }
                 for file_entry in vec_file_entry {
-                    if cached_path_entries.contains(&file_entry.path) {
-                        records_already_cached.entry(size).or_default().push(file_entry);
+                    if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+                        records_already_cached.entry(size).or_default().push(cached_file_entry);
                     } else {
                         non_cached_files_to_check.entry(size).or_default().push(file_entry);
                     }
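
The heart of the fix, applied identically in `prehash_load_cache_at_start` above and in `full_hashing_load_cache_at_start` below: the old code only tested path membership in a `HashSet` and then kept the freshly scanned `file_entry`, whose `hash` field was still empty, so the hash already stored in the cache was silently discarded. Building a `HashMap<&Path, FileEntry>` over the cached entries and calling `remove` on a hit hands back the cached entry together with its precomputed hash. A minimal, self-contained sketch of that pattern (simplified `FileEntry` and a hypothetical `diff_against_cache` helper, not czkawka's actual types):

```rust
use std::collections::HashMap;
use std::path::{Path, PathBuf};

#[derive(Clone, Debug)]
struct FileEntry {
    path: PathBuf,
    size: u64,
    hash: String, // empty until computed, or filled when loaded from cache
}

/// Splits freshly scanned entries into "already cached" and "still to hash".
/// The point of the fix: on a cache hit, keep the *cached* entry (it carries
/// the precomputed hash), not the scanned one whose `hash` is still empty.
fn diff_against_cache(scanned: Vec<FileEntry>, cached: &[FileEntry]) -> (Vec<FileEntry>, Vec<FileEntry>) {
    // Index the cache by path; `remove` on a hit hands back ownership.
    let mut by_path: HashMap<&Path, FileEntry> = cached.iter().map(|e| (e.path.as_path(), e.clone())).collect();

    let (mut already_cached, mut to_check) = (Vec::new(), Vec::new());
    for entry in scanned {
        match by_path.remove(entry.path.as_path()) {
            Some(cached_entry) => already_cached.push(cached_entry),
            None => to_check.push(entry),
        }
    }
    (already_cached, to_check)
}

fn main() {
    let cached = vec![FileEntry { path: PathBuf::from("/tmp/a.txt"), size: 10, hash: "deadbeef".into() }];
    let scanned = vec![
        FileEntry { path: PathBuf::from("/tmp/a.txt"), size: 10, hash: String::new() },
        FileEntry { path: PathBuf::from("/tmp/b.txt"), size: 10, hash: String::new() },
    ];
    let (hits, misses) = diff_against_cache(scanned, &cached);
    assert_eq!(hits[0].hash, "deadbeef"); // the cached hash survives
    assert_eq!(misses[0].path, PathBuf::from("/tmp/b.txt"));
}
```

Using `remove` instead of a `get` or `contains` check also moves the cached entry out without an extra clone and guarantees each cache record is matched at most once.
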
@@ -508,7 +512,7 @@ impl DuplicateFinder {
         debug!("Starting calculating prehash");
         #[allow(clippy::type_complexity)]
         let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
-            .par_iter()
+            .into_par_iter()
             .map(|(size, vec_file_entry)| {
                 let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
                 let mut errors: Vec<String> = Vec::new();
@@ -519,15 +523,16 @@ impl DuplicateFinder {
                     check_was_stopped.store(true, Ordering::Relaxed);
                     return None;
                 }
-                for file_entry in vec_file_entry {
-                    match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
+                for mut file_entry in vec_file_entry {
+                    match hash_calculation(&mut buffer, &file_entry, &check_type, 0) {
                         Ok(hash_string) => {
-                            hashmap_with_hash.entry(hash_string.clone()).or_default().push(file_entry.clone());
+                            file_entry.hash = hash_string.clone();
+                            hashmap_with_hash.entry(hash_string.clone()).or_default().push(file_entry);
                         }
                         Err(s) => errors.push(s),
                     }
                 }
-                Some((*size, hashmap_with_hash, errors))
+                Some((size, hashmap_with_hash, errors))
             })
             .while_some()
             .collect();
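
The `.par_iter()` to `.into_par_iter()` switch above is what enables the other half of the fix: iterating by value yields owned `(size, vec_file_entry)` pairs, so each `file_entry` can be bound as `mut`, receive the computed hash in its `hash` field, and be moved into the result map without the old `.clone()` (and `size` is owned too, hence `Some((size, ...))` in place of `Some((*size, ...))`). Without that write-back, entries carried empty hashes into the cache saved after the scan. A runnable sketch of the same rayon pattern, assuming the rayon crate and using a dummy `hash_calculation` stand-in rather than czkawka's real one:

```rust
use std::collections::BTreeMap;
use std::sync::atomic::{AtomicBool, Ordering};

use rayon::prelude::*;

#[derive(Debug)]
struct FileEntry {
    name: String,
    hash: String,
}

// Hypothetical stand-in for czkawka's `hash_calculation`.
fn hash_calculation(entry: &FileEntry) -> Result<String, String> {
    Ok(format!("hash-of-{}", entry.name))
}

fn main() {
    let stop_flag = AtomicBool::new(false);

    let mut groups: BTreeMap<u64, Vec<FileEntry>> = BTreeMap::new();
    groups.insert(8, vec![
        FileEntry { name: "a".into(), hash: String::new() },
        FileEntry { name: "b".into(), hash: String::new() },
    ]);

    // `into_par_iter` yields owned (u64, Vec<FileEntry>) pairs, so entries can be
    // mutated (the hash written back) and moved into the result without cloning.
    // Returning `None` once a stop is requested lets `while_some()` cut the
    // pipeline short, mirroring the cancellation pattern in the diff above.
    let results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>)> = groups
        .into_par_iter()
        .map(|(size, entries)| {
            if stop_flag.load(Ordering::Relaxed) {
                return None;
            }
            let mut by_hash: BTreeMap<String, Vec<FileEntry>> = BTreeMap::new();
            for mut entry in entries {
                if let Ok(hash) = hash_calculation(&entry) {
                    entry.hash = hash.clone(); // write-back: the entry keeps its hash
                    by_hash.entry(hash).or_default().push(entry);
                }
            }
            Some((size, by_hash))
        })
        .while_some()
        .collect();

    assert_eq!(results[0].1["hash-of-a"][0].hash, "hash-of-a");
}
```
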
@@ -581,11 +586,14 @@ impl DuplicateFinder {
         debug!("full_hashing_load_cache_at_start - started diff between loaded and prechecked files");
         for (size, mut vec_file_entry) in pre_checked_map {
             if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
-                // TODO maybe hashset is not needed when using < 4 elements
-                let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
+                // TODO maybe hashmap is not needed when using < 4 elements
+                let mut cached_path_entries: HashMap<&Path, FileEntry> = HashMap::new();
+                for file_entry in cached_vec_file_entry {
+                    cached_path_entries.insert(&file_entry.path, file_entry.clone());
+                }
                 for file_entry in vec_file_entry {
-                    if cached_path_entries.contains(&file_entry.path) {
-                        records_already_cached.entry(size).or_default().push(file_entry);
+                    if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+                        records_already_cached.entry(size).or_default().push(cached_file_entry);
                     } else {
                         non_cached_files_to_check.entry(size).or_default().push(file_entry);
                     }

czkawka_core/src/similar_images.rs

@@ -783,7 +783,7 @@
         // Validating if group contains duplicated results
         let mut result_hashset: HashSet<String> = Default::default();
         let mut found = false;
-        // dbg!(collected_similar_images.len());
         for vec_file_entry in collected_similar_images.values() {
             if vec_file_entry.is_empty() {
                 println!("Empty group");
@@ -1338,7 +1338,6 @@ mod tests {
         similar_images.find_similar_hashes(None, None);
         let res = similar_images.get_similar_images();
-        // dbg!(&res);
         assert!(res.is_empty());
     }
 }