Add cache for duplicate (#205)

This commit is contained in:
Rafał Mikrut 2021-01-14 12:17:15 +01:00 committed by GitHub
parent 6e89bcb507
commit 7ec665ab7a
4 changed files with 311 additions and 74 deletions

View File

@ -6,13 +6,11 @@
- Written in memory-safe Rust
- Amazingly fast - due to more or less advanced algorithms and multithreading support
- Free, Open Source without ads
- Multiplatform - works on Linux, Windows and macOS
- Cache support - second and subsequent scans should be a lot faster than the first
- CLI frontend - makes it fast to automate tasks
- GUI GTK frontend - uses modern GTK 3 and looks similar to FSlint
- Light/Dark theme matches the appearance of the system (Linux only)
- Saving results to a file - allows reading entries found by the tool easily
- GUI frontend - uses modern GTK 3 and looks similar to FSlint
- Rich search options - allow setting absolute included and excluded directories, a set of allowed file extensions, or excluded items with a * wildcard
- Image previews for a quick look at the compared photos
- Multiple tools to use:
- Duplicates - Finds duplicates based on file name, size, hash, or hash of the first 1 MB
- Empty Folders - Finds empty folders with the help of an advanced algorithm
@ -228,6 +226,7 @@ So still is a big room for improvements.
| Non stripped binaries | | X | |
| Redundant whitespace | | X | |
| Multiple languages (po) | | X | X |
| Cache support | X | | X |
| Project Activity | High | Very Low | High |
## Contributions

View File

@ -1,9 +1,9 @@
use crossbeam_channel::Receiver;
use humansize::{file_size_opts as options, FileSize};
use std::collections::{BTreeMap, HashMap};
use std::fs::{File, Metadata};
use std::fs::{File, Metadata, OpenOptions};
use std::io::prelude::*;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use std::{fs, thread};
@ -13,14 +13,17 @@ use crate::common_extensions::Extensions;
use crate::common_items::ExcludedItems;
use crate::common_messages::Messages;
use crate::common_traits::*;
use directories_next::ProjectDirs;
use rayon::prelude::*;
use std::io::BufWriter;
use std::io::{BufReader, BufWriter};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread::sleep;
const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // 1MB
const CACHE_FILE_NAME: &str = "cache_duplicates.txt";
#[derive(Debug)]
pub struct ProgressData {
pub checking_method: CheckingMethod,
@ -39,7 +42,7 @@ pub enum CheckingMethod {
HashMB,
}
#[derive(PartialEq, Eq, Clone, Debug)]
#[derive(PartialEq, Eq, Clone, Debug, Copy)]
pub enum HashType {
Blake3,
}
@ -58,6 +61,7 @@ pub struct FileEntry {
pub path: PathBuf,
pub size: u64,
pub modified_date: u64,
pub hash: String,
}
/// Info struct with helpful information about the results
@ -349,6 +353,7 @@ impl DuplicateFinder {
continue 'dir;
} // Permissions Denied
},
hash: "".to_string(),
};
// Adding files to BTreeMap
@ -520,6 +525,7 @@ impl DuplicateFinder {
continue 'dir;
} // Permissions Denied
},
hash: "".to_string(),
};
// Adding files to BTreeMap
@ -631,8 +637,8 @@ impl DuplicateFinder {
hasher.update(&buffer[..n]);
let hash_string: String = hasher.finalize().to_hex().to_string();
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone());
}
Some((*size, hashmap_with_hash, errors, bytes_read))
})
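The hashing loops in this file read each candidate in fixed-size chunks with blake3 and, for the HashMB method, stop after the first megabyte (HASH_MB_LIMIT_BYTES). A standalone sketch of that partial-hash step, assuming the blake3 crate as a dependency; the helper name partial_hash is illustrative, not part of the codebase:

```rust
use std::fs::File;
use std::io::Read;
use std::path::Path;

const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // same 1 MB limit as in duplicate.rs

// Hash at most the first 1 MB of a file with blake3.
fn partial_hash(path: &Path) -> std::io::Result<String> {
    let mut file = File::open(path)?;
    let mut hasher = blake3::Hasher::new();
    let mut buffer = [0u8; 1024 * 128];
    let mut read_so_far: u64 = 0;
    loop {
        let n = file.read(&mut buffer)?;
        if n == 0 {
            break; // EOF reached before the limit
        }
        read_so_far += n as u64;
        hasher.update(&buffer[..n]);
        if read_so_far >= HASH_MB_LIMIT_BYTES {
            break; // stop after roughly 1 MB
        }
    }
    Ok(hasher.finalize().to_hex().to_string())
}
```

With the HashMB method this prefix hash is used as the final comparison, trading accuracy beyond the first megabyte for speed.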
@ -700,60 +706,191 @@ impl DuplicateFinder {
//// PROGRESS THREAD END
#[allow(clippy::type_complexity)]
let full_hash_results: Vec<(u64, HashMap<String, Vec<FileEntry>>, Vec<String>, u64)> = pre_checked_map
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut file_handler: File;
let mut bytes_read: u64 = 0;
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
'fe: for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(_) => {
errors.push(format!("Unable to check hash of file {}", file_entry.path.display()));
continue 'fe;
}
};
let mut full_hash_results: Vec<(u64, HashMap<String, Vec<FileEntry>>, Vec<String>, u64)>;
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 1024 * 32];
let mut current_file_read_bytes: u64 = 0;
loop {
let n = match file_handler.read(&mut buffer) {
Ok(t) => t,
Err(_) => {
errors.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
continue 'fe;
match self.check_method {
CheckingMethod::HashMB => {
full_hash_results = pre_checked_map
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut file_handler: File;
let mut bytes_read: u64 = 0;
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
'fe: for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
};
if n == 0 {
break;
file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(_) => {
errors.push(format!("Unable to check hash of file {}", file_entry.path.display()));
continue 'fe;
}
};
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 1024 * 128];
let mut current_file_read_bytes: u64 = 0;
loop {
let n = match file_handler.read(&mut buffer) {
Ok(t) => t,
Err(_) => {
errors.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
continue 'fe;
}
};
if n == 0 {
break;
}
current_file_read_bytes += n as u64;
bytes_read += n as u64;
hasher.update(&buffer[..n]);
if current_file_read_bytes >= HASH_MB_LIMIT_BYTES {
break;
}
}
let hash_string: String = hasher.finalize().to_hex().to_string();
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
}
Some((*size, hashmap_with_hash, errors, bytes_read))
})
.while_some()
.collect();
}
CheckingMethod::Hash => {
let loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, &self.hash_type) {
Some(t) => t,
None => Default::default(),
};
current_file_read_bytes += n as u64;
bytes_read += n as u64;
hasher.update(&buffer[..n]);
let mut records_already_cached: HashMap<u64, Vec<FileEntry>> = Default::default();
let mut non_cached_files_to_check: HashMap<u64, Vec<FileEntry>> = Default::default();
for (size, vec_file_entry) in pre_checked_map {
#[allow(clippy::collapsible_if)]
if !loaded_hash_map.contains_key(&size) {
// If the loaded data doesn't contain info for this size group
non_cached_files_to_check.insert(size, vec_file_entry);
} else {
let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap();
if self.check_method == CheckingMethod::HashMB && current_file_read_bytes >= HASH_MB_LIMIT_BYTES {
break;
for file_entry in vec_file_entry {
let mut found: bool = false;
for loaded_file_entry in loaded_vec_file_entry {
if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date {
records_already_cached.entry(file_entry.size).or_insert_with(Vec::new);
records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone());
found = true;
break;
}
}
if !found {
non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new);
non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry);
}
}
}
let hash_string: String = hasher.finalize().to_hex().to_string();
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
}
Some((*size, hashmap_with_hash, errors, bytes_read))
})
.while_some()
.collect();
full_hash_results = non_cached_files_to_check
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut file_handler: File;
let mut bytes_read: u64 = 0;
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
'fe: for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(_) => {
errors.push(format!("Unable to check hash of file {}", file_entry.path.display()));
continue 'fe;
}
};
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 1024 * 128];
loop {
let n = match file_handler.read(&mut buffer) {
Ok(t) => t,
Err(_) => {
errors.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
continue 'fe;
}
};
if n == 0 {
break;
}
bytes_read += n as u64;
hasher.update(&buffer[..n]);
}
let hash_string: String = hasher.finalize().to_hex().to_string();
let mut file_entry = file_entry.clone();
file_entry.hash = hash_string.clone();
hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry);
}
Some((*size, hashmap_with_hash, errors, bytes_read))
})
.while_some()
.collect();
// Size, Vec
'main: for (size, vec_file_entry) in records_already_cached {
// Check if this size already exists in full_hash_results; if so, merge the cached entries into the existing hashmap (we cannot hold a mutable and an immutable reference to full_hash_results at once)
for (full_size, full_hashmap, _errors, _bytes_read) in &mut full_hash_results {
if size == *full_size {
for file_entry in vec_file_entry {
full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
}
continue 'main;
}
}
// Size doesn't exist yet, so add the cached results as a new entry
let mut temp_hashmap: HashMap<String, Vec<FileEntry>> = Default::default();
for file_entry in vec_file_entry {
temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
}
full_hash_results.push((size, temp_hashmap, Vec::new(), 0));
}
// Save all results to the cache file: entries previously loaded from it plus all newly computed ones
let mut all_results: HashMap<String, FileEntry> = Default::default();
for (_size, vec_file_entry) in loaded_hash_map {
for file_entry in vec_file_entry {
all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
}
}
for (_size, hashmap, _errors, _bytes_read) in &full_hash_results {
for vec_file_entry in hashmap.values() {
for file_entry in vec_file_entry {
all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
}
save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type);
}
_ => panic!("What"),
}
// End thread which sends info to the GUI
progress_thread_run.store(false, Ordering::Relaxed);
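The new CheckingMethod::Hash branch first loads the cache, then splits the size-grouped candidates into records_already_cached (path and modification date match a cached entry, so the stored hash is reused) and non_cached_files_to_check (everything else, which still has to be hashed). A simplified sketch of that split, assuming a stripped-down FileEntry; split_by_cache is a hypothetical helper:

```rust
use std::collections::BTreeMap;
use std::path::PathBuf;

// Simplified stand-in for the real FileEntry.
#[derive(Clone, Debug)]
struct FileEntry {
    path: PathBuf,
    size: u64,
    modified_date: u64,
    hash: String,
}

// Split size-grouped candidates into cache hits and files that still need hashing.
fn split_by_cache(
    candidates: BTreeMap<u64, Vec<FileEntry>>,
    cache: &BTreeMap<u64, Vec<FileEntry>>,
) -> (BTreeMap<u64, Vec<FileEntry>>, BTreeMap<u64, Vec<FileEntry>>) {
    let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
    let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
    for (size, entries) in candidates {
        for entry in entries {
            // A cache hit requires the same path and the same modification date.
            let hit = cache.get(&size).and_then(|cached| {
                cached
                    .iter()
                    .find(|c| c.path == entry.path && c.modified_date == entry.modified_date)
            });
            match hit {
                // Reuse the cached entry, which already carries its full hash.
                Some(c) => records_already_cached.entry(size).or_default().push(c.clone()),
                // Not cached (or changed since): hash it from scratch.
                None => non_cached_files_to_check.entry(size).or_default().push(entry),
            }
        }
    }
    (records_already_cached, non_cached_files_to_check)
}
```

After hashing only the non-cached files, the cached entries are merged back into full_hash_results, and the union of old and new entries is written back to the cache file.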
@ -1169,3 +1306,104 @@ fn delete_files(vector: &[FileEntry], delete_method: &DeleteMethod, warnings: &m
};
(gained_space, removed_files, failed_to_remove_files)
}
fn save_hashes_to_file(hashmap: &HashMap<String, FileEntry>, text_messages: &mut Messages, type_of_hash: &HashType) {
println!("Trying to save {} files", hashmap.len());
if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
let cache_dir = PathBuf::from(proj_dirs.cache_dir());
if cache_dir.exists() {
if !cache_dir.is_dir() {
text_messages.messages.push(format!("Config dir {} is a file!", cache_dir.display()));
return;
}
} else if fs::create_dir_all(&cache_dir).is_err() {
text_messages.messages.push(format!("Cannot create config dir {}", cache_dir.display()));
return;
}
let cache_file = cache_dir.join(CACHE_FILE_NAME.replace(".", format!("_{:?}.", type_of_hash).as_str()));
let file_handler = match OpenOptions::new().truncate(true).write(true).create(true).open(&cache_file) {
Ok(t) => t,
Err(_) => {
text_messages.messages.push(format!("Cannot create or open cache file {}", cache_file.display()));
return;
}
};
let mut writer = BufWriter::new(file_handler);
for file_entry in hashmap.values() {
// Only cache files bigger than 5 MB
if file_entry.size > 5 * 1024 * 1024 {
let string: String = format!("{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.modified_date, file_entry.hash);
if writeln!(writer, "{}", string).is_err() {
text_messages.messages.push(format!("Failed to save some data to cache file {}", cache_file.display()));
return;
};
}
}
}
}
fn load_hashes_from_file(text_messages: &mut Messages, type_of_hash: &HashType) -> Option<BTreeMap<u64, Vec<FileEntry>>> {
if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
let cache_dir = PathBuf::from(proj_dirs.cache_dir());
let cache_file = cache_dir.join(CACHE_FILE_NAME.replace(".", format!("_{:?}.", type_of_hash).as_str()));
let file_handler = match OpenOptions::new().read(true).open(&cache_file) {
Ok(t) => t,
Err(_) => {
// text_messages.messages.push(format!("Cannot find or open cache file {}", cache_file.display())); // This shouldn't be written to the output
return None;
}
};
let reader = BufReader::new(file_handler);
let mut hashmap_loaded_entries: BTreeMap<u64, Vec<FileEntry>> = Default::default();
// Read the file line by line using the lines() iterator from std::io::BufRead.
for (index, line) in reader.lines().enumerate() {
let line = match line {
Ok(t) => t,
Err(_) => {
text_messages.warnings.push(format!("Failed to load line number {} from cache file {}", index + 1, cache_file.display()));
return None;
}
};
let uuu = line.split("//").collect::<Vec<&str>>();
if uuu.len() != 4 {
text_messages
.warnings
.push(format!("Found invalid data (wrong number of fields) in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
continue;
}
// Don't load cache data if the referenced file no longer exists
if Path::new(uuu[0]).exists() {
let file_entry = FileEntry {
path: PathBuf::from(uuu[0]),
size: match uuu[1].parse::<u64>() {
Ok(t) => t,
Err(_) => {
text_messages.warnings.push(format!("Found invalid size value in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
continue;
}
},
modified_date: match uuu[2].parse::<u64>() {
Ok(t) => t,
Err(_) => {
text_messages.warnings.push(format!("Found invalid modified date value in line {} - ({}) in cache file {}", index + 1, line, cache_file.display()));
continue;
}
},
hash: uuu[3].to_string(),
};
hashmap_loaded_entries.entry(file_entry.size).or_insert_with(Vec::new);
hashmap_loaded_entries.get_mut(&file_entry.size).unwrap().push(file_entry);
}
}
return Some(hashmap_loaded_entries);
}
text_messages.messages.push("Cannot find or open system config dir to save cache file".to_string());
None
}
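save_hashes_to_file and load_hashes_from_file use a plain-text format: one file per line, with path, size, modification date and hash separated by `//`, in a file whose name encodes the hash type (cache_duplicates_Blake3.txt). A round-trip sketch of that line format; the parse helper is hypothetical, since the real loader pushes warnings instead of returning Option:

```rust
use std::path::PathBuf;

// Simplified stand-in for the real FileEntry.
#[derive(Clone, Debug, PartialEq)]
struct FileEntry {
    path: PathBuf,
    size: u64,
    modified_date: u64,
    hash: String,
}

// Serialize one entry the way save_hashes_to_file does: fields joined with "//".
fn to_cache_line(e: &FileEntry) -> String {
    format!("{}//{}//{}//{}", e.path.display(), e.size, e.modified_date, e.hash)
}

// Hypothetical inverse of the format read by load_hashes_from_file.
fn from_cache_line(line: &str) -> Option<FileEntry> {
    let parts: Vec<&str> = line.split("//").collect();
    if parts.len() != 4 {
        return None; // wrong number of fields: the real loader warns and skips the line
    }
    Some(FileEntry {
        path: PathBuf::from(parts[0]),
        size: parts[1].parse().ok()?,
        modified_date: parts[2].parse().ok()?,
        hash: parts[3].to_string(),
    })
}

fn main() {
    let entry = FileEntry {
        path: PathBuf::from("/tmp/example.bin"),
        size: 10 * 1024 * 1024, // only entries above 5 MB are written by save_hashes_to_file
        modified_date: 1_610_619_435,
        hash: "deadbeef".to_string(),
    };
    let line = to_cache_line(&entry);
    assert_eq!(from_cache_line(&line), Some(entry));

    // The cache file name encodes the hash type:
    // "cache_duplicates.txt" -> "cache_duplicates_Blake3.txt".
    assert_eq!(
        "cache_duplicates.txt".replace(".", "_Blake3."),
        "cache_duplicates_Blake3.txt"
    );
}
```

Note that a path containing `//` would produce extra fields and make the loader skip that line; the format accepts this trade-off for simplicity.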

View File

@ -326,20 +326,20 @@ impl SimilarImages {
None => Default::default(),
};
let mut hashes_already_counted: HashMap<String, FileEntry> = Default::default();
let mut hashes_to_check: HashMap<String, FileEntry> = Default::default();
let mut records_already_cached: HashMap<String, FileEntry> = Default::default();
let mut non_cached_files_to_check: HashMap<String, FileEntry> = Default::default();
for (name, file_entry) in &self.images_to_check {
#[allow(clippy::collapsible_if)]
if !loaded_hash_map.contains_key(name) {
// If the loaded data doesn't contain info about the current image
hashes_to_check.insert(name.clone(), file_entry.clone());
non_cached_files_to_check.insert(name.clone(), file_entry.clone());
} else {
if file_entry.size != loaded_hash_map.get(name).unwrap().size || file_entry.modified_date != loaded_hash_map.get(name).unwrap().modified_date {
// When the size or modification date of the image changed, it is clearly a different image
hashes_to_check.insert(name.clone(), file_entry.clone());
non_cached_files_to_check.insert(name.clone(), file_entry.clone());
} else {
// Hashing may be skipped when an entry with the same size and modification date already exists
hashes_already_counted.insert(name.clone(), loaded_hash_map.get(name).unwrap().clone());
records_already_cached.insert(name.clone(), loaded_hash_map.get(name).unwrap().clone());
}
}
}
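For similar images the cache is keyed by the path string rather than by size, and an entry is reused only when both size and modification date are unchanged. A condensed sketch of that check with simplified, illustrative types:

```rust
use std::collections::HashMap;

// Minimal stand-in for the image FileEntry: only the fields the cache check needs.
#[derive(Clone)]
struct ImageEntry {
    size: u64,
    modified_date: u64,
    hash: Vec<u8>, // perceptual hash, already computed for cached entries
}

// Partition images into cache hits and files whose hash must be (re)computed.
fn split_images(
    to_check: &HashMap<String, ImageEntry>,
    cache: &HashMap<String, ImageEntry>,
) -> (HashMap<String, ImageEntry>, HashMap<String, ImageEntry>) {
    let mut records_already_cached = HashMap::new();
    let mut non_cached_files_to_check = HashMap::new();
    for (name, entry) in to_check {
        match cache.get(name) {
            // Same path, size and modification date: reuse the cached hash.
            Some(c) if c.size == entry.size && c.modified_date == entry.modified_date => {
                records_already_cached.insert(name.clone(), c.clone());
            }
            // New or changed image: it has to be hashed again.
            _ => {
                non_cached_files_to_check.insert(name.clone(), entry.clone());
            }
        }
    }
    (records_already_cached, non_cached_files_to_check)
}
```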
@ -358,7 +358,7 @@ impl SimilarImages {
let mut progress_send = progress_sender.clone();
let progress_thread_run = progress_thread_run.clone();
let atomic_file_counter = atomic_file_counter.clone();
let images_to_check = hashes_to_check.len();
let images_to_check = non_cached_files_to_check.len();
progress_thread_handle = thread::spawn(move || loop {
progress_send
.try_send(ProgressData {
@ -377,7 +377,7 @@ impl SimilarImages {
progress_thread_handle = thread::spawn(|| {});
}
//// PROGRESS THREAD END
let mut vec_file_entry: Vec<(FileEntry, Node)> = hashes_to_check
let mut vec_file_entry: Vec<(FileEntry, Node)> = non_cached_files_to_check
.par_iter()
.map(|file_entry| {
atomic_file_counter.fetch_add(1, Ordering::Relaxed);
@ -416,7 +416,7 @@ impl SimilarImages {
let hash_map_modification = SystemTime::now();
// Just connect loaded results with already calculated hashes
for (_name, file_entry) in hashes_already_counted {
for (_name, file_entry) in records_already_cached {
vec_file_entry.push((file_entry.clone(), file_entry.hash));
}
@ -457,15 +457,15 @@ impl SimilarImages {
// Maybe also add a progress report here
let mut new_vector: Vec<Vec<FileEntry>> = Vec::new();
let mut hashes_to_check = self.image_hashes.clone();
let mut non_cached_files_to_check = self.image_hashes.clone();
for (hash, vec_file_entry) in &self.image_hashes {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
return false;
}
if !hashes_to_check.contains_key(hash) {
if !non_cached_files_to_check.contains_key(hash) {
continue;
}
hashes_to_check.remove(hash);
non_cached_files_to_check.remove(hash);
let vector_with_found_similar_hashes = self.bktree.find(hash, similarity).collect::<Vec<_>>();
if vector_with_found_similar_hashes.len() == 1 && vec_file_entry.len() == 1 {
@ -493,7 +493,7 @@ impl SimilarImages {
panic!("I'm not sure if same hash can have distance > 0");
}
if let Some(vec_file_entry) = hashes_to_check.get(*similar_hash) {
if let Some(vec_file_entry) = non_cached_files_to_check.get(*similar_hash) {
vector_of_similar_images.append(
&mut (vec_file_entry
.iter()
@ -515,7 +515,7 @@ impl SimilarImages {
})
.collect::<Vec<_>>()),
);
hashes_to_check.remove(*similar_hash);
non_cached_files_to_check.remove(*similar_hash);
}
}
if vector_of_similar_images.len() > 1 {

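The hunks above collect, for every hash, the hashes within the requested similarity distance via a BK-tree lookup, and remove already-grouped hashes from the working map so no file lands in two groups. The sketch below illustrates that grouping idea with a naive pairwise Hamming distance instead of a BK-tree; it is purely illustrative and not the project's data structure:

```rust
// Hamming distance between two equal-length byte hashes: number of differing bits.
fn hamming(a: &[u8], b: &[u8]) -> u32 {
    a.iter().zip(b).map(|(x, y)| (x ^ y).count_ones()).sum()
}

// Group hashes whose distance to a chosen representative is within `similarity`.
// Each hash ends up in exactly one group, mirroring how processed hashes are
// removed from the map in the code above.
fn group_similar(hashes: &[Vec<u8>], similarity: u32) -> Vec<Vec<usize>> {
    let mut remaining: Vec<usize> = (0..hashes.len()).collect();
    let mut groups: Vec<Vec<usize>> = Vec::new();
    while let Some(representative) = remaining.pop() {
        let (close, far): (Vec<usize>, Vec<usize>) = std::mem::take(&mut remaining)
            .into_iter()
            .partition(|&i| hamming(&hashes[i], &hashes[representative]) <= similarity);
        let mut group = vec![representative];
        group.extend(close);
        groups.push(group);
        remaining = far;
    }
    groups
}
```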
View File

@ -137,17 +137,17 @@ Only some image extensions are supported, because I rely on image crate. Also so
## Config/Cache files
For now Czkawka stores only these files on disk:
- `czkawka_gui_config.txt` - stores configuration of GUI which may be loaded at startup
- `cache_similar_image.txt` - stores cache data and hashes which may be used later without needing to compute image hash again - DO NOT TRY TO EDIT THIS FILE MANUALLY! - editing this file may cause app crashes.
- `cache_similar_image.txt` - stores cache data and hashes which may be used later without needing to compute image hash again - editing this file may cause app crashes.
- `cache_broken_files.txt` - stores cache data of broken files
- `cache_duplicates_Blake3.txt` - stores cache data of duplicated files; to avoid too big a performance hit when saving/loading the file, only already fully hashed files bigger than 5 MB are stored. Similar files with `Blake3` replaced by e.g. `SHA256` may appear when support for new hashes is introduced in Czkawka.
First file is located in this path
Config files are located in this path
Linux - `/home/username/.config/czkawka`
Mac - `/Users/username/Library/Application Support/pl.Qarmin.Czkawka`
Windows - `C:\Users\Username\AppData\Roaming\Qarmin\Czkawka\config`
Second with cache here:
Cache should be here:
Linux - `/home/username/.cache/czkawka`
Mac - `/Users/Username/Library/Caches/pl.Qarmin.Czkawka`
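These locations are resolved at runtime with the directories-next crate, using the same identifiers as in duplicate.rs above; a small sketch (the printed paths depend on platform and user):

```rust
use directories_next::ProjectDirs;

fn main() {
    // Same qualifier/organization/application triple as used in the code above.
    if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") {
        // e.g. /home/username/.config/czkawka on Linux
        println!("config dir: {}", proj_dirs.config_dir().display());
        // e.g. /home/username/.cache/czkawka on Linux
        println!("cache dir:  {}", proj_dirs.cache_dir().display());
    } else {
        eprintln!("No valid home directory could be detected");
    }
}
```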