Add prehash cache support (#477)

This commit is contained in:
Rafał Mikrut 2021-12-01 20:09:04 +01:00 committed by GitHub
parent 51271dcdf0
commit 1fd53b854b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 427 additions and 220 deletions

View File

@ -22,7 +22,7 @@ pub enum Commands {
minimal_file_size: u64,
#[structopt(short = "i", long, parse(try_from_str = parse_maximal_file_size), default_value = "18446744073709551615", help = "Maximum size in bytes", long_help = "Maximum size of checked files in bytes, assigning lower value may speed up searching")]
maximal_file_size: u64,
#[structopt(short = "c", long, parse(try_from_str = parse_minimal_file_size), default_value = "524288", help = "Minimum cached file size in bytes", long_help = "Minimum size of cached files in bytes, assigning bigger value may speed up will cause that lower amount of files will be cached, but loading of cache will be faster")]
#[structopt(short = "c", long, parse(try_from_str = parse_minimal_file_size), default_value = "257144", help = "Minimum cached file size in bytes", long_help = "Minimum size of cached files in bytes, assigning bigger value may speed up will cause that lower amount of files will be cached, but loading of cache will be faster")]
minimal_cached_file_size: u64,
#[structopt(flatten)]
allowed_extensions: AllowedExtensions,

View File

@ -117,9 +117,6 @@ pub struct Info {
pub number_of_duplicated_files_by_name: usize,
pub lost_space_by_size: u64,
pub lost_space_by_hash: u64,
pub bytes_read_when_hashing: u64,
pub number_of_removed_files: usize,
pub number_of_failed_to_remove_files: usize,
pub gained_space: u64,
}
@ -149,7 +146,9 @@ pub struct DuplicateFinder {
dryrun: bool,
stopped_search: bool,
use_cache: bool,
use_prehash_cache: bool,
minimal_cache_file_size: u64,
minimal_prehash_cache_file_size: u64,
delete_outdated_cache: bool,
}
@ -174,7 +173,9 @@ impl DuplicateFinder {
hash_type: HashType::Blake3,
dryrun: false,
use_cache: true,
minimal_cache_file_size: 2 * 1024 * 1024, // By default cache only >= 1MB files
use_prehash_cache: true,
minimal_cache_file_size: 1024 * 1024 / 4, // By default cache only >= 256 KB files
minimal_prehash_cache_file_size: 0,
delete_outdated_cache: true,
}
}
@ -229,6 +230,10 @@ impl DuplicateFinder {
self.minimal_cache_file_size = minimal_cache_file_size;
}
pub fn set_minimal_prehash_cache_file_size(&mut self, minimal_prehash_cache_file_size: u64) {
self.minimal_prehash_cache_file_size = minimal_prehash_cache_file_size;
}
pub const fn get_files_sorted_by_names(&self) -> &BTreeMap<String, Vec<FileEntry>> {
&self.files_with_identical_names
}
@ -237,6 +242,10 @@ impl DuplicateFinder {
self.use_cache = use_cache;
}
pub fn set_use_prehash_cache(&mut self, use_prehash_cache: bool) {
self.use_prehash_cache = use_prehash_cache;
}
pub const fn get_files_sorted_by_size(&self) -> &BTreeMap<u64, Vec<FileEntry>> {
&self.files_with_identical_size
}
@ -659,6 +668,8 @@ impl DuplicateFinder {
/// The slowest checking type, which must be applied after checking for size
fn check_files_hash(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
assert_eq!(self.check_method, CheckingMethod::Hash);
let check_type = Arc::new(self.hash_type);
let start_time: SystemTime = SystemTime::now();
@ -699,57 +710,136 @@ impl DuplicateFinder {
//// PROGRESS THREAD END
#[allow(clippy::type_complexity)]
let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>, u64)> = self
.files_with_identical_size
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut bytes_read: u64 = 0;
let mut buffer = [0u8; 1024 * 2];
///////////////////////////////////////////////////////////////////////////// PREHASHING START
{
let loaded_hash_map;
let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
Ok((hash_string, bytes)) => {
bytes_read += bytes;
hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone());
}
Err(s) => errors.push(s),
// Cache algorithm
// - Load data from cache
// - Convert from BT<u64,Vec<FileEntry>> to BT<String,FileEntry>
// - Save to proper values
if self.use_prehash_cache {
loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, true) {
Some(t) => t,
None => Default::default(),
};
let mut loaded_hash_map2: BTreeMap<String, FileEntry> = Default::default();
for vec_file_entry in loaded_hash_map.values() {
for file_entry in vec_file_entry {
loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
Some((*size, hashmap_with_hash, errors, bytes_read))
})
.while_some()
.collect();
// End the thread which sends info to the GUI
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
#[allow(clippy::if_same_then_else)]
for vec_file_entry in self.files_with_identical_size.values() {
for file_entry in vec_file_entry {
let name = file_entry.path.to_string_lossy().to_string();
if !loaded_hash_map2.contains_key(&name) {
// If the loaded data doesn't contain info about the current file
non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new);
non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry.clone());
} else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date {
// When the size or modification date of the file changed, it is clearly a different file
non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new);
non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry.clone());
} else {
// Checking may be omitted when there is already an entry with the same size and modification date
records_already_cached.entry(file_entry.size).or_insert_with(Vec::new);
records_already_cached.get_mut(&file_entry.size).unwrap().push(file_entry.clone());
}
}
}
} else {
loaded_hash_map = Default::default();
mem::swap(&mut self.files_with_identical_size, &mut non_cached_files_to_check);
}
// Check if the user aborted the search (only possible from the GUI)
if check_was_breaked.load(Ordering::Relaxed) {
return false;
}
#[allow(clippy::type_complexity)]
let pre_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)> = non_cached_files_to_check
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut buffer = [0u8; 1024 * 2];
// Check results
for (size, hash_map, mut errors, bytes_read) in pre_hash_results {
self.information.bytes_read_when_hashing += bytes_read;
self.text_messages.warnings.append(&mut errors);
for (_hash, mut vec_file_entry) in hash_map {
if vec_file_entry.len() > 1 {
pre_checked_map.entry(size).or_insert_with(Vec::new);
pre_checked_map.get_mut(&size).unwrap().append(&mut vec_file_entry);
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
match hash_calculation(&mut buffer, file_entry, &check_type, 0) {
Ok(hash_string) => {
hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.clone());
}
Err(s) => errors.push(s),
}
}
Some((*size, hashmap_with_hash, errors))
})
.while_some()
.collect();
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
// Check if user aborted search(only from GUI)
if check_was_breaked.load(Ordering::Relaxed) {
return false;
}
// Add data from cache
for (size, vec_file_entry) in &records_already_cached {
pre_checked_map.entry(*size).or_insert_with(Vec::new);
pre_checked_map.get_mut(size).unwrap().append(&mut vec_file_entry.clone());
}
// Check results
for (size, hash_map, errors) in &pre_hash_results {
self.text_messages.warnings.append(&mut errors.clone());
for vec_file_entry in hash_map.values() {
if vec_file_entry.len() > 1 {
pre_checked_map.entry(*size).or_insert_with(Vec::new);
pre_checked_map.get_mut(size).unwrap().append(&mut vec_file_entry.clone());
}
}
}
if self.use_prehash_cache {
println!("non cached - {}", non_cached_files_to_check.values().map(|e| e.len()).sum::<usize>());
println!("cached - {}", records_already_cached.values().map(|e| e.len()).sum::<usize>());
// All results = records already cached + computed results
let mut save_cache_to_hashmap: BTreeMap<String, FileEntry> = Default::default();
for (size, vec_file_entry) in loaded_hash_map {
if size >= self.minimal_prehash_cache_file_size {
for file_entry in vec_file_entry {
save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
}
for (size, hash_map, _errors) in &pre_hash_results {
if *size >= self.minimal_prehash_cache_file_size {
for vec_file_entry in hash_map.values() {
for file_entry in vec_file_entry {
save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
}
}
save_hashes_to_file(&save_cache_to_hashmap, &mut self.text_messages, &self.hash_type, true, self.minimal_prehash_cache_file_size);
}
}
///////////////////////////////////////////////////////////////////////////// PREHASHING END
Common::print_time(start_time, SystemTime::now(), "check_files_hash - prehash".to_string());
let start_time: SystemTime = SystemTime::now();
@ -766,7 +856,7 @@ impl DuplicateFinder {
let progress_send = progress_sender.clone();
let progress_thread_run = progress_thread_run.clone();
let atomic_file_counter = atomic_file_counter.clone();
let files_to_check = pre_checked_map.iter().map(|e| e.1.len()).sum();
let files_to_check = pre_checked_map.iter().map(|(_size, vec_file_entry)| vec_file_entry.len()).sum();
let checking_method = self.check_method.clone();
progress_thread_handle = thread::spawn(move || loop {
progress_send
@ -789,154 +879,151 @@ impl DuplicateFinder {
//// PROGRESS THREAD END
#[allow(clippy::type_complexity)]
let mut full_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>, u64)>;
///////////////////////////////////////////////////////////////////////////// HASHING START
{
#[allow(clippy::type_complexity)]
let mut full_hash_results: Vec<(u64, BTreeMap<String, Vec<FileEntry>>, Vec<String>)>;
match self.check_method {
CheckingMethod::Hash => {
let loaded_hash_map;
let loaded_hash_map;
let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
let mut records_already_cached: BTreeMap<u64, Vec<FileEntry>> = Default::default();
let mut non_cached_files_to_check: BTreeMap<u64, Vec<FileEntry>> = Default::default();
if self.use_cache {
loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, false) {
Some(t) => t,
None => Default::default(),
};
if self.use_cache {
loaded_hash_map = match load_hashes_from_file(&mut self.text_messages, self.delete_outdated_cache, &self.hash_type, false) {
Some(t) => t,
None => Default::default(),
};
for (size, vec_file_entry) in pre_checked_map {
#[allow(clippy::collapsible_if)]
if !loaded_hash_map.contains_key(&size) {
// If the loaded data doesn't contain an entry for the current size
non_cached_files_to_check.insert(size, vec_file_entry);
} else {
let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap();
for (size, vec_file_entry) in pre_checked_map {
#[allow(clippy::collapsible_if)]
if !loaded_hash_map.contains_key(&size) {
// If loaded data doesn't contains current info
non_cached_files_to_check.insert(size, vec_file_entry);
} else {
let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap();
for file_entry in vec_file_entry {
let mut found: bool = false;
for loaded_file_entry in loaded_vec_file_entry {
if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date {
records_already_cached.entry(file_entry.size).or_insert_with(Vec::new);
records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone());
found = true;
break;
}
}
if !found {
non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new);
non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry);
}
}
}
}
} else {
loaded_hash_map = Default::default();
mem::swap(&mut pre_checked_map, &mut non_cached_files_to_check);
}
full_hash_results = non_cached_files_to_check
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut buffer = [0u8; 1024 * 128];
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
match hash_calculation(&mut buffer, file_entry, &check_type, u64::MAX) {
Ok(hash_string) => {
let mut file_entry = file_entry.clone();
file_entry.hash = hash_string.clone();
hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry);
}
Err(s) => errors.push(s),
}
}
Some((*size, hashmap_with_hash, errors))
})
.while_some()
.collect();
if self.use_cache {
'main: for (size, vec_file_entry) in records_already_cached {
// Check if the size already exists; if so, we must modify the entry in place because we cannot hold both a mutable and an immutable reference to full_hash_results
for (full_size, full_hashmap, _errors) in &mut full_hash_results {
if size == *full_size {
for file_entry in vec_file_entry {
let mut found: bool = false;
for loaded_file_entry in loaded_vec_file_entry {
if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date {
records_already_cached.entry(file_entry.size).or_insert_with(Vec::new);
records_already_cached.get_mut(&file_entry.size).unwrap().push(loaded_file_entry.clone());
found = true;
break;
}
}
if !found {
non_cached_files_to_check.entry(file_entry.size).or_insert_with(Vec::new);
non_cached_files_to_check.get_mut(&file_entry.size).unwrap().push(file_entry);
}
full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
}
continue 'main;
}
}
} else {
loaded_hash_map = Default::default();
mem::swap(&mut pre_checked_map, &mut non_cached_files_to_check);
// Size doesn't exist yet, so add the results as a new entry
let mut temp_hashmap: BTreeMap<String, Vec<FileEntry>> = Default::default();
for file_entry in vec_file_entry {
temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
}
full_hash_results.push((size, temp_hashmap, Vec::new()));
}
full_hash_results = non_cached_files_to_check
.par_iter()
.map(|(size, vec_file_entry)| {
let mut hashmap_with_hash: BTreeMap<String, Vec<FileEntry>> = Default::default();
let mut errors: Vec<String> = Vec::new();
let mut bytes_read: u64 = 0;
let mut buffer = [0u8; 1024 * 128];
atomic_file_counter.fetch_add(vec_file_entry.len(), Ordering::Relaxed);
for file_entry in vec_file_entry {
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_breaked.store(true, Ordering::Relaxed);
return None;
}
match hash_calculation(&mut buffer, file_entry, &check_type, u64::MAX) {
Ok((hash_string, bytes)) => {
bytes_read += bytes;
let mut file_entry = file_entry.clone();
file_entry.hash = hash_string.clone();
hashmap_with_hash.entry(hash_string.clone()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry);
}
Err(s) => errors.push(s),
}
}
Some((*size, hashmap_with_hash, errors, bytes_read))
})
.while_some()
.collect();
if self.use_cache {
'main: for (size, vec_file_entry) in records_already_cached {
// Check if the size already exists; if so, we must modify the entry in place because we cannot hold both a mutable and an immutable reference to full_hash_results
for (full_size, full_hashmap, _errors, _bytes_read) in &mut full_hash_results {
if size == *full_size {
for file_entry in vec_file_entry {
full_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
full_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
}
continue 'main;
}
}
// Size doesn't exist yet, so add the results as a new entry
let mut temp_hashmap: BTreeMap<String, Vec<FileEntry>> = Default::default();
for file_entry in vec_file_entry {
temp_hashmap.entry(file_entry.hash.clone()).or_insert_with(Vec::new);
temp_hashmap.get_mut(&file_entry.hash).unwrap().push(file_entry);
}
full_hash_results.push((size, temp_hashmap, Vec::new(), 0));
// Save all results to the cache file: entries previously loaded from the file merged with the newly computed ones
let mut all_results: BTreeMap<String, FileEntry> = Default::default();
for (_size, vec_file_entry) in loaded_hash_map {
for file_entry in vec_file_entry {
all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
}
// Must save all results to file, old loaded from file with all currently counted results
let mut all_results: BTreeMap<String, FileEntry> = Default::default();
for (_size, vec_file_entry) in loaded_hash_map {
}
for (_size, hashmap, _errors) in &full_hash_results {
for vec_file_entry in hashmap.values() {
for file_entry in vec_file_entry {
all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
for (_size, hashmap, _errors, _bytes_read) in &full_hash_results {
for vec_file_entry in hashmap.values() {
for file_entry in vec_file_entry {
all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
}
save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type, false, self.minimal_cache_file_size);
}
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
// Check if user aborted search(only from GUI)
if check_was_breaked.load(Ordering::Relaxed) {
return false;
}
for (size, hash_map, mut errors) in full_hash_results {
self.text_messages.warnings.append(&mut errors);
for (_hash, vec_file_entry) in hash_map {
if vec_file_entry.len() > 1 {
self.files_with_identical_hashes.entry(size).or_insert_with(Vec::new);
self.files_with_identical_hashes.get_mut(&size).unwrap().push(vec_file_entry);
}
save_hashes_to_file(&all_results, &mut self.text_messages, &self.hash_type, false, self.minimal_cache_file_size);
}
}
_ => panic!("What"),
}
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
/////////////////////////
// Check if user aborted search(only from GUI)
if check_was_breaked.load(Ordering::Relaxed) {
return false;
}
for (size, hash_map, mut errors, bytes_read) in full_hash_results {
self.information.bytes_read_when_hashing += bytes_read;
self.text_messages.warnings.append(&mut errors);
for (_hash, vec_file_entry) in hash_map {
if vec_file_entry.len() > 1 {
self.files_with_identical_hashes.entry(size).or_insert_with(Vec::new);
self.files_with_identical_hashes.get_mut(&size).unwrap().push(vec_file_entry);
for (size, vector_vectors) in &self.files_with_identical_hashes {
for vector in vector_vectors {
self.information.number_of_duplicated_files_by_hash += vector.len() - 1;
self.information.number_of_groups_by_hash += 1;
self.information.lost_space_by_hash += (vector.len() as u64 - 1) * size;
}
}
}
/////////////////////////
for (size, vector_vectors) in &self.files_with_identical_hashes {
for vector in vector_vectors {
self.information.number_of_duplicated_files_by_hash += vector.len() - 1;
self.information.number_of_groups_by_hash += 1;
self.information.lost_space_by_hash += (vector.len() as u64 - 1) * size;
}
}
///////////////////////////////////////////////////////////////////////////// HASHING END
Common::print_time(start_time, SystemTime::now(), "check_files_hash - full hash".to_string());
@ -959,8 +1046,6 @@ impl DuplicateFinder {
for vector in self.files_with_identical_names.values() {
let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun);
self.information.gained_space += tuple.0;
self.information.number_of_removed_files += tuple.1;
self.information.number_of_failed_to_remove_files += tuple.2;
}
}
CheckingMethod::Hash => {
@ -968,8 +1053,6 @@ impl DuplicateFinder {
for vector in vector_vectors.iter() {
let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun);
self.information.gained_space += tuple.0;
self.information.number_of_removed_files += tuple.1;
self.information.number_of_failed_to_remove_files += tuple.2;
}
}
}
@ -977,8 +1060,6 @@ impl DuplicateFinder {
for vector in self.files_with_identical_size.values() {
let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages, self.dryrun);
self.information.gained_space += tuple.0;
self.information.number_of_removed_files += tuple.1;
self.information.number_of_failed_to_remove_files += tuple.2;
}
}
CheckingMethod::None => {
@ -1031,13 +1112,6 @@ impl DebugPrint for DuplicateFinder {
self.information.gained_space.file_size(options::BINARY).unwrap(),
self.information.gained_space
);
println!(
"Bytes read when hashing - {} ({} bytes)",
self.information.bytes_read_when_hashing.file_size(options::BINARY).unwrap(),
self.information.bytes_read_when_hashing
);
println!("Number of removed files - {}", self.information.number_of_removed_files);
println!("Number of failed to remove files - {}", self.information.number_of_failed_to_remove_files);
println!("### Other");
@ -1363,7 +1437,7 @@ pub trait MyHasher {
fn finalize(&self) -> String;
}
fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashType, limit: u64) -> Result<(String, u64), String> {
fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashType, limit: u64) -> Result<String, String> {
let mut file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(e) => return Err(format!("Unable to check hash of file {}, reason {}", file_entry.path.display(), e)),
@ -1384,7 +1458,7 @@ fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashT
break;
}
}
Ok((hasher.finalize(), current_file_read_bytes))
Ok(hasher.finalize())
}
fn get_file_hash_name(type_of_hash: &HashType, is_prehash: bool) -> String {
@ -1560,8 +1634,7 @@ mod tests {
file.write_all(b"aa")?;
let e = FileEntry { path: src, ..Default::default() };
let r = hash_calculation(&mut buf, &e, &HashType::Blake3, 0).unwrap();
assert_eq!(2, r.1);
assert!(!r.0.is_empty());
assert!(!r.is_empty());
Ok(())
}

View File

@ -457,7 +457,7 @@ impl SimilarVideos {
hashmap_with_file_entries.insert(file_entry.vhash.src_path().to_string_lossy().to_string(), file_entry.clone());
vector_of_hashes.push(file_entry.vhash.clone());
} else {
self.text_messages.errors.push(file_entry.error.clone());
self.text_messages.warnings.push(file_entry.error.clone());
}
}

View File

@ -92,7 +92,9 @@ pub fn connect_button_search(
let radio_button_hash_type_xxh3 = gui_data.main_notebook.radio_button_hash_type_xxh3.clone();
let check_button_settings_hide_hard_links = gui_data.settings.check_button_settings_hide_hard_links.clone();
let check_button_settings_use_cache = gui_data.settings.check_button_settings_use_cache.clone();
let check_button_duplicates_use_prehash_cache = gui_data.settings.check_button_duplicates_use_prehash_cache.clone();
let entry_settings_cache_file_minimal_size = gui_data.settings.entry_settings_cache_file_minimal_size.clone();
let entry_settings_prehash_cache_file_minimal_size = gui_data.settings.entry_settings_prehash_cache_file_minimal_size.clone();
let radio_button_similar_hash_size_4 = gui_data.main_notebook.radio_button_similar_hash_size_4.clone();
let radio_button_similar_hash_size_8 = gui_data.main_notebook.radio_button_similar_hash_size_8.clone();
let radio_button_similar_hash_size_16 = gui_data.main_notebook.radio_button_similar_hash_size_16.clone();
@ -118,7 +120,7 @@ pub fn connect_button_search(
let allowed_extensions = entry_allowed_extensions.text().as_str().to_string();
let hide_hard_links = check_button_settings_hide_hard_links.is_active();
let use_cache = check_button_settings_use_cache.is_active();
let minimal_cache_file_size = entry_settings_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(2 * 1024 * 1024);
let minimal_cache_file_size = entry_settings_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(1024 * 1024 / 4);
let show_dialog = Arc::new(AtomicBool::new(true));
@ -170,6 +172,9 @@ pub fn connect_button_search(
panic!("No radio button is pressed");
}
let use_prehash_cache = check_button_duplicates_use_prehash_cache.is_active();
let minimal_prehash_cache_file_size = entry_settings_prehash_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(0);
let delete_outdated_cache = check_button_settings_duplicates_delete_outdated_cache.is_active();
let futures_sender_duplicate_files = futures_sender_duplicate_files.clone();
@ -184,10 +189,12 @@ pub fn connect_button_search(
df.set_minimal_file_size(minimal_file_size);
df.set_maximal_file_size(maximal_file_size);
df.set_minimal_cache_file_size(minimal_cache_file_size);
df.set_minimal_prehash_cache_file_size(minimal_prehash_cache_file_size);
df.set_check_method(check_method);
df.set_hash_type(hash_type);
df.set_ignore_hard_links(hide_hard_links);
df.set_use_cache(use_cache);
df.set_use_prehash_cache(use_prehash_cache);
df.set_delete_outdated_cache(delete_outdated_cache);
df.find_duplicates(Some(&stop_receiver), Some(&futures_sender_duplicate_files));
let _ = glib_stop_sender.send(Message::Duplicates(df));

View File

@ -10,10 +10,10 @@ use crate::notebook_enums::*;
pub fn connect_button_select(gui_data: &GuiData) {
let mut hashmap: HashMap<NotebookMainEnum, Vec<PopoverTypes>> = Default::default();
{
hashmap.insert(NotebookMainEnum::SimilarImages, vec![PopoverTypes::All, PopoverTypes::ImageSize, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]);
hashmap.insert(NotebookMainEnum::SimilarVideos, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]);
hashmap.insert(NotebookMainEnum::SimilarImages, vec![PopoverTypes::All, PopoverTypes::Size, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]);
hashmap.insert(NotebookMainEnum::SimilarVideos, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date, PopoverTypes::Size]);
hashmap.insert(NotebookMainEnum::Duplicate, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]);
hashmap.insert(NotebookMainEnum::SameMusic, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date]);
hashmap.insert(NotebookMainEnum::SameMusic, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom, PopoverTypes::Date, PopoverTypes::Size]);
hashmap.insert(NotebookMainEnum::EmptyFiles, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom]);
hashmap.insert(NotebookMainEnum::EmptyDirectories, vec![PopoverTypes::All, PopoverTypes::Reverse, PopoverTypes::Custom]);
@ -63,7 +63,7 @@ fn show_required_popovers(popovers: &GuiPopovers, current_mode: &NotebookMainEnu
buttons_popover_unselect_all.hide();
}
if vec.contains(&PopoverTypes::ImageSize) {
if vec.contains(&PopoverTypes::Size) {
buttons_popover_select_all_images_except_biggest.show();
buttons_popover_select_all_images_except_smallest.show();
separator_select_image_size.show();

View File

@ -345,7 +345,7 @@ fn popover_custom_select_unselect(popover: &gtk::Popover, window_main: &Window,
}
}
fn popover_all_except_biggest_smallest(popover: &gtk::Popover, tree_view: &gtk::TreeView, column_color: i32, column_size_as_bytes: i32, column_dimensions: i32, column_button_selection: u32, except_biggest: bool) {
fn popover_all_except_biggest_smallest(popover: &gtk::Popover, tree_view: &gtk::TreeView, column_color: i32, column_size_as_bytes: i32, column_dimensions: Option<i32>, column_button_selection: u32, except_biggest: bool) {
let model = get_list_store(tree_view);
if let Some(iter) = model.iter_first() {
@ -373,22 +373,38 @@ fn popover_all_except_biggest_smallest(popover: &gtk::Popover, tree_view: &gtk::
}
tree_iter_array.push(iter.clone());
let size_as_bytes = model.value(&iter, column_size_as_bytes).get::<u64>().unwrap();
let dimensions_string = model.value(&iter, column_dimensions).get::<String>().unwrap();
let dimensions = change_dimension_to_krotka(dimensions_string);
let number_of_pixels = dimensions.0 * dimensions.1;
// If the dimensions column exists, images must be compared by pixel count first
if let Some(column_dimensions) = column_dimensions {
let dimensions_string = model.value(&iter, column_dimensions).get::<String>().unwrap();
if except_biggest {
if number_of_pixels > number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes > size_as_bytes_min_max) {
number_of_pixels_min_max = number_of_pixels;
size_as_bytes_min_max = size_as_bytes;
used_index = Some(current_index);
let dimensions = change_dimension_to_krotka(dimensions_string);
let number_of_pixels = dimensions.0 * dimensions.1;
if except_biggest {
if number_of_pixels > number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes > size_as_bytes_min_max) {
number_of_pixels_min_max = number_of_pixels;
size_as_bytes_min_max = size_as_bytes;
used_index = Some(current_index);
}
} else {
if number_of_pixels < number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes < size_as_bytes_min_max) {
number_of_pixels_min_max = number_of_pixels;
size_as_bytes_min_max = size_as_bytes;
used_index = Some(current_index);
}
}
} else {
if number_of_pixels < number_of_pixels_min_max || (number_of_pixels == number_of_pixels_min_max && size_as_bytes < size_as_bytes_min_max) {
number_of_pixels_min_max = number_of_pixels;
size_as_bytes_min_max = size_as_bytes;
used_index = Some(current_index);
if except_biggest {
if size_as_bytes > size_as_bytes_min_max {
size_as_bytes_min_max = size_as_bytes;
used_index = Some(current_index);
}
} else {
if size_as_bytes < size_as_bytes_min_max {
size_as_bytes_min_max = size_as_bytes;
used_index = Some(current_index);
}
}
}
@ -593,9 +609,9 @@ pub fn connect_popovers(gui_data: &GuiData) {
popover_all_except_biggest_smallest(
&popover_select,
tree_view,
nb_object.column_color.expect("AEB can't be used without headers"),
nb_object.column_size_as_bytes.expect("AEB needs size as bytes column"),
nb_object.column_dimensions.expect("AEB needs dimensions column"),
nb_object.column_color.expect("AEBI can't be used without headers"),
nb_object.column_size_as_bytes.expect("AEBI needs size as bytes column"),
nb_object.column_dimensions,
nb_object.column_selection as u32,
true,
);
@ -613,9 +629,9 @@ pub fn connect_popovers(gui_data: &GuiData) {
popover_all_except_biggest_smallest(
&popover_select,
tree_view,
nb_object.column_color.expect("AES can't be used without headers"),
nb_object.column_size_as_bytes.expect("AES needs size as bytes column"),
nb_object.column_dimensions.expect("AES needs dimensions column"),
nb_object.column_color.expect("AESI can't be used without headers"),
nb_object.column_size_as_bytes.expect("AESI needs size as bytes column"),
nb_object.column_dimensions,
nb_object.column_selection as u32,
false,
);

View File

@ -17,6 +17,8 @@ pub struct GuiSettings {
// Duplicates
pub check_button_settings_hide_hard_links: gtk::CheckButton,
pub entry_settings_cache_file_minimal_size: gtk::Entry,
pub entry_settings_prehash_cache_file_minimal_size: gtk::Entry,
pub check_button_duplicates_use_prehash_cache: gtk::CheckButton,
pub check_button_settings_show_preview_duplicates: gtk::CheckButton,
pub check_button_settings_duplicates_delete_outdated_cache: gtk::CheckButton,
pub button_settings_duplicates_clear_cache: gtk::Button,
@ -71,6 +73,8 @@ impl GuiSettings {
let check_button_settings_show_preview_duplicates: gtk::CheckButton = builder.object("check_button_settings_show_preview_duplicates").unwrap();
let check_button_settings_duplicates_delete_outdated_cache: gtk::CheckButton = builder.object("check_button_settings_duplicates_delete_outdated_cache").unwrap();
let button_settings_duplicates_clear_cache: gtk::Button = builder.object("button_settings_duplicates_clear_cache").unwrap();
let check_button_duplicates_use_prehash_cache: gtk::CheckButton = builder.object("check_button_duplicates_use_prehash_cache").unwrap();
let entry_settings_prehash_cache_file_minimal_size: gtk::Entry = builder.object("entry_settings_prehash_cache_file_minimal_size").unwrap();
check_button_settings_hide_hard_links.set_tooltip_text(Some(
"Hides all files except one, if are points to same data(are hardlinked).\n\nE.g. in case where on disk there is 7 files which are hardlinked to specific data and one different file with same data but different inode, then in duplicate finder will be visible only one unique file and one file from hardlinked ones.",
@ -80,7 +84,10 @@ impl GuiSettings {
));
check_button_settings_show_preview_duplicates.set_tooltip_text(Some("Shows preview at right side, when selecting image file."));
check_button_settings_duplicates_delete_outdated_cache.set_tooltip_text(Some("Allows to delete outdated cache results which points to non-existent files.\n\nWhen enabled, app make sure when loading records, that all points to valid files and ignore broken ones.\n\nDisabling this option, will help to scan files on external drives, so cache entries about them will not be purged in next scan.\n\nIn case of having hundred of thousands records in cache, it is suggested to enable this option, to speedup cache loading and saving at start and end of scan."));
button_settings_duplicates_clear_cache.set_tooltip_text(Some("Manually clear cache from outdated entries.\nShould be used only if automatic clearing was disabled."));
button_settings_duplicates_clear_cache.set_tooltip_text(Some("Manually clear cache from outdated entries.\n\nShould be used only if automatic clearing was disabled."));
check_button_duplicates_use_prehash_cache.set_tooltip_text(Some(
"Enables caching of prehash(hash computed from small part of file) which allows to earlier throw out non duplicated results.\n\nIt is disabled by default because can cause in some situations slowdowns.\n\nIt is heavily recommended to use it when scanning hundred of thousands or million files, because it can speedup search multiple times.",
));
// Similar Images
let check_button_settings_show_preview_similar_images: gtk::CheckButton = builder.object("check_button_settings_show_preview_similar_images").unwrap();
@ -126,6 +133,8 @@ impl GuiSettings {
check_button_settings_use_trash,
check_button_settings_hide_hard_links,
entry_settings_cache_file_minimal_size,
entry_settings_prehash_cache_file_minimal_size,
check_button_duplicates_use_prehash_cache,
check_button_settings_show_preview_duplicates,
check_button_settings_duplicates_delete_outdated_cache,
button_settings_duplicates_clear_cache,

View File

@ -33,7 +33,7 @@ pub const KEY_END: u32 = 110;
#[derive(Eq, PartialEq)]
pub enum PopoverTypes {
All,
ImageSize,
Size,
Reverse,
Custom,
Date,

View File

@ -141,7 +141,7 @@ pub fn save_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb
//// minimal cache file size
data_to_save.push("--cache_minimal_file_size:".to_string());
let entry_settings_cache_file_minimal_size = settings.entry_settings_cache_file_minimal_size.clone();
data_to_save.push(entry_settings_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(2 * 1024 * 1024).to_string());
data_to_save.push(entry_settings_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(1024 * 1024 / 4).to_string());
//// Duplicates, delete outdated entries to trash
data_to_save.push("--delete_outdated_entries_duplicates:".to_string());
@ -157,6 +157,16 @@ pub fn save_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb
data_to_save.push("--delete_outdated_entries_similar_videos:".to_string());
let check_button_settings_similar_videos_delete_outdated_cache = settings.check_button_settings_similar_videos_delete_outdated_cache.clone();
data_to_save.push(check_button_settings_similar_videos_delete_outdated_cache.is_active().to_string());
//// Use prehash cache system
data_to_save.push("--use_prehash_cache:".to_string());
let check_button_duplicates_use_prehash_cache = settings.check_button_duplicates_use_prehash_cache.clone();
data_to_save.push(check_button_duplicates_use_prehash_cache.is_active().to_string());
//// minimal prehash cache file size
data_to_save.push("--cache_prehash_minimal_file_size:".to_string());
let entry_settings_prehash_cache_file_minimal_size = settings.entry_settings_prehash_cache_file_minimal_size.clone();
data_to_save.push(entry_settings_prehash_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(0).to_string());
}
// Creating/Opening config file
@ -213,6 +223,8 @@ enum TypeOfLoadedData {
DeleteCacheDuplicates,
DeleteCacheSimilarImages,
DeleteCacheSimilarVideos,
UsePrehashCache,
CachePrehashMinimalSize,
}
pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNotebook, settings: &GuiSettings, text_view_errors: &TextView, scrolled_window_errors: &ScrolledWindow) {
@ -264,6 +276,8 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb
let mut delete_outdated_cache_dupliactes: bool = true;
let mut delete_outdated_cache_similar_images: bool = true;
let mut delete_outdated_cache_similar_videos: bool = false;
let mut use_prehash_cache: bool = false;
let mut cache_prehash_minimal_size: u64 = 0;
let mut current_type = TypeOfLoadedData::None;
for (line_number, line) in loaded_data.replace("\r\n", "\n").split('\n').enumerate() {
@ -307,6 +321,10 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb
current_type = TypeOfLoadedData::DeleteCacheSimilarVideos;
} else if line.starts_with("--delete_outdated_entries_similar_images") {
current_type = TypeOfLoadedData::DeleteCacheSimilarImages;
} else if line.starts_with("--use_prehash_cache") {
current_type = TypeOfLoadedData::UsePrehashCache;
} else if line.starts_with("--cache_prehash_minimal_file_size") {
current_type = TypeOfLoadedData::CachePrehashMinimalSize;
} else if line.starts_with("--") {
current_type = TypeOfLoadedData::None;
add_text_to_text_view(
@ -512,6 +530,29 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb
);
}
}
TypeOfLoadedData::UsePrehashCache => {
let line = line.to_lowercase();
if line == "1" || line == "true" {
use_prehash_cache = true;
} else if line == "0" || line == "false" {
use_prehash_cache = false;
} else {
add_text_to_text_view(
&text_view_errors,
format!("Found invalid data in line {} \"{}\" isn't proper value(0/1/true/false) when loading file {:?}", line_number, line, config_file).as_str(),
);
}
}
TypeOfLoadedData::CachePrehashMinimalSize => {
if let Ok(number) = line.parse::<u64>() {
cache_prehash_minimal_size = number;
} else {
add_text_to_text_view(
&text_view_errors,
format!("Found invalid data in line {} \"{}\" isn't proper value(u64) when loading file {:?}", line_number, line, config_file).as_str(),
);
}
}
}
}
}
@ -566,8 +607,10 @@ pub fn load_configuration(manual_execution: bool, upper_notebook: &GuiUpperNoteb
}
settings.check_button_settings_hide_hard_links.set_active(hide_hard_links);
settings.check_button_settings_use_cache.set_active(use_cache);
settings.check_button_duplicates_use_prehash_cache.set_active(use_prehash_cache);
settings.check_button_settings_use_trash.set_active(use_trash);
settings.entry_settings_cache_file_minimal_size.set_text(cache_minimal_size.to_string().as_str());
settings.entry_settings_prehash_cache_file_minimal_size.set_text(cache_prehash_minimal_size.to_string().as_str());
} else {
settings.check_button_settings_load_at_start.set_active(false);
}
@ -650,10 +693,12 @@ pub fn reset_configuration(manual_clearing: bool, upper_notebook: &GuiUpperNoteb
settings.check_button_settings_hide_hard_links.set_active(true);
settings.check_button_settings_use_cache.set_active(true);
settings.check_button_settings_use_trash.set_active(false);
settings.entry_settings_cache_file_minimal_size.set_text("524288");
settings.entry_settings_cache_file_minimal_size.set_text("257144");
settings.check_button_settings_similar_videos_delete_outdated_cache.set_active(false);
settings.check_button_settings_similar_images_delete_outdated_cache.set_active(true);
settings.check_button_settings_duplicates_delete_outdated_cache.set_active(true);
settings.check_button_duplicates_use_prehash_cache.set_active(false);
settings.entry_settings_prehash_cache_file_minimal_size.set_text("0");
}
if manual_clearing {
add_text_to_text_view(&text_view_errors, "Current configuration was cleared.");

View File

@ -301,7 +301,7 @@ Author: Rafał Mikrut
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">Minimal cached file size in bytes</property>
<property name="label" translatable="yes">Minimal size of files in bytes saved to cache</property>
</object>
<packing>
<property name="expand">True</property>
@ -314,7 +314,7 @@ Author: Rafał Mikrut
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="max-length">15</property>
<property name="text" translatable="yes">524288</property>
<property name="text" translatable="yes">257144</property>
<property name="caps-lock-warning">False</property>
<property name="input-purpose">number</property>
</object>
@ -332,6 +332,20 @@ Author: Rafał Mikrut
<property name="position">3</property>
</packing>
</child>
<child>
<object class="GtkCheckButton" id="check_button_duplicates_use_prehash_cache">
<property name="label" translatable="yes">Use prehash cache</property>
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="receives-default">False</property>
<property name="draw-indicator">True</property>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">4</property>
</packing>
</child>
<child>
<object class="GtkButton" id="button_settings_duplicates_clear_cache">
<property name="label" translatable="yes">Remove outdated results from duplicates cache</property>
@ -343,7 +357,50 @@ Author: Rafał Mikrut
<property name="expand">False</property>
<property name="fill">False</property>
<property name="pack-type">end</property>
<property name="position">4</property>
<property name="position">5</property>
</packing>
</child>
<child>
<object class="GtkBox">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="margin-left">4</property>
<property name="margin-right">4</property>
<property name="margin-start">4</property>
<property name="margin-end">4</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">Minimal size of files in bytes saved to prehash cache</property>
</object>
<packing>
<property name="expand">True</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
<child>
<object class="GtkEntry" id="entry_settings_prehash_cache_file_minimal_size">
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="max-length">15</property>
<property name="text" translatable="yes">1</property>
<property name="caps-lock-warning">False</property>
<property name="input-purpose">number</property>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">False</property>
<property name="pack-type">end</property>
<property name="position">1</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">6</property>
</packing>
</child>
</object>