From e976d40eeeaebd833fcf237a3504135165e2d2ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com> Date: Sat, 7 Oct 2023 18:04:17 +0200 Subject: [PATCH] Loading saving cache improvements (#1072) * Loading cache * Loading * Loading x2 * Optimization * Cache common * Delete outdated cache * Common * Loading cache/save almost * Simplified a lot of cache concept * Fix regression --- Cargo.lock | 58 ++--- czkawka_core/src/big_file.rs | 2 +- czkawka_core/src/broken_files.rs | 129 ++------- czkawka_core/src/common_cache.rs | 237 +++++++++++++++++ czkawka_core/src/common_dir_traversal.rs | 15 +- czkawka_core/src/common_messages.rs | 7 + czkawka_core/src/common_tool.rs | 3 + czkawka_core/src/common_traits.rs | 2 + czkawka_core/src/duplicate.rs | 246 ++++++------------ czkawka_core/src/lib.rs | 1 + czkawka_core/src/same_music.rs | 133 ++-------- czkawka_core/src/similar_images.rs | 165 +++--------- czkawka_core/src/similar_videos.rs | 108 ++------ .../connect_things/connect_button_search.rs | 1 + .../src/connect_things/connect_settings.rs | 46 +++- 15 files changed, 507 insertions(+), 646 deletions(-) create mode 100644 czkawka_core/src/common_cache.rs diff --git a/Cargo.lock b/Cargo.lock index a951b06..baf6cf8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -129,7 +129,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -247,9 +247,9 @@ checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bzip2" @@ -397,7 +397,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -692,7 +692,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -724,7 +724,7 @@ checksum = "c2ad8cef1d801a4686bfd8919f0b30eac4c8e48968c437a6405ded4fb5272d2b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -990,7 +990,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1191,7 +1191,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1460,7 +1460,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.37", + "syn 2.0.38", "unic-langid", ] @@ -1474,7 +1474,7 @@ dependencies = [ "i18n-config", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1730,9 +1730,9 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linked-hash-map" @@ -1793,7 +1793,7 @@ checksum = "764b60e1ddd07e5665a6a17636a95cd7d8f3b86c73503a69c32979d05f72f3cf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -2238,9 +2238,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.67" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c" dependencies = [ "unicode-ident", ] @@ -2442,7 +2442,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.37", + "syn 2.0.38", "walkdir", ] @@ -2520,9 +2520,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.15" +version = "0.38.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2f9da0cbd88f9f09e7814e388301c8414c51c62aa6ce1e4b5c551d49d96e531" +checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7" dependencies = [ "bitflags 2.4.0", "errno", @@ -2603,7 +2603,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -2662,9 +2662,9 @@ dependencies = [ [[package]] name = "sharded-slab" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1b21f559e07218024e7e9f90f96f601825397de0e25420135f7f952453fed0b" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" dependencies = [ "lazy_static", ] @@ -2955,9 +2955,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.37" +version = "2.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ "proc-macro2", "quote", @@ -3022,7 +3022,7 @@ checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -3174,7 +3174,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -3416,7 +3416,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", "wasm-bindgen-shared", ] @@ -3438,7 +3438,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3629,9 +3629,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "winnow" -version = "0.5.15" +version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c2e3184b9c4e92ad5167ca73039d0c42476302ab603e2fec4487511f38ccefc" +checksum = "037711d82167854aff2018dfd193aa0fef5370f456732f0d5a0c59b0f1b4b907" dependencies = [ "memchr", ] diff --git a/czkawka_core/src/big_file.rs b/czkawka_core/src/big_file.rs index 6e9039f..e16d5a7 100644 --- a/czkawka_core/src/big_file.rs +++ b/czkawka_core/src/big_file.rs @@ -17,7 +17,7 @@ use crate::common_dir_traversal::{common_get_entry_data_metadata, common_read_di use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::{DebugPrint, PrintResults, SaveResults}; -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct FileEntry { pub path: PathBuf, pub size: u64, diff --git a/czkawka_core/src/broken_files.rs b/czkawka_core/src/broken_files.rs index 5697cd0..92c1521 100644 --- a/czkawka_core/src/broken_files.rs +++ b/czkawka_core/src/broken_files.rs @@ -1,7 +1,7 @@ use std::collections::BTreeMap; use std::fs::{DirEntry, File, Metadata}; use std::io::prelude::*; -use std::io::{BufReader, BufWriter}; +use std::io::BufWriter; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -18,11 +18,11 @@ use rayon::prelude::*; use serde::{Deserialize, Serialize}; use crate::common::{ - check_folder_children, create_crash_message, open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, AUDIO_FILES_EXTENSIONS, + check_folder_children, create_crash_message, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, AUDIO_FILES_EXTENSIONS, IMAGE_RS_BROKEN_FILES_EXTENSIONS, PDF_FILES_EXTENSIONS, ZIP_FILES_EXTENSIONS, }; +use crate::common_cache::{get_broken_files_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized}; use crate::common_dir_traversal::{common_get_entry_data_metadata, common_read_dir, get_lowercase_name, get_modified_time, CheckingMethod, ProgressData, ToolType}; -use crate::common_messages::Messages; use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::*; @@ -40,6 +40,17 @@ pub struct FileEntry { pub type_of_file: TypeOfFile, pub error_string: String, } +impl ResultEntry for FileEntry { + fn get_path(&self) -> &Path { + &self.path + } + fn get_modified_date(&self) -> u64 { + self.modified_date + } + fn get_size(&self) -> u64 { + self.size + } +} #[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] pub enum TypeOfFile { @@ -218,11 +229,8 @@ impl BrokenFiles { } let type_of_file = check_extension_availability(&file_name_lowercase); - if type_of_file == TypeOfFile::Unknown { - return None; - } - if !check_extension_allowed(&type_of_file, &self.checked_types) { + if !check_if_file_extension_is_allowed(&type_of_file, &self.checked_types) { return None; } @@ -349,26 +357,15 @@ impl BrokenFiles { let files_to_check = mem::take(&mut self.files_to_check); if self.common_data.use_cache { - loaded_hash_map = match load_cache_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache) { - Some(t) => t, - None => Default::default(), - }; + let (messages, loaded_items) = load_cache_from_file_generalized_by_path::(&get_broken_files_cache_file(), self.get_delete_outdated_cache(), &files_to_check); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); for (name, file_entry) in files_to_check { - let checked_extension = check_extension_allowed(&file_entry.type_of_file, &self.checked_types); // Only broken - - #[allow(clippy::if_same_then_else)] - if checked_extension && !loaded_hash_map.contains_key(&name) { - // If loaded data doesn't contains current info - non_cached_files_to_check.insert(name, file_entry.clone()); - } else if checked_extension && file_entry.size != loaded_hash_map.get(&name).unwrap().size - || file_entry.modified_date != loaded_hash_map.get(&name).unwrap().modified_date - { - // When size or modification date of image changed, then it is clear that is different image - non_cached_files_to_check.insert(name, file_entry); + if let Some(cached_file_entry) = loaded_hash_map.get(&name) { + records_already_cached.insert(name.clone(), cached_file_entry.clone()); } else { - // Checking may be omitted when already there is entry with same size and modification date - records_already_cached.insert(name.clone(), loaded_hash_map.get(&name).unwrap().clone()); + non_cached_files_to_check.insert(name, file_entry); } } } else { @@ -440,7 +437,9 @@ impl BrokenFiles { for (_name, file_entry) in loaded_hash_map { all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); } - save_cache_to_file(&all_results, &mut self.common_data.text_messages, self.common_data.save_also_as_json); + + let messages = save_cache_to_file_generalized(&get_broken_files_cache_file(), &all_results, self.common_data.save_also_as_json, 0); + self.get_text_messages_mut().extend_with_another_messages(messages); } debug!("save_to_cache - end"); } @@ -536,84 +535,6 @@ impl PrintResults for BrokenFiles { } } -fn save_cache_to_file(old_hashmap: &BTreeMap, text_messages: &mut Messages, save_also_as_json: bool) { - let mut hashmap: BTreeMap = Default::default(); - for (path, fe) in old_hashmap { - if fe.size > 1024 { - hashmap.insert(path.clone(), fe.clone()); - } - } - let hashmap = &hashmap; - - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = open_cache_folder(&get_cache_file(), true, save_also_as_json, &mut text_messages.warnings) { - { - let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here - if let Err(e) = bincode::serialize_into(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e)); - return; - } - } - if save_also_as_json { - if let Some(file_handler_json) = file_handler_json { - let writer = BufWriter::new(file_handler_json); - if let Err(e) = serde_json::to_writer(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e)); - return; - } - } - } - - text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len())); - } -} - -fn load_cache_from_file(text_messages: &mut Messages, delete_outdated_cache: bool) -> Option> { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = open_cache_folder(&get_cache_file(), false, true, &mut text_messages.warnings) { - let mut hashmap_loaded_entries: BTreeMap; - if let Some(file_handler) = file_handler { - let reader = BufReader::new(file_handler); - hashmap_loaded_entries = match bincode::deserialize_from(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file.display(), e)); - return None; - } - }; - } else { - let reader = BufReader::new(file_handler_json.unwrap()); // Unwrap cannot fail, because at least one file must be valid - hashmap_loaded_entries = match serde_json::from_reader(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file_json.display(), e)); - return None; - } - }; - } - - // Don't load cache data if destination file not exists - if delete_outdated_cache { - hashmap_loaded_entries.retain(|src_path, _file_entry| Path::new(src_path).exists()); - } - - text_messages.messages.push(format!("Properly loaded {} cache entries.", hashmap_loaded_entries.len())); - - return Some(hashmap_loaded_entries); - } - None -} - -fn get_cache_file() -> String { - "cache_broken_files.bin".to_string() -} - fn check_extension_availability(file_name_lowercase: &str) -> TypeOfFile { if IMAGE_RS_BROKEN_FILES_EXTENSIONS.iter().any(|e| file_name_lowercase.ends_with(e)) { TypeOfFile::Image @@ -628,7 +549,7 @@ fn check_extension_availability(file_name_lowercase: &str) -> TypeOfFile { } } -fn check_extension_allowed(type_of_file: &TypeOfFile, checked_types: &CheckedTypes) -> bool { +fn check_if_file_extension_is_allowed(type_of_file: &TypeOfFile, checked_types: &CheckedTypes) -> bool { ((*type_of_file == TypeOfFile::Image) && ((*checked_types & CheckedTypes::IMAGE) == CheckedTypes::IMAGE)) || ((*type_of_file == TypeOfFile::PDF) && ((*checked_types & CheckedTypes::PDF) == CheckedTypes::PDF)) || ((*type_of_file == TypeOfFile::ArchiveZip) && ((*checked_types & CheckedTypes::ARCHIVE) == CheckedTypes::ARCHIVE)) diff --git a/czkawka_core/src/common_cache.rs b/czkawka_core/src/common_cache.rs new file mode 100644 index 0000000..b5268a7 --- /dev/null +++ b/czkawka_core/src/common_cache.rs @@ -0,0 +1,237 @@ +use crate::common; +use crate::common_messages::Messages; +use crate::common_traits::ResultEntry; +use crate::duplicate::HashType; +use crate::similar_images::{convert_algorithm_to_string, convert_filters_to_string}; +use image::imageops::FilterType; +use image_hasher::HashAlg; +use log::debug; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::io::{BufReader, BufWriter}; + +pub fn get_broken_files_cache_file() -> String { + "cache_broken_files_61.bin".to_string() +} + +pub fn get_similar_images_cache_file(hash_size: &u8, hash_alg: &HashAlg, image_filter: &FilterType) -> String { + format!( + "cache_similar_images_{}_{}_{}_61.bin", + hash_size, + convert_algorithm_to_string(hash_alg), + convert_filters_to_string(image_filter), + ) +} + +pub fn get_similar_videos_cache_file() -> String { + "cache_similar_videos_61.bin".to_string() +} +pub fn get_similar_music_cache_file(checking_tags: bool) -> &'static str { + if checking_tags { + "cache_same_music_tags_61.bin" + } else { + "cache_same_music_fingerprints_61.bin" + } +} + +pub fn get_duplicate_cache_file(type_of_hash: &HashType, is_prehash: bool) -> String { + let prehash_str = if is_prehash { "_prehash" } else { "" }; + format!("cache_duplicates_{type_of_hash:?}{prehash_str}_61.bin") +} + +pub fn save_cache_to_file_generalized(cache_file_name: &str, hashmap: &BTreeMap, save_also_as_json: bool, minimum_file_size: u64) -> Messages +where + T: Serialize + ResultEntry + Sized + Send + Sync, +{ + debug!("Saving cache to file {} (or also json alternative) - {} results", cache_file_name, hashmap.len()); + let mut text_messages = Messages::new(); + if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = + common::open_cache_folder(cache_file_name, true, save_also_as_json, &mut text_messages.warnings) + { + let hashmap_to_save = hashmap.values().filter(|t| t.get_size() >= minimum_file_size).collect::>(); + + { + let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here + if let Err(e) = bincode::serialize_into(writer, &hashmap_to_save) { + text_messages + .warnings + .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e)); + debug!("Failed to save cache to file {:?}", cache_file); + return text_messages; + } + debug!("Saved binary to file {:?}", cache_file); + } + if save_also_as_json { + if let Some(file_handler_json) = file_handler_json { + let writer = BufWriter::new(file_handler_json); + if let Err(e) = serde_json::to_writer(writer, &hashmap_to_save) { + text_messages + .warnings + .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e)); + debug!("Failed to save cache to file {:?}", cache_file_json); + return text_messages; + } + debug!("Saved json to file {:?}", cache_file_json); + } + } + + text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len())); + } else { + debug!("Failed to save cache to file {cache_file_name} because not exists"); + } + text_messages +} + +pub fn load_cache_from_file_generalized_by_path(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap) -> (Messages, Option>) +where + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, +{ + let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, used_files); + let Some(vec_loaded_entries) = vec_loaded_cache else { + return (text_messages, None); + }; + + debug!("Converting cache Vec into BTreeMap"); + let map_loaded_entries: BTreeMap = vec_loaded_entries + .into_iter() + .map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry)) + .collect(); + debug!("Converted cache Vec into BTreeMap"); + + (text_messages, Some(map_loaded_entries)) +} + +pub fn load_cache_from_file_generalized_by_size( + cache_file_name: &str, + delete_outdated_cache: bool, + cache_not_converted: &BTreeMap>, +) -> (Messages, Option>>) +where + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, +{ + debug!("Converting cache BtreeMap> into BTreeMap"); + let mut used_files: BTreeMap = Default::default(); + for file_entry in cache_not_converted.values().flatten() { + used_files.insert(file_entry.get_path().to_string_lossy().into_owned(), file_entry.clone()); + } + debug!("Converted cache BtreeMap> into BTreeMap"); + + let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, &used_files); + let Some(vec_loaded_entries) = vec_loaded_cache else { + return (text_messages, None); + }; + + debug!("Converting cache Vec into BTreeMap>"); + let mut map_loaded_entries: BTreeMap> = Default::default(); + for file_entry in vec_loaded_entries { + map_loaded_entries.entry(file_entry.get_size()).or_default().push(file_entry); + } + debug!("Converted cache Vec into BTreeMap>"); + + (text_messages, Some(map_loaded_entries)) +} + +pub fn load_cache_from_file_generalized_by_path_from_size( + cache_file_name: &str, + delete_outdated_cache: bool, + cache_not_converted: &BTreeMap>, +) -> (Messages, Option>) +where + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, +{ + debug!("Converting cache BtreeMap> into BTreeMap"); + let mut used_files: BTreeMap = Default::default(); + for file_entry in cache_not_converted.values().flatten() { + used_files.insert(file_entry.get_path().to_string_lossy().into_owned(), file_entry.clone()); + } + debug!("Converted cache BtreeMap> into BTreeMap"); + + let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, &used_files); + let Some(vec_loaded_entries) = vec_loaded_cache else { + return (text_messages, None); + }; + + debug!("Converting cache Vec into BTreeMap"); + let map_loaded_entries: BTreeMap = vec_loaded_entries + .into_iter() + .map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry)) + .collect(); + debug!("Converted cache Vec into BTreeMap"); + + (text_messages, Some(map_loaded_entries)) +} + +fn load_cache_from_file_generalized(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap) -> (Messages, Option>) +where + for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone, +{ + debug!("Loading cache from file {} (or json alternative)", cache_file_name); + let mut text_messages = Messages::new(); + + if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = common::open_cache_folder(cache_file_name, false, true, &mut text_messages.warnings) { + let mut vec_loaded_entries: Vec; + if let Some(file_handler) = file_handler { + let reader = BufReader::new(file_handler); + + vec_loaded_entries = match bincode::deserialize_from(reader) { + Ok(t) => t, + Err(e) => { + text_messages + .warnings + .push(format!("Failed to load data from cache file {}, reason {}", cache_file.display(), e)); + debug!("Failed to load cache from file {:?}", cache_file); + return (text_messages, None); + } + }; + } else { + let reader = BufReader::new(file_handler_json.unwrap()); // Unwrap cannot fail, because at least one file must be valid + vec_loaded_entries = match serde_json::from_reader(reader) { + Ok(t) => t, + Err(e) => { + text_messages + .warnings + .push(format!("Failed to load data from cache file {}, reason {}", cache_file_json.display(), e)); + debug!("Failed to load cache from file {:?}", cache_file); + return (text_messages, None); + } + }; + } + + // Don't load cache data if destination file not exists + debug!("Starting to removing outdated cache entries"); + let initial_number_of_entries = vec_loaded_entries.len(); + vec_loaded_entries = vec_loaded_entries + .into_par_iter() + .filter(|file_entry| { + if delete_outdated_cache && !file_entry.get_path().exists() { + return false; + } + + let file_entry_path_str = file_entry.get_path().to_string_lossy().to_string(); + if let Some(used_file) = used_files.get(&file_entry_path_str) { + if file_entry.get_size() != used_file.get_size() { + return false; + } + if file_entry.get_modified_date() != used_file.get_modified_date() { + return false; + } + } + + true + }) + .collect(); + debug!( + "Completed removing outdated cache entries, removed {} out of all {} entries", + initial_number_of_entries - vec_loaded_entries.len(), + initial_number_of_entries + ); + + text_messages.messages.push(format!("Properly loaded {} cache entries.", vec_loaded_entries.len())); + + debug!("Loaded cache from file {cache_file_name} (or json alternative) - {} results", vec_loaded_entries.len()); + return (text_messages, Some(vec_loaded_entries)); + } + debug!("Failed to load cache from file {cache_file_name} because not exists"); + (text_messages, None) +} diff --git a/czkawka_core/src/common_dir_traversal.rs b/czkawka_core/src/common_dir_traversal.rs index 9f7f5a6..0a3f661 100644 --- a/czkawka_core/src/common_dir_traversal.rs +++ b/czkawka_core/src/common_dir_traversal.rs @@ -8,6 +8,7 @@ use std::time::UNIX_EPOCH; use crossbeam_channel::Receiver; use futures::channel::mpsc::UnboundedSender; use rayon::prelude::*; +use serde::{Deserialize, Serialize}; use crate::common::{prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads}; use crate::common_directory::Directories; @@ -44,7 +45,7 @@ pub enum ToolType { None, } -#[derive(PartialEq, Eq, Clone, Debug, Copy, Default)] +#[derive(PartialEq, Eq, Clone, Debug, Copy, Default, Deserialize, Serialize)] pub enum CheckingMethod { #[default] None, @@ -56,7 +57,7 @@ pub enum CheckingMethod { AudioContent, } -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct FileEntry { pub path: PathBuf, pub size: u64, @@ -69,19 +70,25 @@ impl ResultEntry for FileEntry { fn get_path(&self) -> &Path { &self.path } + fn get_modified_date(&self) -> u64 { + self.modified_date + } + fn get_size(&self) -> u64 { + self.size + } } // Symlinks const MAX_NUMBER_OF_SYMLINK_JUMPS: i32 = 20; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct SymlinkInfo { pub destination_path: PathBuf, pub type_of_error: ErrorType, } -#[derive(Clone, Debug, PartialEq, Eq, Copy)] +#[derive(Clone, Debug, PartialEq, Eq, Copy, Deserialize, Serialize)] pub enum ErrorType { InfiniteRecursion, NonExistentFile, diff --git a/czkawka_core/src/common_messages.rs b/czkawka_core/src/common_messages.rs index 5feb1cc..0b45743 100644 --- a/czkawka_core/src/common_messages.rs +++ b/czkawka_core/src/common_messages.rs @@ -53,4 +53,11 @@ impl Messages { self.warnings.extend(warnings); self.errors.extend(errors); } + + pub fn extend_with_another_messages(&mut self, messages: Messages) { + let (messages, warnings, errors) = (messages.messages, messages.warnings, messages.errors); + self.messages.extend(messages); + self.warnings.extend(warnings); + self.errors.extend(errors); + } } diff --git a/czkawka_core/src/common_tool.rs b/czkawka_core/src/common_tool.rs index e2ae4b0..30ecaf9 100644 --- a/czkawka_core/src/common_tool.rs +++ b/czkawka_core/src/common_tool.rs @@ -103,6 +103,9 @@ pub trait CommonData { fn get_text_messages(&self) -> &Messages { &self.get_cd().text_messages } + fn get_text_messages_mut(&mut self) -> &mut Messages { + &mut self.get_cd_mut().text_messages + } fn set_save_also_as_json(&mut self, save_also_as_json: bool) { self.get_cd_mut().save_also_as_json = save_also_as_json; diff --git a/czkawka_core/src/common_traits.rs b/czkawka_core/src/common_traits.rs index 7c64b4c..60953ae 100644 --- a/czkawka_core/src/common_traits.rs +++ b/czkawka_core/src/common_traits.rs @@ -14,4 +14,6 @@ pub trait PrintResults { pub trait ResultEntry { fn get_path(&self) -> &Path; + fn get_modified_date(&self) -> u64; + fn get_size(&self) -> u64; } diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs index 4d6ad02..5ef7699 100644 --- a/czkawka_core/src/duplicate.rs +++ b/czkawka_core/src/duplicate.rs @@ -1,13 +1,12 @@ use std::collections::BTreeMap; -#[cfg(target_family = "unix")] use std::collections::HashSet; use std::fs::File; use std::hash::Hasher; use std::io::prelude::*; -use std::io::{self, BufReader, BufWriter, Error, ErrorKind}; +use std::io::{self, BufWriter, Error, ErrorKind}; #[cfg(target_family = "unix")] use std::os::unix::fs::MetadataExt; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::atomic::Ordering; use std::{fs, mem}; @@ -18,13 +17,12 @@ use log::{debug, info}; use rayon::prelude::*; use xxhash_rust::xxh3::Xxh3; -use crate::common::{open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads}; +use crate::common::{prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads}; +use crate::common_cache::{get_duplicate_cache_file, load_cache_from_file_generalized_by_size, save_cache_to_file_generalized}; use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; use crate::common_messages::Messages; use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::*; -use crate::flc; -use crate::localizer_core::generate_translation_hashmap; const TEMP_HARDLINK_FILE: &str = "rzeczek.rxrxrxl"; @@ -480,7 +478,13 @@ impl DuplicateFinder { } DirTraversalResult::Stopped => false, }; - debug!("check_file_size - after calculating size stats/duplicates"); + debug!( + "check_file_size - after calculating size stats/duplicates, found in {} groups, {} files with same size | referenced {} groups, {} files", + self.files_with_identical_size.len(), + self.files_with_identical_size.values().map(Vec::len).sum::(), + self.files_with_identical_size_referenced.len(), + self.files_with_identical_size_referenced.values().map(|(_fe, vec)| vec.len()).sum::() + ); res } @@ -535,34 +539,39 @@ impl DuplicateFinder { if self.use_prehash_cache { debug!("prehash_load_cache_at_start - using prehash cache start"); - loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, &self.hash_type, true) { - Some(t) => t, - None => Default::default(), - }; - let mut loaded_hash_map2: BTreeMap = Default::default(); - for vec_file_entry in loaded_hash_map.values() { - for file_entry in vec_file_entry { - loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone()); - } - } + let (messages, loaded_items) = load_cache_from_file_generalized_by_size::( + &get_duplicate_cache_file(&self.hash_type, true), + self.get_delete_outdated_cache(), + &self.files_with_identical_size, + ); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); - #[allow(clippy::if_same_then_else)] - for vec_file_entry in self.files_with_identical_size.values() { - for file_entry in vec_file_entry { - let name = file_entry.path.to_string_lossy().to_string(); - if !loaded_hash_map2.contains_key(&name) { - // If loaded data doesn't contains current image info - non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry.clone()); - } else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date { - // When size or modification date of image changed, then it is clear that is different image - non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry.clone()); - } else { - // Checking may be omitted when already there is entry with same size and modification date - records_already_cached.entry(file_entry.size).or_default().push(file_entry.clone()); + debug!("prehash_load_cache_at_start - started diff between loaded and prechecked files"); + for (size, mut vec_file_entry) in mem::take(&mut self.files_with_identical_size) { + if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) { + // TODO maybe hashset is not needed when using < 4 elements + let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::>(); + for file_entry in vec_file_entry { + if cached_path_entries.contains(&file_entry.path) { + records_already_cached.entry(size).or_default().push(file_entry); + } else { + non_cached_files_to_check.entry(size).or_default().push(file_entry); + } } + } else { + non_cached_files_to_check.entry(size).or_default().append(&mut vec_file_entry); } } + + debug!( + "prehash_load_cache_at_start - completed diff between loaded and prechecked files, {}({}) - non cached, {}({}) - already cached", + non_cached_files_to_check.values().map(Vec::len).sum::(), + format_size(non_cached_files_to_check.values().map(|v| v.iter().map(|e| e.size).sum::()).sum::(), BINARY), + records_already_cached.values().map(Vec::len).sum::(), + format_size(records_already_cached.values().map(|v| v.iter().map(|e| e.size).sum::()).sum::(), BINARY), + ); } else { debug!("prehash_load_cache_at_start - not using prehash cache start"); loaded_hash_map = Default::default(); @@ -596,13 +605,14 @@ impl DuplicateFinder { } } - save_hashes_to_file( + let messages = save_cache_to_file_generalized( + &get_duplicate_cache_file(&self.hash_type, true), &save_cache_to_hashmap, - &mut self.common_data.text_messages, - &self.hash_type, - true, + self.common_data.save_also_as_json, self.minimal_prehash_cache_file_size, ); + self.get_text_messages_mut().extend_with_another_messages(messages); + debug!("prehash_save_cache_at_exit - saving prehash cache end"); } } @@ -691,35 +701,35 @@ impl DuplicateFinder { if self.common_data.use_cache { debug!("full_hashing_load_cache_at_start - using cache"); - loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, &self.hash_type, false) { - Some(t) => t, - None => Default::default(), - }; - - for (size, vec_file_entry) in pre_checked_map { - #[allow(clippy::collapsible_if)] - if !loaded_hash_map.contains_key(&size) { - // If loaded data doesn't contains current info - non_cached_files_to_check.insert(size, vec_file_entry); - } else { - let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap(); + let (messages, loaded_items) = + load_cache_from_file_generalized_by_size::(&get_duplicate_cache_file(&self.hash_type, false), self.get_delete_outdated_cache(), &pre_checked_map); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); + debug!("full_hashing_load_cache_at_start - started diff between loaded and prechecked files"); + for (size, mut vec_file_entry) in pre_checked_map { + if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) { + // TODO maybe hashset is not needed when using < 4 elements + let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::>(); for file_entry in vec_file_entry { - let mut found: bool = false; - for loaded_file_entry in loaded_vec_file_entry { - if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date { - records_already_cached.entry(file_entry.size).or_default().push(loaded_file_entry.clone()); - found = true; - break; - } - } - - if !found { - non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry); + if cached_path_entries.contains(&file_entry.path) { + records_already_cached.entry(size).or_default().push(file_entry); + } else { + non_cached_files_to_check.entry(size).or_default().push(file_entry); } } + } else { + non_cached_files_to_check.entry(size).or_default().append(&mut vec_file_entry); } } + + debug!( + "full_hashing_load_cache_at_start - completed diff between loaded and prechecked files - {}({}) non cached, {}({}) already cached", + non_cached_files_to_check.len(), + format_size(non_cached_files_to_check.values().map(|v| v.iter().map(|e| e.size).sum::()).sum::(), BINARY), + records_already_cached.len(), + format_size(records_already_cached.values().map(|v| v.iter().map(|e| e.size).sum::()).sum::(), BINARY), + ); } else { debug!("full_hashing_load_cache_at_start - not using cache"); loaded_hash_map = Default::default(); @@ -771,7 +781,15 @@ impl DuplicateFinder { } } } - save_hashes_to_file(&all_results, &mut self.common_data.text_messages, &self.hash_type, false, self.minimal_cache_file_size); + + let messages = save_cache_to_file_generalized( + &get_duplicate_cache_file(&self.hash_type, false), + &all_results, + self.common_data.save_also_as_json, + self.minimal_cache_file_size, + ); + self.get_text_messages_mut().extend_with_another_messages(messages); + debug!("full_hashing_save_cache_at_exit - end"); } @@ -1318,112 +1336,6 @@ pub fn make_hard_link(src: &Path, dst: &Path) -> io::Result<()> { result } -pub fn save_hashes_to_file(hashmap: &BTreeMap, text_messages: &mut Messages, type_of_hash: &HashType, is_prehash: bool, minimal_cache_file_size: u64) { - if let Some(((file_handler, cache_file), (_json_file, _json_name))) = open_cache_folder(&get_file_hash_name(type_of_hash, is_prehash), true, false, &mut text_messages.warnings) - { - let mut writer = BufWriter::new(file_handler.unwrap()); // Unwrap cannot fail - - let mut how_much = 0; - for file_entry in hashmap.values() { - if file_entry.size >= minimal_cache_file_size { - let string: String = format!("{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.modified_date, file_entry.hash); - - if let Err(e) = writeln!(writer, "{string}") { - text_messages - .warnings - .push(format!("Failed to save some data to cache file {}, reason {}", cache_file.display(), e)); - return; - } - how_much += 1; - } - } - - text_messages - .messages - .push(flc!("core_saving_to_cache", generate_translation_hashmap(vec![("number", how_much.to_string())]))); - } -} - -pub fn load_hashes_from_file(text_messages: &mut Messages, delete_outdated_cache: bool, type_of_hash: &HashType, is_prehash: bool) -> Option>> { - if let Some(((file_handler, cache_file), (_json_file, _json_name))) = - open_cache_folder(&get_file_hash_name(type_of_hash, is_prehash), false, false, &mut text_messages.warnings) - { - // Unwrap could fail when failed to open cache file, but json would exists - let Some(file_handler) = file_handler else { - return Default::default(); - }; - let reader = BufReader::new(file_handler); - - let mut hashmap_loaded_entries: BTreeMap> = Default::default(); - - // Read the file line by line using the lines() iterator from std::io::BufRead. - for (index, line) in reader.lines().enumerate() { - let line = match line { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load line number {} from cache file {}, reason {}", index + 1, cache_file.display(), e)); - return None; - } - }; - let uuu = line.split("//").collect::>(); - if uuu.len() != 4 { - text_messages.warnings.push(format!( - "Found invalid data(too much or too low amount of data) in line {} - ({}) in cache file {}", - index + 1, - line, - cache_file.display() - )); - continue; - } - // Don't load cache data if destination file not exists - if !delete_outdated_cache || Path::new(uuu[0]).exists() { - let file_entry = FileEntry { - path: PathBuf::from(uuu[0]), - size: match uuu[1].parse::() { - Ok(t) => t, - Err(e) => { - text_messages.warnings.push(format!( - "Found invalid size value in line {} - ({}) in cache file {}, reason {}", - index + 1, - line, - cache_file.display(), - e - )); - continue; - } - }, - modified_date: match uuu[2].parse::() { - Ok(t) => t, - Err(e) => { - text_messages.warnings.push(format!( - "Found invalid modified date value in line {} - ({}) in cache file {}, reason {}", - index + 1, - line, - cache_file.display(), - e - )); - continue; - } - }, - hash: uuu[3].to_string(), - symlink_info: None, - }; - hashmap_loaded_entries.entry(file_entry.size).or_default().push(file_entry); - } - } - - text_messages.messages.push(flc!( - "core_loading_from_cache", - generate_translation_hashmap(vec![("number", hashmap_loaded_entries.values().map(std::vec::Vec::len).sum::().to_string())]) - )); - - return Some(hashmap_loaded_entries); - } - None -} - pub trait MyHasher { fn update(&mut self, bytes: &[u8]); fn finalize(&self) -> String; @@ -1453,11 +1365,6 @@ fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashT Ok(hasher.finalize()) } -fn get_file_hash_name(type_of_hash: &HashType, is_prehash: bool) -> String { - let prehash_str = if is_prehash { "_prehash" } else { "" }; - format!("cache_duplicates_{type_of_hash:?}{prehash_str}.txt") -} - impl MyHasher for blake3::Hasher { fn update(&mut self, bytes: &[u8]) { self.update(bytes); @@ -1502,6 +1409,7 @@ mod tests { use std::os::fs::MetadataExt; #[cfg(target_family = "unix")] use std::os::unix::fs::MetadataExt; + use std::path::PathBuf; use super::*; diff --git a/czkawka_core/src/lib.rs b/czkawka_core/src/lib.rs index fdb4897..947b785 100644 --- a/czkawka_core/src/lib.rs +++ b/czkawka_core/src/lib.rs @@ -19,6 +19,7 @@ pub mod temporary; pub mod bad_extensions; pub mod common; +pub mod common_cache; pub mod common_dir_traversal; pub mod common_directory; pub mod common_extensions; diff --git a/czkawka_core/src/same_music.rs b/czkawka_core/src/same_music.rs index 31c9822..ac2a7b8 100644 --- a/czkawka_core/src/same_music.rs +++ b/czkawka_core/src/same_music.rs @@ -1,8 +1,8 @@ use std::cmp::max; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use std::io::prelude::*; -use std::io::{BufReader, BufWriter}; +use std::io::BufWriter; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -23,11 +23,9 @@ use symphonia::core::io::MediaSourceStream; use symphonia::core::meta::MetadataOptions; use symphonia::core::probe::Hint; -use crate::common::{ - create_crash_message, filter_reference_folders_generic, open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, AUDIO_FILES_EXTENSIONS, -}; +use crate::common::{create_crash_message, filter_reference_folders_generic, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, AUDIO_FILES_EXTENSIONS}; +use crate::common_cache::{get_similar_music_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized}; use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; -use crate::common_messages::Messages; use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::*; @@ -71,6 +69,12 @@ impl ResultEntry for MusicEntry { fn get_path(&self) -> &Path { &self.path } + fn get_modified_date(&self) -> u64 { + self.modified_date + } + fn get_size(&self) -> u64 { + self.size + } } impl FileEntry { @@ -102,7 +106,7 @@ pub struct Info { pub struct SameMusic { common_data: CommonToolData, information: Info, - music_to_check: HashMap, + music_to_check: BTreeMap, music_entries: Vec, duplicated_music_entries: Vec>, duplicated_music_entries_referenced: Vec<(MusicEntry, Vec)>, @@ -221,32 +225,24 @@ impl SameMusic { } } - fn load_cache(&mut self, checking_tags: bool) -> (HashMap, HashMap, HashMap) { + fn load_cache(&mut self, checking_tags: bool) -> (BTreeMap, BTreeMap, BTreeMap) { debug!("load_cache - start, using cache {}", self.common_data.use_cache); let loaded_hash_map; - let mut records_already_cached: HashMap = Default::default(); - let mut non_cached_files_to_check: HashMap = Default::default(); + let mut records_already_cached: BTreeMap = Default::default(); + let mut non_cached_files_to_check: BTreeMap = Default::default(); if self.common_data.use_cache { - loaded_hash_map = match load_cache_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, checking_tags) { - Some(t) => t, - None => Default::default(), - }; + let (messages, loaded_items) = + load_cache_from_file_generalized_by_path::(get_similar_music_cache_file(checking_tags), self.get_delete_outdated_cache(), &self.music_to_check); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); - for (name, file_entry) in &self.music_to_check { - if !loaded_hash_map.contains_key(name) { - // If loaded data doesn't contains current image info - non_cached_files_to_check.insert(name.clone(), file_entry.clone()); + for (name, file_entry) in mem::take(&mut self.music_to_check) { + if let Some(cached_file_entry) = loaded_hash_map.get(&name) { + records_already_cached.insert(name.clone(), cached_file_entry.clone()); } else { - let loaded_item = loaded_hash_map.get(name).unwrap(); - if file_entry.size != loaded_item.size || file_entry.modified_date != loaded_item.modified_date { - // When size or modification date of image changed, then it is clear that is different image - non_cached_files_to_check.insert(name.clone(), file_entry.clone()); - } else { - // Checking may be omitted when already there is entry with same size and modification date - records_already_cached.insert(name.clone(), loaded_item.clone()); - } + non_cached_files_to_check.insert(name, file_entry); } } } else { @@ -257,18 +253,20 @@ impl SameMusic { (loaded_hash_map, records_already_cached, non_cached_files_to_check) } - fn save_cache(&mut self, vec_file_entry: Vec, loaded_hash_map: HashMap, checking_tags: bool) { + fn save_cache(&mut self, vec_file_entry: Vec, loaded_hash_map: BTreeMap, checking_tags: bool) { debug!("save_cache - start, using cache {}", self.common_data.use_cache); if !self.common_data.use_cache { return; } // Must save all results to file, old loaded from file with all currently counted results - let mut all_results: HashMap = loaded_hash_map; + let mut all_results: BTreeMap = loaded_hash_map; for file_entry in vec_file_entry { all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); } - save_cache_to_file(&all_results, &mut self.common_data.text_messages, self.common_data.save_also_as_json, checking_tags); + + let messages = save_cache_to_file_generalized(get_similar_music_cache_file(checking_tags), &all_results, self.common_data.save_also_as_json, 0); + self.get_text_messages_mut().extend_with_another_messages(messages); debug!("save_cache - end"); } @@ -745,74 +743,6 @@ impl SameMusic { } } -fn save_cache_to_file(hashmap: &HashMap, text_messages: &mut Messages, save_also_as_json: bool, checking_tags: bool) { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = - open_cache_folder(get_cache_file(checking_tags), true, save_also_as_json, &mut text_messages.warnings) - { - { - let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here - if let Err(e) = bincode::serialize_into(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e)); - return; - } - } - if save_also_as_json { - if let Some(file_handler_json) = file_handler_json { - let writer = BufWriter::new(file_handler_json); - if let Err(e) = serde_json::to_writer(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e)); - return; - } - } - } - - text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len())); - } -} - -fn load_cache_from_file(text_messages: &mut Messages, delete_outdated_cache: bool, checking_tags: bool) -> Option> { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = open_cache_folder(get_cache_file(checking_tags), false, true, &mut text_messages.warnings) { - let mut hashmap_loaded_entries: HashMap; - if let Some(file_handler) = file_handler { - let reader = BufReader::new(file_handler); - hashmap_loaded_entries = match bincode::deserialize_from(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file.display(), e)); - return None; - } - }; - } else { - let reader = BufReader::new(file_handler_json.unwrap()); // Unwrap cannot fail, because at least one file must be valid - hashmap_loaded_entries = match serde_json::from_reader(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file_json.display(), e)); - return None; - } - }; - } - - // Don't load cache data if destination file not exists - if delete_outdated_cache { - hashmap_loaded_entries.retain(|src_path, _file_entry| Path::new(src_path).exists()); - } - - text_messages.messages.push(format!("Properly loaded {} cache entries.", hashmap_loaded_entries.len())); - - return Some(hashmap_loaded_entries); - } - None -} - // TODO this should be taken from rusty-chromaprint repo, not reimplemented here fn calc_fingerprint_helper(path: impl AsRef, config: &Configuration) -> anyhow::Result> { let path = path.as_ref(); @@ -976,15 +906,6 @@ fn read_single_file_tag(path: &str, music_entry: &mut MusicEntry) -> bool { true } -// Using different cache folders, because loading cache just for finding duplicated tags would be really slow -fn get_cache_file(checking_tags: bool) -> &'static str { - if checking_tags { - "cache_same_music_tags.bin" - } else { - "cache_same_music_fingerprints.bin" - } -} - impl Default for SameMusic { fn default() -> Self { Self::new() diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index 341a7ea..1394d32 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::{DirEntry, File, Metadata}; use std::io::{Write, *}; use std::path::{Path, PathBuf}; @@ -19,11 +19,11 @@ use serde::{Deserialize, Serialize}; #[cfg(feature = "heif")] use crate::common::get_dynamic_image_from_heic; use crate::common::{ - check_folder_children, create_crash_message, get_dynamic_image_from_raw_image, open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, - HEIC_EXTENSIONS, IMAGE_RS_SIMILAR_IMAGES_EXTENSIONS, RAW_IMAGE_EXTENSIONS, + check_folder_children, create_crash_message, get_dynamic_image_from_raw_image, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, HEIC_EXTENSIONS, + IMAGE_RS_SIMILAR_IMAGES_EXTENSIONS, RAW_IMAGE_EXTENSIONS, }; +use crate::common_cache::{get_similar_images_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized}; use crate::common_dir_traversal::{common_get_entry_data_metadata, common_read_dir, get_lowercase_name, get_modified_time, CheckingMethod, ProgressData, ToolType}; -use crate::common_messages::Messages; use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::{DebugPrint, PrintResults, ResultEntry, SaveResults}; use crate::flc; @@ -51,6 +51,12 @@ impl ResultEntry for FileEntry { fn get_path(&self) -> &Path { &self.path } + fn get_modified_date(&self) -> u64 { + self.modified_date + } + fn get_size(&self) -> u64 { + self.size + } } /// Used by CLI tool when we cannot use directly values @@ -89,7 +95,7 @@ pub struct SimilarImages { image_hashes: HashMap>, // Hashmap with image hashes and Vector with names of files similarity: u32, - images_to_check: HashMap, + images_to_check: BTreeMap, hash_size: u8, hash_alg: HashAlg, image_filter: FilterType, @@ -273,38 +279,27 @@ impl SimilarImages { } } - fn hash_images_load_cache(&mut self) -> (HashMap, HashMap, HashMap) { + fn hash_images_load_cache(&mut self) -> (BTreeMap, BTreeMap, BTreeMap) { debug!("hash_images_load_cache - start, use cache: {}", self.common_data.use_cache); let loaded_hash_map; - let mut records_already_cached: HashMap = Default::default(); - let mut non_cached_files_to_check: HashMap = Default::default(); + let mut records_already_cached: BTreeMap = Default::default(); + let mut non_cached_files_to_check: BTreeMap = Default::default(); if self.common_data.use_cache { - loaded_hash_map = match load_hashes_from_file( - &mut self.common_data.text_messages, - self.common_data.delete_outdated_cache, - self.hash_size, - self.hash_alg, - self.image_filter, - ) { - Some(t) => t, - None => Default::default(), - }; + let (messages, loaded_items) = load_cache_from_file_generalized_by_path::( + &get_similar_images_cache_file(&self.hash_size, &self.hash_alg, &self.image_filter), + self.get_delete_outdated_cache(), + &self.images_to_check, + ); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); - for (name, file_entry) in &self.images_to_check { - if !loaded_hash_map.contains_key(name) { - // If loaded data doesn't contains current image info - non_cached_files_to_check.insert(name.clone(), file_entry.clone()); + for (name, file_entry) in mem::take(&mut self.images_to_check) { + if let Some(cached_file_entry) = loaded_hash_map.get(&name) { + records_already_cached.insert(name.clone(), cached_file_entry.clone()); } else { - let loaded_item = loaded_hash_map.get(name).unwrap(); - if file_entry.size != loaded_item.size || file_entry.modified_date != loaded_item.modified_date { - // When size or modification date of image changed, then it is clear that is different image - non_cached_files_to_check.insert(name.clone(), file_entry.clone()); - } else { - // Checking may be omitted when already there is entry with same size and modification date - records_already_cached.insert(name.clone(), loaded_item.clone()); - } + non_cached_files_to_check.insert(name, file_entry); } } } else { @@ -373,22 +368,22 @@ impl SimilarImages { true } - fn save_to_cache(&mut self, vec_file_entry: Vec<(FileEntry, ImHash)>, loaded_hash_map: HashMap) { + fn save_to_cache(&mut self, vec_file_entry: Vec<(FileEntry, ImHash)>, loaded_hash_map: BTreeMap) { debug!("save_to_cache - start, using cache: {}", self.common_data.use_cache); if self.common_data.use_cache { // Must save all results to file, old loaded from file with all currently counted results - let mut all_results: HashMap = loaded_hash_map; + let mut all_results: BTreeMap = loaded_hash_map; for (file_entry, _hash) in vec_file_entry { all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); } - save_hashes_to_file( + + let messages = save_cache_to_file_generalized( + &get_similar_images_cache_file(&self.hash_size, &self.hash_alg, &self.image_filter), &all_results, - &mut self.common_data.text_messages, self.common_data.save_also_as_json, - self.hash_size, - self.hash_alg, - self.image_filter, + 0, ); + self.get_text_messages_mut().extend_with_another_messages(messages); } debug!("save_to_cache - end"); } @@ -945,98 +940,6 @@ impl PrintResults for SimilarImages { } } -pub fn save_hashes_to_file( - hashmap: &HashMap, - text_messages: &mut Messages, - save_also_as_json: bool, - hash_size: u8, - hash_alg: HashAlg, - image_filter: FilterType, -) { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = - open_cache_folder(&get_cache_file(&hash_size, &hash_alg, &image_filter), true, save_also_as_json, &mut text_messages.warnings) - { - { - let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here - if let Err(e) = bincode::serialize_into(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e)); - return; - } - } - if save_also_as_json { - if let Some(file_handler_json) = file_handler_json { - let writer = BufWriter::new(file_handler_json); - if let Err(e) = serde_json::to_writer(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e)); - return; - } - } - } - - text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len())); - } -} - -pub fn load_hashes_from_file( - text_messages: &mut Messages, - delete_outdated_cache: bool, - hash_size: u8, - hash_alg: HashAlg, - image_filter: FilterType, -) -> Option> { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = - open_cache_folder(&get_cache_file(&hash_size, &hash_alg, &image_filter), false, true, &mut text_messages.warnings) - { - let mut hashmap_loaded_entries: HashMap; - if let Some(file_handler) = file_handler { - let reader = BufReader::new(file_handler); - hashmap_loaded_entries = match bincode::deserialize_from(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file.display(), e)); - return None; - } - }; - } else { - let reader = BufReader::new(file_handler_json.unwrap()); // Unwrap cannot fail, because at least one file must be valid - hashmap_loaded_entries = match serde_json::from_reader(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file_json.display(), e)); - return None; - } - }; - } - - // Don't load cache data if destination file not exists - if delete_outdated_cache { - hashmap_loaded_entries.retain(|src_path, _file_entry| Path::new(src_path).exists()); - } - - text_messages.messages.push(format!("Properly loaded {} cache entries.", hashmap_loaded_entries.len())); - - return Some(hashmap_loaded_entries); - } - None -} - -fn get_cache_file(hash_size: &u8, hash_alg: &HashAlg, image_filter: &FilterType) -> String { - format!( - "cache_similar_images_{}_{}_{}_50.bin", - hash_size, - convert_algorithm_to_string(hash_alg), - convert_filters_to_string(image_filter), - ) -} - pub fn get_string_from_similarity(similarity: &u32, hash_size: u8) -> String { let index_preset = match hash_size { 8 => 0, @@ -1085,7 +988,7 @@ pub fn return_similarity_from_similarity_preset(similarity_preset: &SimilarityPr } } -fn convert_filters_to_string(image_filter: &FilterType) -> String { +pub fn convert_filters_to_string(image_filter: &FilterType) -> String { match image_filter { FilterType::Lanczos3 => "Lanczos3", FilterType::Nearest => "Nearest", @@ -1096,7 +999,7 @@ fn convert_filters_to_string(image_filter: &FilterType) -> String { .to_string() } -fn convert_algorithm_to_string(hash_alg: &HashAlg) -> String { +pub fn convert_algorithm_to_string(hash_alg: &HashAlg) -> String { match hash_alg { HashAlg::Mean => "Mean", HashAlg::Gradient => "Gradient", diff --git a/czkawka_core/src/similar_videos.rs b/czkawka_core/src/similar_videos.rs index 8c7483d..589e035 100644 --- a/czkawka_core/src/similar_videos.rs +++ b/czkawka_core/src/similar_videos.rs @@ -15,9 +15,9 @@ use serde::{Deserialize, Serialize}; use vid_dup_finder_lib::HashCreationErrorKind::DetermineVideo; use vid_dup_finder_lib::{NormalizedTolerance, VideoHash}; -use crate::common::{check_folder_children, open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, VIDEO_FILES_EXTENSIONS}; +use crate::common::{check_folder_children, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, VIDEO_FILES_EXTENSIONS}; +use crate::common_cache::{get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized}; use crate::common_dir_traversal::{common_get_entry_data_metadata, common_read_dir, get_lowercase_name, get_modified_time, CheckingMethod, ProgressData, ToolType}; -use crate::common_messages::Messages; use crate::common_tool::{CommonData, CommonToolData}; use crate::common_traits::{DebugPrint, PrintResults, ResultEntry, SaveResults}; use crate::flc; @@ -38,6 +38,12 @@ impl ResultEntry for FileEntry { fn get_path(&self) -> &Path { &self.path } + fn get_modified_date(&self) -> u64 { + self.modified_date + } + fn get_size(&self) -> u64 { + self.size + } } /// Distance metric to use with the BK-tree. @@ -259,24 +265,16 @@ impl SimilarVideos { let mut non_cached_files_to_check: BTreeMap = Default::default(); if self.common_data.use_cache { - loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache) { - Some(t) => t, - None => Default::default(), - }; + let (messages, loaded_items) = + load_cache_from_file_generalized_by_path::(&get_similar_videos_cache_file(), self.get_delete_outdated_cache(), &self.videos_to_check); + self.get_text_messages_mut().extend_with_another_messages(messages); + loaded_hash_map = loaded_items.unwrap_or_default(); - for (name, file_entry) in &self.videos_to_check { - if !loaded_hash_map.contains_key(name) { - // If loaded data doesn't contains current videos info - non_cached_files_to_check.insert(name.clone(), file_entry.clone()); + for (name, file_entry) in mem::take(&mut self.videos_to_check) { + if let Some(cached_file_entry) = loaded_hash_map.get(&name) { + records_already_cached.insert(name.clone(), cached_file_entry.clone()); } else { - let loaded_item = loaded_hash_map.get(name).unwrap(); - if file_entry.size != loaded_item.size || file_entry.modified_date != loaded_item.modified_date { - // When size or modification date of video changed, then it is clear that is different video - non_cached_files_to_check.insert(name.clone(), file_entry.clone()); - } else { - // Checking may be omitted when already there is entry with same size and modification date - records_already_cached.insert(name.clone(), loaded_item.clone()); - } + non_cached_files_to_check.insert(name, file_entry); } } } else { @@ -375,7 +373,9 @@ impl SimilarVideos { for file_entry in vec_file_entry { all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); } - save_hashes_to_file(&all_results, &mut self.common_data.text_messages, self.common_data.save_also_as_json); + + let messages = save_cache_to_file_generalized(&get_similar_videos_cache_file(), &all_results, self.common_data.save_also_as_json, 0); + self.get_text_messages_mut().extend_with_another_messages(messages); } debug!("save_cache - end"); } @@ -512,76 +512,6 @@ impl PrintResults for SimilarVideos { } } -pub fn save_hashes_to_file(hashmap: &BTreeMap, text_messages: &mut Messages, save_also_as_json: bool) { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = open_cache_folder(&get_cache_file(), true, save_also_as_json, &mut text_messages.warnings) { - { - let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here - if let Err(e) = bincode::serialize_into(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e)); - return; - } - } - if save_also_as_json { - if let Some(file_handler_json) = file_handler_json { - let writer = BufWriter::new(file_handler_json); - if let Err(e) = serde_json::to_writer(writer, hashmap) { - text_messages - .warnings - .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e)); - return; - } - } - } - - text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len())); - } -} - -pub fn load_hashes_from_file(text_messages: &mut Messages, delete_outdated_cache: bool) -> Option> { - if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) = open_cache_folder(&get_cache_file(), false, true, &mut text_messages.warnings) { - let mut hashmap_loaded_entries: BTreeMap; - if let Some(file_handler) = file_handler { - let reader = BufReader::new(file_handler); - hashmap_loaded_entries = match bincode::deserialize_from(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file.display(), e)); - return None; - } - }; - } else { - let reader = BufReader::new(file_handler_json.unwrap()); // Unwrap cannot fail, because at least one file must be valid - hashmap_loaded_entries = match serde_json::from_reader(reader) { - Ok(t) => t, - Err(e) => { - text_messages - .warnings - .push(format!("Failed to load data from cache file {}, reason {}", cache_file_json.display(), e)); - return None; - } - }; - } - - // Don't load cache data if destination file not exists - if delete_outdated_cache { - hashmap_loaded_entries.retain(|src_path, _file_entry| Path::new(src_path).exists()); - } - - text_messages.messages.push(format!("Properly loaded {} cache entries.", hashmap_loaded_entries.len())); - - return Some(hashmap_loaded_entries); - } - None -} - -fn get_cache_file() -> String { - "cache_similar_videos.bin".to_string() -} - pub fn check_if_ffmpeg_is_installed() -> bool { let vid = "9999czekoczekoczekolada999.txt"; if let Err(DetermineVideo { diff --git a/czkawka_gui/src/connect_things/connect_button_search.rs b/czkawka_gui/src/connect_things/connect_button_search.rs index b213473..f09ad34 100644 --- a/czkawka_gui/src/connect_things/connect_button_search.rs +++ b/czkawka_gui/src/connect_things/connect_button_search.rs @@ -330,6 +330,7 @@ fn duplicate_search( df.set_minimal_prehash_cache_file_size(minimal_prehash_cache_file_size); df.set_check_method(check_method); df.set_hash_type(hash_type); + df.set_save_also_as_json(loaded_common_items.save_also_as_json); df.set_ignore_hard_links(loaded_common_items.hide_hard_links); df.set_use_cache(loaded_common_items.use_cache); df.set_use_prehash_cache(use_prehash_cache); diff --git a/czkawka_gui/src/connect_things/connect_settings.rs b/czkawka_gui/src/connect_things/connect_settings.rs index a220424..1f3a827 100644 --- a/czkawka_gui/src/connect_things/connect_settings.rs +++ b/czkawka_gui/src/connect_things/connect_settings.rs @@ -1,6 +1,10 @@ use std::collections::BTreeMap; use std::default::Default; +use czkawka_core::common_cache::{ + get_duplicate_cache_file, get_similar_images_cache_file, get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, load_cache_from_file_generalized_by_size, + save_cache_to_file_generalized, +}; use directories_next::ProjectDirs; use gtk4::prelude::*; use gtk4::{Label, ResponseType, Window}; @@ -121,20 +125,25 @@ pub fn connect_settings(gui_data: &GuiData) { let mut messages: Messages = Messages::new(); for use_prehash in [true, false] { for type_of_hash in &[HashType::Xxh3, HashType::Blake3, HashType::Crc32] { - if let Some(cache_entries) = czkawka_core::duplicate::load_hashes_from_file(&mut messages, true, type_of_hash, use_prehash) { + let (mut messages, loaded_items) = load_cache_from_file_generalized_by_size::( + &get_duplicate_cache_file(type_of_hash, use_prehash), + true, + &Default::default(), + ); + + if let Some(cache_entries) = loaded_items { let mut hashmap_to_save: BTreeMap = Default::default(); for (_, vec_file_entry) in cache_entries { for file_entry in vec_file_entry { hashmap_to_save.insert(file_entry.path.to_string_lossy().to_string(), file_entry); } } - czkawka_core::duplicate::save_hashes_to_file( - &hashmap_to_save, - &mut messages, - type_of_hash, - use_prehash, - entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(2 * 1024 * 1024), - ); + + let minimal_cache_size = entry_settings_cache_file_minimal_size.text().as_str().parse::().unwrap_or(2 * 1024 * 1024); + + let save_messages = + save_cache_to_file_generalized(&get_duplicate_cache_file(type_of_hash, use_prehash), &hashmap_to_save, false, minimal_cache_size); + messages.extend_with_another_messages(save_messages); } } @@ -169,8 +178,16 @@ pub fn connect_settings(gui_data: &GuiData) { FilterType::Triangle, ] { for hash_alg in &[HashAlg::Blockhash, HashAlg::Gradient, HashAlg::DoubleGradient, HashAlg::VertGradient, HashAlg::Mean] { - if let Some(cache_entries) = czkawka_core::similar_images::load_hashes_from_file(&mut messages, true, *hash_size, *hash_alg, *image_filter) { - czkawka_core::similar_images::save_hashes_to_file(&cache_entries, &mut messages, false, *hash_size, *hash_alg, *image_filter); + let (mut messages, loaded_items) = load_cache_from_file_generalized_by_path::( + &get_similar_images_cache_file(hash_size, hash_alg, image_filter), + true, + &Default::default(), + ); + + if let Some(cache_entries) = loaded_items { + let save_messages = + save_cache_to_file_generalized(&get_similar_images_cache_file(hash_size, hash_alg, image_filter), &cache_entries, false, 0); + messages.extend_with_another_messages(save_messages); } } } @@ -196,9 +213,12 @@ pub fn connect_settings(gui_data: &GuiData) { dialog.connect_response(move |dialog, response_type| { if response_type == ResponseType::Ok { - let mut messages: Messages = Messages::new(); - if let Some(cache_entries) = czkawka_core::similar_videos::load_hashes_from_file(&mut messages, true) { - czkawka_core::similar_videos::save_hashes_to_file(&cache_entries, &mut messages, false); + let (mut messages, loaded_items) = + load_cache_from_file_generalized_by_path::(&get_similar_videos_cache_file(), true, &Default::default()); + + if let Some(cache_entries) = loaded_items { + let save_messages = save_cache_to_file_generalized(&get_similar_videos_cache_file(), &cache_entries, false, 0); + messages.extend_with_another_messages(save_messages); } messages.messages.push(flg!("cache_properly_cleared"));