From 6e89bcb507f440a8727761db398261c9c33bc493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com> Date: Wed, 13 Jan 2021 16:03:05 +0100 Subject: [PATCH] Add cache for broken files (#204) --- czkawka_cli/src/commands.rs | 2 +- czkawka_core/src/broken_files.rs | 234 ++++++++++++++++++++++---- czkawka_core/src/common_extensions.rs | 2 +- czkawka_core/src/similar_images.rs | 4 +- instructions/Instruction.md | 9 +- 5 files changed, 211 insertions(+), 40 deletions(-) diff --git a/czkawka_cli/src/commands.rs b/czkawka_cli/src/commands.rs index e83a1bf..3e3f74f 100644 --- a/czkawka_cli/src/commands.rs +++ b/czkawka_cli/src/commands.rs @@ -207,7 +207,7 @@ pub struct AllowedExtensions { short = "x", long, help = "Allowed file extension(s)", - long_help = "List of checked files with provided extension(s). There are also helpful macros which allow to easy use a typical extensions like:\nIMAGE(\"jpg,kra,gif,png,bmp,tiff,webp,hdr,svg\"),\nTEXT(\"txt,doc,docx,odt,rtf\"),\nVIDEO(\"mp4,flv,mkv,webm,vob,ogv,gifv,avi,mov,wmv,mpg,m4v,m4p,mpeg,3gp\") or\nMUSIC(\"mp3,flac,ogg,tta,wma,webm\")\n " + long_help = "List of checked files with provided extension(s). There are also helpful macros which allow to easy use a typical extensions like:\nIMAGE(\"jpg,kra,gif,png,bmp,tiff,hdr,svg\"),\nTEXT(\"txt,doc,docx,odt,rtf\"),\nVIDEO(\"mp4,flv,mkv,webm,vob,ogv,gifv,avi,mov,wmv,mpg,m4v,m4p,mpeg,3gp\") or\nMUSIC(\"mp3,flac,ogg,tta,wma,webm\")\n " )] pub allowed_extensions: Vec, } diff --git a/czkawka_core/src/broken_files.rs b/czkawka_core/src/broken_files.rs index ede0b38..8cd79c5 100644 --- a/czkawka_core/src/broken_files.rs +++ b/czkawka_core/src/broken_files.rs @@ -1,6 +1,6 @@ -use std::fs::{File, Metadata}; +use std::fs::{File, Metadata, OpenOptions}; use std::io::prelude::*; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use std::{fs, thread}; @@ -11,12 +11,16 @@ use crate::common_items::ExcludedItems; use crate::common_messages::Messages; use crate::common_traits::*; use crossbeam_channel::Receiver; +use directories_next::ProjectDirs; use rayon::prelude::*; -use std::io::BufWriter; +use std::collections::HashMap; +use std::io::{BufReader, BufWriter}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; use std::thread::sleep; +const CACHE_FILE_NAME: &str = "cache_broken_files.txt"; + #[derive(Debug)] pub struct ProgressData { pub current_stage: u8, @@ -35,13 +39,15 @@ pub enum DeleteMethod { pub struct FileEntry { pub path: PathBuf, pub modified_date: u64, + pub size: u64, pub type_of_file: TypeOfFile, pub error_string: String, } -#[derive(Clone, PartialEq, Eq)] +#[derive(Copy, Clone, PartialEq, Eq)] pub enum TypeOfFile { - Image, + Unknown = -1, + Image = 0, } /// Info struck with helpful information's about results @@ -61,7 +67,7 @@ impl Info { pub struct BrokenFiles { text_messages: Messages, information: Info, - files_to_check: Vec, + files_to_check: HashMap, broken_files: Vec, directories: Directories, allowed_extensions: Extensions, @@ -80,10 +86,10 @@ impl BrokenFiles { allowed_extensions: Extensions::new(), directories: Directories::new(), excluded_items: ExcludedItems::new(), - files_to_check: vec![], + files_to_check: Default::default(), delete_method: DeleteMethod::None, stopped_search: false, - broken_files: vec![], + broken_files: Default::default(), } } @@ -232,13 +238,8 @@ impl BrokenFiles { } .to_lowercase(); - let type_of_file; - - // Checking allowed image extensions - let allowed_image_extensions = ["jpg", "jpeg", "png", "bmp", "ico", "webp", "tiff", "pnm", "tga", "ff", "gif"]; - if allowed_image_extensions.iter().any(|e| file_name_lowercase.ends_with(format!(".{}", e).as_str())) { - type_of_file = TypeOfFile::Image; - } else { + let type_of_file = check_extension_avaibility(&file_name_lowercase); + if type_of_file == TypeOfFile::Unknown { continue 'dir; } @@ -273,12 +274,13 @@ impl BrokenFiles { continue; } // Permissions Denied }, + size: metadata.len(), type_of_file, error_string: "".to_string(), }; // Adding files to Vector - self.files_to_check.push(fe); + self.files_to_check.insert(fe.path.to_string_lossy().to_string(), fe); } } } @@ -292,6 +294,29 @@ impl BrokenFiles { fn look_for_broken_files(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::Sender>) -> bool { let system_time = SystemTime::now(); + let loaded_hash_map = match load_cache_from_file(&mut self.text_messages) { + Some(t) => t, + None => Default::default(), + }; + + let mut records_already_cached: HashMap = Default::default(); + let mut non_cached_files_to_check: HashMap = Default::default(); + for (name, file_entry) in &self.files_to_check { + #[allow(clippy::collapsible_if)] + if !loaded_hash_map.contains_key(name) { + // If loaded data doesn't contains current image info + non_cached_files_to_check.insert(name.clone(), file_entry.clone()); + } else { + if file_entry.size != loaded_hash_map.get(name).unwrap().size || file_entry.modified_date != loaded_hash_map.get(name).unwrap().modified_date { + // When size or modification date of image changed, then it is clear that is different image + non_cached_files_to_check.insert(name.clone(), file_entry.clone()); + } else { + // Checking may be omitted when already there is entry with same size and modification date + records_already_cached.insert(name.clone(), loaded_hash_map.get(name).unwrap().clone()); + } + } + } + let check_was_breaked = AtomicBool::new(false); // Used for breaking from GUI and ending check thread //// PROGRESS THREAD START @@ -304,7 +329,7 @@ impl BrokenFiles { let mut progress_send = progress_sender.clone(); let progress_thread_run = progress_thread_run.clone(); let atomic_file_counter = atomic_file_counter.clone(); - let files_to_check = self.files_to_check.len(); + let files_to_check = non_cached_files_to_check.len(); progress_thread_handle = thread::spawn(move || loop { progress_send .try_send(ProgressData { @@ -323,23 +348,34 @@ impl BrokenFiles { progress_thread_handle = thread::spawn(|| {}); } //// PROGRESS THREAD END - self.broken_files = self - .files_to_check + let mut vec_file_entry: Vec = non_cached_files_to_check .par_iter() .map(|file_entry| { atomic_file_counter.fetch_add(1, Ordering::Relaxed); if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { - // This will not break + check_was_breaked.store(true, Ordering::Relaxed); return None; } - match image::open(&file_entry.path) { - Ok(_) => Some(None), - Err(t) => { - let mut file_entry = file_entry.clone(); - file_entry.error_string = t.to_string(); - Some(Some(file_entry)) - } // Something is wrong with image + match file_entry.1.type_of_file { + TypeOfFile::Image => { + match image::open(&file_entry.1.path) { + Ok(_) => Some(None), + Err(t) => { + let error_string = t.to_string(); + // This error is a problem with image library, remove check when https://github.com/image-rs/jpeg-decoder/issues/130 will be fixed + if !error_string.contains("spectral selection is not allowed in non-progressive scan") { + let mut file_entry = file_entry.1.clone(); + file_entry.error_string = error_string; + Some(Some(file_entry)) + } else { + Some(None) + } + } // Something is wrong with image + } + } + // This means that cache read invalid value because maybe cache comes from different czkawka version + TypeOfFile::Unknown => Some(None), } }) .while_some() @@ -351,16 +387,35 @@ impl BrokenFiles { progress_thread_run.store(false, Ordering::Relaxed); progress_thread_handle.join().unwrap(); - self.information.number_of_broken_files = self.broken_files.len(); - - // Check if user aborted search(only from GUI) + // Break if stop was clicked if check_was_breaked.load(Ordering::Relaxed) { return false; } + + // Just connect loaded results with already calculated + for (_name, file_entry) in records_already_cached { + vec_file_entry.push(file_entry.clone()); + } + + self.broken_files = vec_file_entry.iter().filter_map(|f| if f.error_string.is_empty() { None } else { Some(f.clone()) }).collect(); + + // Must save all results to file, old loaded from file with all currently counted results + let mut all_results: HashMap = self.files_to_check.clone(); + + for file_entry in vec_file_entry { + all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); + } + for (_name, file_entry) in loaded_hash_map { + all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry); + } + save_cache_to_file(&all_results, &mut self.text_messages); + + self.information.number_of_broken_files = self.broken_files.len(); + Common::print_time(system_time, SystemTime::now(), "sort_images - reading data from files in parallel".to_string()); // Clean data - self.files_to_check = vec![]; + self.files_to_check = Default::default(); true } @@ -370,7 +425,7 @@ impl BrokenFiles { match self.delete_method { DeleteMethod::Delete => { - for file_entry in &self.files_to_check { + for file_entry in self.broken_files.iter() { if fs::remove_file(&file_entry.path).is_err() { self.text_messages.warnings.push(file_entry.path.display().to_string()); } @@ -472,3 +527,118 @@ impl PrintResults for BrokenFiles { Common::print_time(start_time, SystemTime::now(), "print_entries".to_string()); } } + +fn save_cache_to_file(hashmap_file_entry: &HashMap, text_messages: &mut Messages) { + println!("Allowed to save {} entries", hashmap_file_entry.len()); + if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") { + // Lin: /home/username/.cache/czkawka + // Win: C:\Users\Username\AppData\Local\Qarmin\Czkawka\cache + // Mac: /Users/Username/Library/Caches/pl.Qarmin.Czkawka + + let cache_dir = PathBuf::from(proj_dirs.cache_dir()); + if cache_dir.exists() { + if !cache_dir.is_dir() { + text_messages.messages.push(format!("Config dir {} is a file!", cache_dir.display())); + return; + } + } else if fs::create_dir_all(&cache_dir).is_err() { + text_messages.messages.push(format!("Cannot create config dir {}", cache_dir.display())); + return; + } + let cache_file = cache_dir.join(CACHE_FILE_NAME); + let file_handler = match OpenOptions::new().truncate(true).write(true).create(true).open(&cache_file) { + Ok(t) => t, + Err(_) => { + text_messages.messages.push(format!("Cannot create or open cache file {}", cache_file.display())); + return; + } + }; + let mut writer = BufWriter::new(file_handler); + + for file_entry in hashmap_file_entry.values() { + // Only save to cache files which have more than 1KB + if file_entry.size > 1024 { + let string: String = format!("{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.modified_date, file_entry.error_string); + + if writeln!(writer, "{}", string).is_err() { + text_messages.messages.push(format!("Failed to save some data to cache file {}", cache_file.display())); + return; + }; + } + } + } +} + +fn load_cache_from_file(text_messages: &mut Messages) -> Option> { + if let Some(proj_dirs) = ProjectDirs::from("pl", "Qarmin", "Czkawka") { + let cache_dir = PathBuf::from(proj_dirs.cache_dir()); + let cache_file = cache_dir.join(CACHE_FILE_NAME); + let file_handler = match OpenOptions::new().read(true).open(&cache_file) { + Ok(t) => t, + Err(_) => { + // text_messages.messages.push(format!("Cannot find or open cache file {}", cache_file.display())); // This shouldn't be write to output + return None; + } + }; + + let reader = BufReader::new(file_handler); + + let mut hashmap_loaded_entries: HashMap = Default::default(); + + // Read the file line by line using the lines() iterator from std::io::BufRead. + for (index, line) in reader.lines().enumerate() { + let line = match line { + Ok(t) => t, + Err(_) => { + text_messages.warnings.push(format!("Failed to load line number {} from cache file {}", index + 1, cache_file.display())); + return None; + } + }; + let uuu = line.split("//").collect::>(); + if uuu.len() != 4 { + text_messages.warnings.push(format!("Found invalid data in line {} - ({}) in cache file {}", index + 1, line, cache_file.display())); + continue; + } + // Don't load cache data if destination file not exists + if Path::new(uuu[0]).exists() { + hashmap_loaded_entries.insert( + uuu[0].to_string(), + FileEntry { + path: PathBuf::from(uuu[0]), + size: match uuu[1].parse::() { + Ok(t) => t, + Err(_) => { + text_messages.warnings.push(format!("Found invalid size value in line {} - ({}) in cache file {}", index + 1, line, cache_file.display())); + continue; + } + }, + modified_date: match uuu[2].parse::() { + Ok(t) => t, + Err(_) => { + text_messages.warnings.push(format!("Found invalid modified date value in line {} - ({}) in cache file {}", index + 1, line, cache_file.display())); + continue; + } + }, + type_of_file: check_extension_avaibility(&uuu[0].to_lowercase()), + error_string: uuu[3].to_string(), + }, + ); + } + } + + return Some(hashmap_loaded_entries); + } + + text_messages.messages.push("Cannot find or open system config dir to save cache file".to_string()); + None +} + +fn check_extension_avaibility(file_name_lowercase: &str) -> TypeOfFile { + // Checking allowed image extensions + let allowed_image_extensions = ["jpg", "jpeg", "png", "bmp", "ico", "tiff", "pnm", "tga", "ff", "gif"]; + if allowed_image_extensions.iter().any(|e| file_name_lowercase.ends_with(format!(".{}", e).as_str())) { + TypeOfFile::Image + } else { + TypeOfFile::Unknown + } +} diff --git a/czkawka_core/src/common_extensions.rs b/czkawka_core/src/common_extensions.rs index ea055db..3c6d47f 100644 --- a/czkawka_core/src/common_extensions.rs +++ b/czkawka_core/src/common_extensions.rs @@ -18,7 +18,7 @@ impl Extensions { if allowed_extensions.is_empty() { return; } - allowed_extensions = allowed_extensions.replace("IMAGE", "jpg,kra,gif,png,bmp,tiff,webp,hdr,svg"); + allowed_extensions = allowed_extensions.replace("IMAGE", "jpg,kra,gif,png,bmp,tiff,hdr,svg"); allowed_extensions = allowed_extensions.replace("VIDEO", "mp4,flv,mkv,webm,vob,ogv,gifv,avi,mov,wmv,mpg,m4v,m4p,mpeg,3gp"); allowed_extensions = allowed_extensions.replace("MUSIC", "mp3,flac,ogg,tta,wma,webm"); allowed_extensions = allowed_extensions.replace("TEXT", "txt,doc,docx,odt,rtf"); diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index 886e101..4d1f8e3 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -265,7 +265,7 @@ impl SimilarImages { .to_lowercase(); // Checking allowed image extensions - let allowed_image_extensions = ["jpg", "jpeg", "png", "bmp", "ico", "webp", "tiff", "pnm", "tga", "ff", "gif"]; + let allowed_image_extensions = ["jpg", "jpeg", "png", "bmp", "ico", "tiff", "pnm", "tga", "ff", "gif"]; if !allowed_image_extensions.iter().any(|e| file_name_lowercase.ends_with(format!(".{}", e).as_str())) { continue 'dir; } @@ -693,7 +693,7 @@ fn load_hashes_from_file(text_messages: &mut Messages) -> Option t, Err(_) => { - text_messages.messages.push(format!("Cannot find or open cache file {}", cache_file.display())); + // text_messages.messages.push(format!("Cannot find or open cache file {}", cache_file.display())); // This shouldn't be write to output return None; } }; diff --git a/instructions/Instruction.md b/instructions/Instruction.md index 5c78014..a924f74 100644 --- a/instructions/Instruction.md +++ b/instructions/Instruction.md @@ -100,7 +100,7 @@ Then, for each selected tag by which we want to search for duplicates, we perfor ### Similar Images It is a tool for finding similar images that differ e.g. in watermark, size etc. -The tool first collects images with specific extensions that can be checked - `["jpg", "png", "bmp", "ico", "webp", "tiff"]`. +The tool first collects images with specific extensions that can be checked - `["jpg", "png", "bmp", "ico", "tiff"]`. Next cached data are loaded from file to prevent hashing twice same file. Automatically cache which points to non existing data is deleted. @@ -138,6 +138,7 @@ Only some image extensions are supported, because I rely on image crate. Also so For now Czkawka store only 2 files on disk: - `czkawka_gui_config.txt` - stores configuration of GUI which may be loaded at startup - `cache_similar_image.txt` - stores cache data and hashes which may be used later without needing to compute image hash again - DO NOT TRY TO EDIT THIS FILE MANUALLY! - editing this file may cause app crashes. +- `cache_broken_files.txt` - stores cache data of broken files First file is located in this path @@ -151,7 +152,7 @@ Second with cache here: Linux - `/home/username/.cache/czkawka` Mac - `/Users/Username/Library/Caches/pl.Qarmin.Czkawka` Windows - `C:\Users\Username\AppData\Local\Qarmin\Czkawka\cache` - + ## GUI GTK @@ -181,7 +182,7 @@ There are several buttons which do different actions: - Add (directories) - adds directories to include or exclude - Remove (directories) - remove directories to search or to exclude - Manual Add (directories) - allows to write by hand directories(may be used to write non visible in file manager directories) -- Save current configuration - saves current GUI configuration to configuration file +- Save current configuration - saves current GUI configuration to configuration file - Load configuration - loads configuration of file and override current GUI config - Reset configuration - reset current GUI configuration to default @@ -212,4 +213,4 @@ By default all tools only write about results to console, but it is possible wit - **Manually adding multiple directories** You can manually edit config file `czkawka_gui_config.txt` and add required directories. After that load configuration. - **Slow checking of little number similar images** - If you checked before a big amount of images(several tens of thousands) and them still exists on disk, then information's about it are loaded from cache and save to it, even if you have check now only a few images. You can rename cache file `cache_similar_image.txt`(to be able to use it again) or delete it - cache will regenerate but with lower amount of entries it should load and save a lot of faster. \ No newline at end of file + If you checked before a big amount of images(several tens of thousands) and them still exists on disk, then information's about it are loaded from cache and save to it, even if you have check now only a few images. You can rename cache file `cache_similar_image.txt`(to be able to use it again) or delete it - cache will regenerate but with lower amount of entries it should load and save a lot of faster.