From 183f333f640dce69d23d8df927af2554a5bc5e6d Mon Sep 17 00:00:00 2001 From: Thomas Jung Date: Mon, 29 Jan 2024 11:03:50 +0100 Subject: [PATCH] Support hard links for similar images and videos with `-L` This ignores matches for files that have the same inode. This only works on Unix. --- czkawka_cli/src/commands.rs | 4 + czkawka_cli/src/main.rs | 4 + czkawka_core/src/common_dir_traversal.rs | 210 ++++++++++++++++++++++- czkawka_core/src/similar_images.rs | 14 +- czkawka_core/src/similar_videos.rs | 14 +- 5 files changed, 234 insertions(+), 12 deletions(-) diff --git a/czkawka_cli/src/commands.rs b/czkawka_cli/src/commands.rs index 346a9f2..44a6440 100644 --- a/czkawka_cli/src/commands.rs +++ b/czkawka_cli/src/commands.rs @@ -212,6 +212,8 @@ pub struct SimilarImagesArgs { #[clap(flatten)] pub delete_method: DMethod, #[clap(flatten)] + pub allow_hard_links: AllowHardLinks, + #[clap(flatten)] pub dry_run: DryRun, #[clap( short = 'g', @@ -355,6 +357,8 @@ pub struct SimilarVideosArgs { #[clap(flatten)] pub delete_method: DMethod, #[clap(flatten)] + pub allow_hard_links: AllowHardLinks, + #[clap(flatten)] pub dry_run: DryRun, #[clap( short, diff --git a/czkawka_cli/src/main.rs b/czkawka_cli/src/main.rs index 89fb871..ce3f9d1 100644 --- a/czkawka_cli/src/main.rs +++ b/czkawka_cli/src/main.rs @@ -185,6 +185,7 @@ fn similar_images(similar_images: SimilarImagesArgs, stop_receiver: &Receiver<() hash_size, delete_method, dry_run, + allow_hard_links, } = similar_images; let mut item = SimilarImages::new(); @@ -198,6 +199,7 @@ fn similar_images(similar_images: SimilarImagesArgs, stop_receiver: &Receiver<() item.set_delete_method(delete_method.delete_method); item.set_dry_run(dry_run.dry_run); item.set_similarity(return_similarity_from_similarity_preset(&similarity_preset, hash_size)); + item.set_ignore_hard_links(!allow_hard_links.allow_hard_links); item.find_similar_images(Some(stop_receiver), Some(progress_sender)); @@ -272,6 +274,7 @@ fn similar_videos(similar_videos: SimilarVideosArgs, stop_receiver: &Receiver<() maximal_file_size, delete_method, dry_run, + allow_hard_links, } = similar_videos; let mut item = SimilarVideos::new(); @@ -282,6 +285,7 @@ fn similar_videos(similar_videos: SimilarVideosArgs, stop_receiver: &Receiver<() item.set_tolerance(tolerance); item.set_delete_method(delete_method.delete_method); item.set_dry_run(dry_run.dry_run); + item.set_ignore_hard_links(!allow_hard_links.allow_hard_links); item.find_similar_videos(Some(stop_receiver), Some(progress_sender)); diff --git a/czkawka_core/src/common_dir_traversal.rs b/czkawka_core/src/common_dir_traversal.rs index fe2414d..dc73106 100644 --- a/czkawka_core/src/common_dir_traversal.rs +++ b/czkawka_core/src/common_dir_traversal.rs @@ -1,6 +1,8 @@ use std::collections::BTreeMap; use std::fs; -use std::fs::{DirEntry, FileType, Metadata, ReadDir}; +use std::fs::{DirEntry, FileType, Metadata}; +#[cfg(target_family = "unix")] +use std::os::unix::fs::MetadataExt; use std::path::{Path, PathBuf}; use std::sync::atomic::Ordering; use std::time::UNIX_EPOCH; @@ -92,7 +94,7 @@ pub enum Collect { Files, } -#[derive(Eq, PartialEq, Copy, Clone)] +#[derive(Eq, PartialEq, Copy, Clone, Debug)] enum EntryType { File, Dir, @@ -546,9 +548,17 @@ fn process_symlink_in_symlink_mode( fe_result.push(fe); } -pub fn common_read_dir(current_folder: &Path, warnings: &mut Vec) -> Option { +pub fn common_read_dir(current_folder: &Path, warnings: &mut Vec) -> Option>> { match fs::read_dir(current_folder) { - Ok(t) => Some(t), + Ok(t) => { + // Make directory traversal order stable + let mut r: Vec<_> = t.collect(); + r.sort_by_key(|d| match d { + Ok(f) => f.path(), + _ => PathBuf::new(), + }); + Some(r) + } Err(e) => { warnings.push(flc!( "core_cannot_open_dir", @@ -634,3 +644,195 @@ pub fn get_modified_time(metadata: &Metadata, warnings: &mut Vec, curren } } } + +#[cfg(target_family = "windows")] +pub fn inode(_fe: &FileEntry) -> Option { + None +} + +#[cfg(target_family = "unix")] +pub fn inode(fe: &FileEntry) -> Option { + if let Ok(meta) = fs::metadata(&fe.path) { + Some(meta.ino()) + } else { + None + } +} + +pub fn take_1_per_inode((k, mut v): (Option, Vec)) -> Vec { + if k.is_some() { + v.drain(1..); + } + v +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::common_tool::*; + use once_cell::sync::Lazy; + use std::fs; + use std::fs::File; + use std::io; + use std::io::prelude::*; + use std::time::{Duration, SystemTime}; + use tempfile::TempDir; + + impl CommonData for CommonToolData { + fn get_cd(&self) -> &CommonToolData { + self + } + fn get_cd_mut(&mut self) -> &mut CommonToolData { + self + } + } + + static NOW: Lazy = Lazy::new(|| SystemTime::UNIX_EPOCH + Duration::new(100, 0)); + const CONTENT: &[u8; 1] = b"a"; + + fn create_files(dir: &TempDir) -> io::Result<(PathBuf, PathBuf, PathBuf)> { + let (src, hard, other) = (dir.path().join("a"), dir.path().join("b"), dir.path().join("c")); + + let mut file = File::create(&src)?; + file.write_all(CONTENT)?; + fs::hard_link(&src, &hard)?; + file.set_modified(*NOW)?; + + let mut file = File::create(&other)?; + file.write_all(CONTENT)?; + file.set_modified(*NOW)?; + Ok((src, hard, other)) + } + + #[test] + fn test_traversal() -> io::Result<()> { + let dir = tempfile::Builder::new().tempdir()?; + let (src, hard, other) = create_files(&dir)?; + let secs = NOW.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs(); + + let mut common_data = CommonToolData::new(ToolType::SimilarImages); + common_data.directories.set_included_directory([dir.path().to_owned()].to_vec()); + common_data.set_minimal_file_size(0); + + match DirTraversalBuilder::new().group_by(|_fe| ()).common_data(&common_data).build().run() { + DirTraversalResult::SuccessFiles { + warnings: _, + grouped_file_entries, + } => { + let actual: Vec<_> = grouped_file_entries.into_values().flatten().collect(); + assert_eq!( + [ + FileEntry { + path: src, + size: 1, + modified_date: secs, + }, + FileEntry { + path: hard, + size: 1, + modified_date: secs, + }, + FileEntry { + path: other, + size: 1, + modified_date: secs, + }, + ] + .to_vec(), + actual + ); + } + _ => { + panic!("Expect SuccessFiles."); + } + }; + Ok(()) + } + + #[cfg(target_family = "unix")] + #[test] + fn test_traversal_group_by_inode() -> io::Result<()> { + let dir = tempfile::Builder::new().tempdir()?; + let (src, _, other) = create_files(&dir)?; + let secs = NOW.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs(); + + let mut common_data = CommonToolData::new(ToolType::SimilarImages); + common_data.directories.set_included_directory([dir.path().to_owned()].to_vec()); + common_data.set_minimal_file_size(0); + + match DirTraversalBuilder::new().group_by(inode).common_data(&common_data).build().run() { + DirTraversalResult::SuccessFiles { + warnings: _, + grouped_file_entries, + } => { + let actual: Vec<_> = grouped_file_entries.into_iter().flat_map(take_1_per_inode).collect(); + assert_eq!( + [ + FileEntry { + path: src, + size: 1, + modified_date: secs, + }, + FileEntry { + path: other, + size: 1, + modified_date: secs, + }, + ] + .to_vec(), + actual + ); + } + _ => { + panic!("Expect SuccessFiles."); + } + }; + Ok(()) + } + + #[cfg(target_family = "windows")] + #[test] + fn test_traversal_group_by_inode() -> io::Result<()> { + let dir = tempfile::Builder::new().tempdir()?; + let (src, hard, other) = create_files(&dir)?; + let secs = NOW.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs(); + + let mut common_data = CommonToolData::new(ToolType::SimilarImages); + common_data.directories.set_included_directory([dir.path().to_owned()].to_vec()); + common_data.set_minimal_file_size(0); + + match DirTraversalBuilder::new().group_by(inode).common_data(&common_data).build().run() { + DirTraversalResult::SuccessFiles { + warnings: _, + grouped_file_entries, + } => { + let actual: Vec<_> = grouped_file_entries.into_iter().flat_map(take_1_per_inode).collect(); + assert_eq!( + [ + FileEntry { + path: src, + size: 1, + modified_date: secs, + }, + FileEntry { + path: hard, + size: 1, + modified_date: secs, + }, + FileEntry { + path: other, + size: 1, + modified_date: secs, + }, + ] + .to_vec(), + actual + ); + } + _ => { + panic!("Expect SuccessFiles."); + } + }; + Ok(()) + } +} diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index 750172c..f13098b 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -23,7 +23,7 @@ use crate::common::{ HEIC_EXTENSIONS, IMAGE_RS_SIMILAR_IMAGES_EXTENSIONS, RAW_IMAGE_EXTENSIONS, }; use crate::common_cache::{get_similar_images_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized}; -use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; +use crate::common_dir_traversal::{inode, take_1_per_inode, CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; use crate::common_tool::{CommonData, CommonToolData, DeleteMethod}; use crate::common_traits::{DebugPrint, PrintResults, ResultEntry}; use crate::flc; @@ -122,6 +122,7 @@ pub struct SimilarImages { hash_alg: HashAlg, image_filter: FilterType, exclude_images_with_same_size: bool, + ignore_hard_links: bool, } #[derive(Default)] @@ -145,6 +146,7 @@ impl SimilarImages { hash_alg: HashAlg::Gradient, image_filter: FilterType::Lanczos3, exclude_images_with_same_size: false, + ignore_hard_links: false, } } @@ -188,7 +190,7 @@ impl SimilarImages { let heic_extensions = HEIC_EXTENSIONS.iter().collect::>(); let result = DirTraversalBuilder::new() - .group_by(|_fe| ()) + .group_by(inode) .stop_receiver(stop_receiver) .progress_sender(progress_sender) .common_data(&self.common_data) @@ -199,8 +201,8 @@ impl SimilarImages { match result { DirTraversalResult::SuccessFiles { grouped_file_entries, warnings } => { self.images_to_check = grouped_file_entries - .into_values() - .flatten() + .into_iter() + .flat_map(if self.ignore_hard_links { |(_, fes)| fes } else { take_1_per_inode }) .map(|fe| { let fe_str = fe.path.to_string_lossy().to_string(); let extension_lowercase = fe.path.extension().unwrap_or_default().to_string_lossy().to_lowercase(); @@ -1090,6 +1092,10 @@ impl SimilarImages { pub fn set_similarity(&mut self, similarity: u32) { self.similarity = similarity; } + + pub fn set_ignore_hard_links(&mut self, ignore_hard_links: bool) { + self.ignore_hard_links = ignore_hard_links; + } } #[cfg(test)] diff --git a/czkawka_core/src/similar_videos.rs b/czkawka_core/src/similar_videos.rs index de8391e..60619a7 100644 --- a/czkawka_core/src/similar_videos.rs +++ b/czkawka_core/src/similar_videos.rs @@ -16,7 +16,7 @@ use vid_dup_finder_lib::{NormalizedTolerance, VideoHash}; use crate::common::{check_if_stop_received, delete_files_custom, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, VIDEO_FILES_EXTENSIONS}; use crate::common_cache::{get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized}; -use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; +use crate::common_dir_traversal::{inode, take_1_per_inode, CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType}; use crate::common_tool::{CommonData, CommonToolData, DeleteMethod}; use crate::common_traits::{DebugPrint, PrintResults, ResultEntry}; use crate::flc; @@ -83,6 +83,7 @@ pub struct SimilarVideos { videos_to_check: BTreeMap, tolerance: i32, exclude_videos_with_same_size: bool, + ignore_hard_links: bool, } impl CommonData for SimilarVideos { @@ -111,6 +112,7 @@ impl SimilarVideos { tolerance: 10, exclude_videos_with_same_size: false, similar_referenced_vectors: vec![], + ignore_hard_links: false, } } @@ -149,7 +151,7 @@ impl SimilarVideos { } let result = DirTraversalBuilder::new() - .group_by(|_fe| ()) + .group_by(inode) .stop_receiver(stop_receiver) .progress_sender(progress_sender) .common_data(&self.common_data) @@ -160,8 +162,8 @@ impl SimilarVideos { match result { DirTraversalResult::SuccessFiles { grouped_file_entries, warnings } => { self.videos_to_check = grouped_file_entries - .into_values() - .flatten() + .into_iter() + .flat_map(if self.ignore_hard_links { |(_, fes)| fes } else { take_1_per_inode }) .map(|fe| (fe.path.to_string_lossy().to_string(), fe.into_videos_entry())) .collect(); self.common_data.text_messages.warnings.extend(warnings); @@ -454,4 +456,8 @@ impl SimilarVideos { pub fn get_use_reference(&self) -> bool { self.common_data.use_reference_folders } + + pub fn set_ignore_hard_links(&mut self, ignore_hard_links: bool) { + self.ignore_hard_links = ignore_hard_links; + } }