From 1e94587de8813819bfcafa3f959209c2585646fd Mon Sep 17 00:00:00 2001
From: Thomas Andreas Jung
Date: Sat, 20 Feb 2021 12:28:06 +0100
Subject: [PATCH] Ignore duplicates if those are hard links (#234)

This is a proof of concept.

```
$ echo a > hardlinks/a
$ cp hardlinks/{a,b}
$ ln hardlinks/{a,c}
$ cargo run --bin czkawka_cli dup -m 1 --directories $(pwd)/hardlinks -f /dev/stderr > /dev/null
-------------------------------------------------Files with same hashes-------------------------------------------------
Found 1 duplicated files which in 1 groups which takes 2 B.

---- Size 2 B (2) - 2 files
/home/thomas/Development/czkawka/hardlinks/a
/home/thomas/Development/czkawka/hardlinks/b
```

Open:

- Windows support
- Probably this should be a cli option
---
 czkawka_core/src/duplicate.rs | 67 +++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 3 deletions(-)

diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index ffb3f6b..e430154 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -1,9 +1,13 @@
 use crossbeam_channel::Receiver;
 use humansize::{file_size_opts as options, FileSize};
+#[cfg(target_family = "unix")]
+use std::collections::HashSet;
 use std::collections::{BTreeMap, HashMap};
 use std::fs::{File, Metadata, OpenOptions};
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind, Result};
+#[cfg(target_family = "unix")]
+use std::os::unix::fs::MetadataExt;
 use std::path::{Path, PathBuf};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use std::{fs, thread};
@@ -61,7 +65,7 @@ pub enum DeleteMethod {
     HardLink,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, PartialEq, Default)]
 pub struct FileEntry {
     pub path: PathBuf,
     pub size: u64,
@@ -551,12 +555,16 @@
         // Create new BTreeMap without single size entries(files have not duplicates)
         let mut new_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
 
-        for (size, vector) in &self.files_with_identical_size {
+        for (size, vec) in &self.files_with_identical_size {
+            if vec.len() <= 1 {
+                continue;
+            }
+            let vector = filter_hard_links(vec);
             if vector.len() > 1 {
                 self.information.number_of_duplicated_files_by_size += vector.len() - 1;
                 self.information.number_of_groups_by_size += 1;
                 self.information.lost_space_by_size += (vector.len() as u64 - 1) * size;
-                new_map.insert(*size, vector.clone());
+                new_map.insert(*size, vector);
             }
         }
         self.files_with_identical_size = new_map;
@@ -1298,6 +1306,26 @@ fn delete_files(vector: &[FileEntry], delete_method: &DeleteMethod, warnings: &m
     (gained_space, removed_files, failed_to_remove_files)
 }
 
+#[cfg(target_family = "windows")]
+fn filter_hard_links(vec_file_entry: &[FileEntry]) -> Vec<FileEntry> {
+    vec_file_entry.to_vec()
+}
+
+#[cfg(target_family = "unix")]
+fn filter_hard_links(vec_file_entry: &[FileEntry]) -> Vec<FileEntry> {
+    let mut inodes: HashSet<u64> = HashSet::with_capacity(vec_file_entry.len());
+    let mut identical: Vec<FileEntry> = Vec::with_capacity(vec_file_entry.len());
+    for f in vec_file_entry {
+        if let Ok(meta) = fs::metadata(&f.path) {
+            if !inodes.insert(meta.ino()) {
+                continue;
+            }
+        }
+        identical.push(f.clone());
+    }
+    identical
+}
+
 fn make_hard_link(src: &PathBuf, dst: &PathBuf) -> Result<()> {
     let dst_dir = dst.parent().ok_or_else(|| Error::new(ErrorKind::Other, "No parent"))?;
     let temp = tempfile::Builder::new().tempfile_in(dst_dir)?;
@@ -1669,4 +1697,37 @@ mod tests {
         assert_eq!(vec![dst], read_dir(&dir)?.map(|e| e.unwrap().path()).collect::<Vec<PathBuf>>());
         Ok(())
     }
+
+    #[test]
+    fn test_filter_hard_links_empty() {
+        let expected: Vec<FileEntry> = Default::default();
+        assert_eq!(expected, filter_hard_links(&[]));
+    }
+
+    #[cfg(target_family = "unix")]
+    #[test]
+    fn test_filter_hard_links() -> Result<()> {
+        let dir = tempfile::Builder::new().tempdir()?;
+        let (src, dst) = (dir.path().join("a"), dir.path().join("b"));
+        File::create(&src)?;
+        fs::hard_link(src.clone(), dst.clone())?;
+        let e1 = FileEntry { path: src, ..Default::default() };
+        let e2 = FileEntry { path: dst, ..Default::default() };
+        let actual = filter_hard_links(&[e1.clone(), e2]);
+        assert_eq!(vec![e1], actual);
+        Ok(())
+    }
+
+    #[test]
+    fn test_filter_hard_links_regular_files() -> Result<()> {
+        let dir = tempfile::Builder::new().tempdir()?;
+        let (src, dst) = (dir.path().join("a"), dir.path().join("b"));
+        File::create(&src)?;
+        File::create(&dst)?;
+        let e1 = FileEntry { path: src, ..Default::default() };
+        let e2 = FileEntry { path: dst, ..Default::default() };
+        let actual = filter_hard_links(&[e1.clone(), e2.clone()]);
+        assert_eq!(vec![e1, e2], actual);
+        Ok(())
+    }
 }
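
For context on the mechanism the patch relies on: on Unix, every hard link to a file resolves to the same inode, and `std::os::unix::fs::MetadataExt::ino()` exposes that number. Below is a minimal standalone sketch of that check (Unix-only, not part of the patch; the file names in the temp directory are illustrative):

```rust
use std::fs::{self, File};
use std::io::Result;
use std::os::unix::fs::MetadataExt;

fn main() -> Result<()> {
    // Illustrative file names in the system temp directory (not from the patch).
    let dir = std::env::temp_dir();
    let a = dir.join("hardlink_demo_a");
    let b = dir.join("hardlink_demo_b");

    File::create(&a)?;
    let _ = fs::remove_file(&b); // hard_link fails if the destination exists
    fs::hard_link(&a, &b)?;

    // Hard links to the same file share an inode number (unique per filesystem),
    // which is exactly the value filter_hard_links() keys its HashSet on.
    assert_eq!(fs::metadata(&a)?.ino(), fs::metadata(&b)?.ino());

    fs::remove_file(&a)?;
    fs::remove_file(&b)?;
    Ok(())
}
```

Because `filter_hard_links` keeps only the first path it sees for each inode, exactly one entry per underlying file survives into a duplicate group, which is why `hardlinks/c` does not appear in the demo output above.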