diff --git a/Cargo.lock b/Cargo.lock index 2511ef9..d5b7912 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,253 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + +[[package]] +name = "arrayvec" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff77d8686867eceff3105329d4698d96c2391c176d5d03adc90c7389162b5b8" + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "blake3" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce4f9586c9a3151c4b49b19e82ba163dd073614dd057e53c969e1a4db5b52720" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "crypto-mac", + "digest", +] + +[[package]] +name = "cc" +version = "1.0.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66120af515773fb005778dc07c261bd201ec8ce50bd6e7144c927753fe013381" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "crossbeam-channel" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ee0cc8804d5393478d743b035099520087a5186f3b93fa58cec08fa62407b6" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "maybe-uninit", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +dependencies = [ + "autocfg", + "cfg-if", + "lazy_static", +] + +[[package]] +name = "crypto-mac" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b584a330336237c1eecd3e94266efb216c56ed91225d634cb2991c5f3fd1aeab" +dependencies = [ + "generic-array", + "subtle", +] + [[package]] name = "czkawka" version = "0.1.0" +dependencies = [ + "blake3", + "multimap", + "rayon", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "either" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56b59865bce947ac5958779cfa508f6c3b9497cc762b7e24a12d11ccde2c4f" + +[[package]] +name = "generic-array" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "hermit-abi" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3deed196b6e7f9e44a2ae8d94225d80302d81208b1bb673fd21fe634645c85a9" +dependencies = [ + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "755456fae044e6fa1ebbbd1b3e902ae19e73097ed4ed87bb79934a867c007bc3" + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + +[[package]] +name = "memoffset" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c198b026e1bbf08a937e94c6c60f9ec4a2267f5b0d2eec9c1b21b061ce2be55f" +dependencies = [ + "autocfg", +] + +[[package]] +name = "multimap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1255076139a83bb467426e7f8d0134968a8118844faa755985e077cf31850333" +dependencies = [ + "serde", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "rayon" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd016f0c045ad38b5251be2c9c0ab806917f82da4d36b2a327e5166adad9270" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91739a34c4355b5434ce54c9086c5895604a9c278586d1f1aa95e04f66b525a0" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54c9a88f2da7238af84b5101443f0c0d0a3bbdc455e34a5c9497b1903ed55d5" + +[[package]] +name = "subtle" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "502d53007c02d7605a05df1c1a73ee436952781653da5d0bf57ad608f66932c1" + +[[package]] +name = "typenum" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" + +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" diff --git a/Cargo.toml b/Cargo.toml index bd844ca..b7755e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "czkawka" version = "0.1.0" -authors = ["Rafał Mikrut "] +authors = ["Rafał Mikrut "] edition = "2018" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] +rayon = "1.4.0" +blake3 = "0.3.6" +multimap = "0.8.2" \ No newline at end of file diff --git a/src/duplicate.rs b/src/duplicate.rs index 778e651..c639ca3 100644 --- a/src/duplicate.rs +++ b/src/duplicate.rs @@ -1,23 +1,28 @@ // Todo, należy upewnić się, że ma wystarczające uprawnienia do odczytu i usuwania use std::collections::HashMap; -use std::fs::Metadata; +use std::fs::{File, Metadata}; +use std::hash::Hash; +use std::io::prelude::*; use std::path::Path; use std::time::SystemTime; use std::{fs, process}; pub struct DuplicateFinder { - number_of_checked_files: u64, - number_of_checked_folders: u64, - number_of_ignored_things: u64, - number_of_files_which_has_duplicated_entries: u64, - number_of_duplicated_files: u64, + number_of_checked_files: usize, + number_of_checked_folders: usize, + number_of_ignored_things: usize, + number_of_duplicated_files: usize, // files : Vec>>, - files: HashMap, - files_with_duplicated_entries: HashMap, + files_size: HashMap>, + // files_hashes: HashMap<[u8],Vec>, // duplicated_entries // Same as files, but only with 2+ entries // files : Vec>, excluded_directories: Vec, included_directories: Vec, + // ignored_extensions: Vec, + // allowed_extensions: Vec, + // ignored_file_names: Vec, // TODO Regex Support + // allowed_file_names: Vec, // TODO Regex Support } impl DuplicateFinder { @@ -26,25 +31,31 @@ impl DuplicateFinder { number_of_checked_files: 0, number_of_checked_folders: 0, number_of_ignored_things: 0, - number_of_files_which_has_duplicated_entries: 0, number_of_duplicated_files: 0, - files: Default::default(), - files_with_duplicated_entries: Default::default(), + files_size: Default::default(), + // files_hashes: Default::default(), excluded_directories: vec![], included_directories: vec![], + // ignored_extensions: vec![], + // allowed_extensions: vec![], + // ignored_file_names: vec![], + // allowed_file_names: vec![] } } + // TODO - Still isn't used but it will be probably required with GUI // pub fn clear(&mut self) { + // // self.number_of_checked_files = 0; // self.number_of_checked_folders = 0; // self.number_of_ignored_things = 0; // self.number_of_files_which_has_duplicated_entries = 0; // self.number_of_duplicated_files = 0; - // self.files.clear(); + // self.files_sizeclear(); // self.excluded_directories.clear(); // self.included_directories.clear(); // } - pub fn find_duplicates(&mut self) { + pub fn find_duplicates_by_size(&mut self) { + // TODO add multithread checking for file hash //let mut path; let start_time: SystemTime = SystemTime::now(); let mut folders_to_check: Vec = Vec::with_capacity(1024 * 16); // This should be small enough too not see to big difference and big enough to store most of paths without needing to resize vector @@ -84,6 +95,7 @@ impl DuplicateFinder { //println!("Directory\t - {:?}", next_folder); // DEBUG } else if metadata.is_file() { let current_file_name = "".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap(); + // println!("File\t\t - {:?}", current_file_name); // DEBUG //file_to_check let fe: FileEntry = FileEntry { path: current_file_name, @@ -91,10 +103,12 @@ impl DuplicateFinder { created_date: metadata.created().unwrap(), modified_date: metadata.modified().unwrap(), }; - self.files.insert(metadata.len(), fe); + if !self.files_size.contains_key(&metadata.len()) { + self.files_size.insert(metadata.len(), Vec::new()); + } + self.files_size.get_mut(&metadata.len()).unwrap().push(fe); self.number_of_checked_files += 1; - // println!("File\t\t - {:?}", current_file); // DEBUG } else { // Probably this is symbolic links so we are free to ignore this // println!("Found another type of file {} {:?}","".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap(), metadata) //DEBUG @@ -103,34 +117,110 @@ impl DuplicateFinder { } } self.debug_print(); - let end_time: SystemTime = SystemTime::now(); - println!("Duration of finding duplicates {:?}", end_time.duration_since(start_time).expect("a")); + DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates".to_string()); + //println!("Duration of finding duplicates {:?}", end_time.duration_since(start_time).expect("a")); } // pub fn save_to_file(&self) {} + /// Remove files which have unique size + pub fn remove_files_with_unique_size(&mut self) { + let start_time: SystemTime = SystemTime::now(); + self.debug_print(); + let mut new_hashmap: HashMap> = Default::default(); + + self.number_of_duplicated_files = 0; + + for entry in &self.files_size { + if entry.1.len() > 1 { + self.number_of_duplicated_files += entry.1.len() - 1; + new_hashmap.insert(*entry.0, entry.1.clone()); + } + } + + self.files_size = new_hashmap; + + self.debug_print(); + DuplicateFinder::print_time(start_time, SystemTime::now(), "optimize_files".to_string()); + } + + /// Should be slower than checking in different ways, but still needs to be checked + pub fn find_duplicates_by_hashing(mut self) { + let start_time: SystemTime = SystemTime::now(); + let mut file_handler: File; + + for entry in self.files_size { + let mut hashes: Vec = Vec::new(); + if entry.1.len() > 5 { + println!("{}", entry.1.len()); + } + + for file_entry in entry.1.iter().enumerate() { + file_handler = match File::open(&file_entry.1.path) { + Ok(T) => T, + Err(_) => { + // Removing File may happens,so we should handle this + hashes.push("".to_owned()); + continue; + } + }; + + let mut hasher: blake3::Hasher = blake3::Hasher::new(); + let mut buffer = [0u8; 16384]; + loop { + let n = file_handler.read(&mut buffer).unwrap(); + if n == 0 { + break; + } + hasher.update(&buffer[..n]); + } + //println!("{}", hasher.finalize().to_hex().to_string()); + } + } + + DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates_by_hashing".to_string()); + } + // /// I'mm not sure about performance, so maybe I + // pub fn find_small_duplicates_by_hashing(mut self){ + // let start_time: SystemTime = SystemTime::now(); + // let size_limit_for_small_files u64 = // 16 MB + // let mut new_hashmap + // + // DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates_by_comparting_begin_bytes_of_file".to_string()); + // } + + pub fn print_time(start_time: SystemTime, end_time: SystemTime, function_name: String) { + println!( + "Execution of function \"{}\" took {:?}", + function_name, + end_time.duration_since(start_time).expect("Time cannot go reverse.") + ); + } + /// Setting include directories, panics when there is not directories available pub fn set_include_directory(&mut self, mut include_directory: String) { - if include_directory.len() == 0 { + let start_time: SystemTime = SystemTime::now(); + + if include_directory.is_empty() { println!("At least one directory must be provided") } include_directory = include_directory.replace("\"", ""); - let directories: Vec = include_directory.split(",").map(String::from).collect(); + let directories: Vec = include_directory.split(',').map(String::from).collect(); let mut checked_directories: Vec = Vec::new(); for directory in directories { if directory == "/" { println!("Using / is probably not good idea, you may go out of ram."); } - if directory.contains("*") { + if directory.contains('*') { println!("Include Directory ERROR: Wildcards are not supported, please don't use it."); process::exit(1); } - if directory.starts_with("~") { + if directory.starts_with('~') { println!("Include Directory ERROR: ~ in path isn't supported."); process::exit(1); } - if !directory.starts_with("/") { + if !directory.starts_with('/') { println!("Include Directory ERROR: Relative path are not supported."); process::exit(1); } @@ -144,45 +234,46 @@ impl DuplicateFinder { } // directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho - if !directory.ends_with("/") { + if !directory.ends_with('/') { checked_directories.push(directory + "/"); } else { checked_directories.push(directory); } } - if checked_directories.len() == 0 { + if checked_directories.is_empty() { println!("Not found even one correct path to include."); process::exit(1); } self.included_directories = checked_directories; - println!("Included directories - {:?}", self.included_directories); + DuplicateFinder::print_time(start_time, SystemTime::now(), "set_include_directory".to_string()); } pub fn set_exclude_directory(&mut self, mut exclude_directory: String) { - if exclude_directory.len() == 0 { + let start_time: SystemTime = SystemTime::now(); + if exclude_directory.is_empty() { return; } exclude_directory = exclude_directory.replace("\"", ""); - let directories: Vec = exclude_directory.split(",").map(String::from).collect(); + let directories: Vec = exclude_directory.split(',').map(String::from).collect(); let mut checked_directories: Vec = Vec::new(); for directory in directories { if directory == "/" { println!("Exclude Directory ERROR: Excluding / is pointless, because it means that no files will be scanned."); } - if directory.contains("*") { + if directory.contains('*') { println!("Exclude Directory ERROR: Wildcards are not supported, please don't use it."); process::exit(1); } - if directory.starts_with("~") { + if directory.starts_with('~') { println!("Exclude Directory ERROR: ~ in path isn't supported."); process::exit(1); } - if !directory.starts_with("/") { + if !directory.starts_with('/') { println!("Exclude Directory ERROR: Relative path are not supported."); process::exit(1); } @@ -196,7 +287,7 @@ impl DuplicateFinder { } // directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho - if !directory.ends_with("/") { + if !directory.ends_with('/') { checked_directories.push(directory + "/"); } else { checked_directories.push(directory); @@ -205,7 +296,7 @@ impl DuplicateFinder { self.excluded_directories = checked_directories; - println!("Excluded directories - {:?}", &self.excluded_directories); + DuplicateFinder::print_time(start_time, SystemTime::now(), "set_exclude_directory".to_string()); } pub fn debug_print(&self) { @@ -213,9 +304,8 @@ impl DuplicateFinder { println!("Number of all checked files - {}", self.number_of_checked_files); println!("Number of all checked folders - {}", self.number_of_checked_folders); println!("Number of all ignored things - {}", self.number_of_ignored_things); - println!("Number of all files with duplicates - {}", self.number_of_files_which_has_duplicated_entries); println!("Number of duplicated files - {}", self.number_of_duplicated_files); - println!("Files list - {}", self.files.len()); + println!("Files list - {}", self.files_size.len()); println!("Excluded directories - {:?}", self.excluded_directories); println!("Included directories - {:?}", self.included_directories); println!("-----------------------------------------"); @@ -225,6 +315,8 @@ impl DuplicateFinder { /// let df : DuplicateFinder = saf /// ``` pub fn optimize_directories(&mut self) { + let start_time: SystemTime = SystemTime::now(); + let mut optimized_included: Vec = Vec::::new(); let mut optimized_excluded: Vec = Vec::::new(); // Remove duplicated entries like: "/", "/" @@ -249,7 +341,7 @@ impl DuplicateFinder { break; } } - if is_inside == false { + if !is_inside { optimized_excluded.push(ed_checked.to_string()); } } @@ -266,7 +358,7 @@ impl DuplicateFinder { break; } } - if is_inside == false { + if !is_inside { optimized_included.push(id_checked.to_string()); } } @@ -329,7 +421,7 @@ impl DuplicateFinder { self.excluded_directories = optimized_excluded; // optimized_excluded = Vec::::new(); - if self.included_directories.len() == 0 { + if self.included_directories.is_empty() { println!("Optimize Directories ERROR: Excluded directories overlaps all included directories."); process::exit(1); } @@ -337,12 +429,24 @@ impl DuplicateFinder { // Not needed, but better is to have sorted everything self.excluded_directories.sort(); self.included_directories.sort(); + DuplicateFinder::print_time(start_time, SystemTime::now(), "optimize_directories".to_string()); } } +#[derive(Clone)] struct FileEntry { pub path: String, pub size: u64, pub created_date: SystemTime, pub modified_date: SystemTime, } +impl FileEntry { + // pub fn return_copy(&self) -> FileEntry { + // let new_copy : FileEntry = FileEntry{ + // path: self.path.to_string(), + // size: self.size, + // created_date: self.created_date, + // modified_date: self.modified_date + // }; + // } +} diff --git a/src/main.rs b/src/main.rs index c40d998..fbcf61f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,7 +27,9 @@ fn main() { df.set_include_directory(arguments[2].to_string()); df.optimize_directories(); df.debug_print(); - df.find_duplicates(); + df.find_duplicates_by_size(); + df.remove_files_with_unique_size(); + df.find_duplicates_by_hashing(); } argum => println!("{} argument is not supported, check help for more info.", argum), };