// Todo, należy upewnić się, że ma wystarczające uprawnienia do odczytu i usuwania use std::collections::HashMap; use std::fs::{File, Metadata}; use std::hash::Hash; use std::io::prelude::*; use std::path::Path; use std::time::SystemTime; use std::{fs, process}; pub struct DuplicateFinder { number_of_checked_files: usize, number_of_checked_folders: usize, number_of_ignored_things: usize, number_of_duplicated_files: usize, // files : Vec>>, files_size: HashMap>, // files_hashes: HashMap<[u8],Vec>, // duplicated_entries // Same as files, but only with 2+ entries // files : Vec>, excluded_directories: Vec, included_directories: Vec, // ignored_extensions: Vec, // allowed_extensions: Vec, // ignored_file_names: Vec, // TODO Regex Support // allowed_file_names: Vec, // TODO Regex Support } impl DuplicateFinder { pub fn new() -> DuplicateFinder { DuplicateFinder { number_of_checked_files: 0, number_of_checked_folders: 0, number_of_ignored_things: 0, number_of_duplicated_files: 0, files_size: Default::default(), // files_hashes: Default::default(), excluded_directories: vec![], included_directories: vec![], // ignored_extensions: vec![], // allowed_extensions: vec![], // ignored_file_names: vec![], // allowed_file_names: vec![] } } // TODO - Still isn't used but it will be probably required with GUI // pub fn clear(&mut self) { // // self.number_of_checked_files = 0; // self.number_of_checked_folders = 0; // self.number_of_ignored_things = 0; // self.number_of_files_which_has_duplicated_entries = 0; // self.number_of_duplicated_files = 0; // self.files_sizeclear(); // self.excluded_directories.clear(); // self.included_directories.clear(); // } pub fn find_duplicates_by_size(&mut self) { // TODO add multithread checking for file hash //let mut path; let start_time: SystemTime = SystemTime::now(); let mut folders_to_check: Vec = Vec::with_capacity(1024 * 16); // This should be small enough too not see to big difference and big enough to store most of paths without needing to resize vector // Add root folders for finding for id in &self.included_directories { folders_to_check.push(id.to_string()); } let mut current_folder: String; let mut next_folder: String; while !folders_to_check.is_empty() { current_folder = folders_to_check.pop().unwrap(); let read_dir = fs::read_dir(¤t_folder); let read_dir = match read_dir { Ok(t) => t, _ => continue, }; for entry in read_dir { let entry_data = entry.unwrap(); let metadata: Metadata = entry_data.metadata().unwrap(); if metadata.is_dir() { let mut is_excluded_dir = false; next_folder = "".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap() + "/"; for ed in &self.excluded_directories { if next_folder == ed.to_string() { is_excluded_dir = true; break; } } if !is_excluded_dir { folders_to_check.push(next_folder); } self.number_of_checked_folders += 1; //println!("Directory\t - {:?}", next_folder); // DEBUG } else if metadata.is_file() { let current_file_name = "".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap(); // println!("File\t\t - {:?}", current_file_name); // DEBUG //file_to_check let fe: FileEntry = FileEntry { path: current_file_name, size: metadata.len(), created_date: metadata.created().unwrap(), modified_date: metadata.modified().unwrap(), }; if !self.files_size.contains_key(&metadata.len()) { self.files_size.insert(metadata.len(), Vec::new()); } self.files_size.get_mut(&metadata.len()).unwrap().push(fe); self.number_of_checked_files += 1; } else { // Probably this is symbolic links so we are free to ignore this // println!("Found another type of file {} {:?}","".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap(), metadata) //DEBUG self.number_of_ignored_things += 1; } } } self.debug_print(); DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates".to_string()); //println!("Duration of finding duplicates {:?}", end_time.duration_since(start_time).expect("a")); } // pub fn save_to_file(&self) {} /// Remove files which have unique size pub fn remove_files_with_unique_size(&mut self) { let start_time: SystemTime = SystemTime::now(); self.debug_print(); let mut new_hashmap: HashMap> = Default::default(); self.number_of_duplicated_files = 0; for entry in &self.files_size { if entry.1.len() > 1 { self.number_of_duplicated_files += entry.1.len() - 1; new_hashmap.insert(*entry.0, entry.1.clone()); } } self.files_size = new_hashmap; self.debug_print(); DuplicateFinder::print_time(start_time, SystemTime::now(), "optimize_files".to_string()); } /// Should be slower than checking in different ways, but still needs to be checked pub fn find_duplicates_by_hashing(mut self) { let start_time: SystemTime = SystemTime::now(); let mut file_handler: File; for entry in self.files_size { let mut hashes: Vec = Vec::new(); if entry.1.len() > 5 { println!("{}", entry.1.len()); } for file_entry in entry.1.iter().enumerate() { file_handler = match File::open(&file_entry.1.path) { Ok(T) => T, Err(_) => { // Removing File may happens,so we should handle this hashes.push("".to_owned()); continue; } }; let mut hasher: blake3::Hasher = blake3::Hasher::new(); let mut buffer = [0u8; 16384]; loop { let n = file_handler.read(&mut buffer).unwrap(); if n == 0 { break; } hasher.update(&buffer[..n]); } //println!("{}", hasher.finalize().to_hex().to_string()); } } DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates_by_hashing".to_string()); } // /// I'mm not sure about performance, so maybe I // pub fn find_small_duplicates_by_hashing(mut self){ // let start_time: SystemTime = SystemTime::now(); // let size_limit_for_small_files u64 = // 16 MB // let mut new_hashmap // // DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates_by_comparting_begin_bytes_of_file".to_string()); // } pub fn print_time(start_time: SystemTime, end_time: SystemTime, function_name: String) { println!( "Execution of function \"{}\" took {:?}", function_name, end_time.duration_since(start_time).expect("Time cannot go reverse.") ); } /// Setting include directories, panics when there is not directories available pub fn set_include_directory(&mut self, mut include_directory: String) { let start_time: SystemTime = SystemTime::now(); if include_directory.is_empty() { println!("At least one directory must be provided") } include_directory = include_directory.replace("\"", ""); let directories: Vec = include_directory.split(',').map(String::from).collect(); let mut checked_directories: Vec = Vec::new(); for directory in directories { if directory == "/" { println!("Using / is probably not good idea, you may go out of ram."); } if directory.contains('*') { println!("Include Directory ERROR: Wildcards are not supported, please don't use it."); process::exit(1); } if directory.starts_with('~') { println!("Include Directory ERROR: ~ in path isn't supported."); process::exit(1); } if !directory.starts_with('/') { println!("Include Directory ERROR: Relative path are not supported."); process::exit(1); } if !Path::new(&directory).exists() { println!("Include Directory ERROR: Path {} doens't exists.", directory); process::exit(1); } if !Path::new(&directory).exists() { println!("Include Directory ERROR: {} isn't folder.", directory); process::exit(1); } // directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho if !directory.ends_with('/') { checked_directories.push(directory + "/"); } else { checked_directories.push(directory); } } if checked_directories.is_empty() { println!("Not found even one correct path to include."); process::exit(1); } self.included_directories = checked_directories; DuplicateFinder::print_time(start_time, SystemTime::now(), "set_include_directory".to_string()); } pub fn set_exclude_directory(&mut self, mut exclude_directory: String) { let start_time: SystemTime = SystemTime::now(); if exclude_directory.is_empty() { return; } exclude_directory = exclude_directory.replace("\"", ""); let directories: Vec = exclude_directory.split(',').map(String::from).collect(); let mut checked_directories: Vec = Vec::new(); for directory in directories { if directory == "/" { println!("Exclude Directory ERROR: Excluding / is pointless, because it means that no files will be scanned."); } if directory.contains('*') { println!("Exclude Directory ERROR: Wildcards are not supported, please don't use it."); process::exit(1); } if directory.starts_with('~') { println!("Exclude Directory ERROR: ~ in path isn't supported."); process::exit(1); } if !directory.starts_with('/') { println!("Exclude Directory ERROR: Relative path are not supported."); process::exit(1); } if !Path::new(&directory).exists() { println!("Exclude Directory ERROR: Path {} doens't exists.", directory); process::exit(1); } if !Path::new(&directory).exists() { println!("Exclude Directory ERROR: {} isn't folder.", directory); process::exit(1); } // directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho if !directory.ends_with('/') { checked_directories.push(directory + "/"); } else { checked_directories.push(directory); } } self.excluded_directories = checked_directories; DuplicateFinder::print_time(start_time, SystemTime::now(), "set_exclude_directory".to_string()); } pub fn debug_print(&self) { println!("---------------DEBUG PRINT---------------"); println!("Number of all checked files - {}", self.number_of_checked_files); println!("Number of all checked folders - {}", self.number_of_checked_folders); println!("Number of all ignored things - {}", self.number_of_ignored_things); println!("Number of duplicated files - {}", self.number_of_duplicated_files); println!("Files list - {}", self.files_size.len()); println!("Excluded directories - {:?}", self.excluded_directories); println!("Included directories - {:?}", self.included_directories); println!("-----------------------------------------"); } /// Remove unused entries when included or excluded overlaps with each other or are duplicated /// ``` /// let df : DuplicateFinder = saf /// ``` pub fn optimize_directories(&mut self) { let start_time: SystemTime = SystemTime::now(); let mut optimized_included: Vec = Vec::::new(); let mut optimized_excluded: Vec = Vec::::new(); // Remove duplicated entries like: "/", "/" self.excluded_directories.sort(); self.included_directories.sort(); self.excluded_directories.dedup(); self.included_directories.dedup(); // Optimize for duplicated included directories - "/", "/home". "/home/Pulpit" to "/"- TODO let mut is_inside: bool; for ed_checked in &self.excluded_directories { is_inside = false; for ed_help in &self.excluded_directories { if ed_checked == ed_help { // We checking same element continue; } if ed_checked.starts_with(ed_help) { is_inside = true; break; } } if !is_inside { optimized_excluded.push(ed_checked.to_string()); } } for id_checked in &self.included_directories { is_inside = false; for id_help in &self.included_directories { if id_checked == id_help { // We checking same element continue; } if id_checked.starts_with(id_help) { is_inside = true; break; } } if !is_inside { optimized_included.push(id_checked.to_string()); } } self.included_directories = optimized_included; optimized_included = Vec::::new(); self.excluded_directories = optimized_excluded; optimized_excluded = Vec::::new(); // Remove include directories which are inside any exclude directory for id in &self.included_directories { let mut is_inside: bool = false; for ed in &self.excluded_directories { if id.starts_with(ed) { is_inside = true; break; } } if !is_inside { optimized_included.push(id.to_string()); } } self.included_directories = optimized_included; optimized_included = Vec::::new(); // Remove non existed directories for id in &self.included_directories { let path = Path::new(id); if path.exists() { optimized_included.push(id.to_string()); } } for ed in &self.excluded_directories { let path = Path::new(ed); if path.exists() { optimized_excluded.push(ed.to_string()); } } self.included_directories = optimized_included; // optimized_included = Vec::::new(); self.excluded_directories = optimized_excluded; optimized_excluded = Vec::::new(); // Excluded paths must are inside include path, because TODO for ed in &self.excluded_directories { let mut is_inside: bool = false; for id in &self.included_directories { if ed.starts_with(id) { is_inside = true; break; } } if is_inside { optimized_excluded.push(ed.to_string()); } } self.excluded_directories = optimized_excluded; // optimized_excluded = Vec::::new(); if self.included_directories.is_empty() { println!("Optimize Directories ERROR: Excluded directories overlaps all included directories."); process::exit(1); } // Not needed, but better is to have sorted everything self.excluded_directories.sort(); self.included_directories.sort(); DuplicateFinder::print_time(start_time, SystemTime::now(), "optimize_directories".to_string()); } } #[derive(Clone)] struct FileEntry { pub path: String, pub size: u64, pub created_date: SystemTime, pub modified_date: SystemTime, } impl FileEntry { // pub fn return_copy(&self) -> FileEntry { // let new_copy : FileEntry = FileEntry{ // path: self.path.to_string(), // size: self.size, // created_date: self.created_date, // modified_date: self.modified_date // }; // } }