2020-08-27 06:50:07 +12:00
// TODO: make sure the program has sufficient permissions to read and delete files
use std ::collections ::HashMap ;
2020-08-29 06:30:22 +12:00
use std ::fs ::{ File , Metadata } ;
use std ::hash ::Hash ;
use std ::io ::prelude ::* ;
2020-08-28 04:57:56 +12:00
use std ::path ::Path ;
use std ::time ::SystemTime ;
use std ::{ fs , process } ;
2020-08-27 06:50:07 +12:00
pub struct DuplicateFinder {
2020-08-29 06:30:22 +12:00
number_of_checked_files : usize ,
number_of_checked_folders : usize ,
number_of_ignored_things : usize ,
number_of_duplicated_files : usize ,
2020-08-27 06:50:07 +12:00
// files : Vec<HashMap<FileEntry, Vec<FileEntry>>>,
2020-08-29 06:30:22 +12:00
files_size : HashMap < u64 , Vec < FileEntry > > ,
// files_hashes: HashMap<[u8],Vec<FileEntry>>,
2020-08-28 04:57:56 +12:00
// duplicated_entries // Same as files, but only with 2+ entries
2020-08-27 06:50:07 +12:00
// files : Vec<Vec<FileEntry>>,
excluded_directories : Vec < String > ,
included_directories : Vec < String > ,
2020-08-29 06:30:22 +12:00
// ignored_extensions: Vec<String>,
// allowed_extensions: Vec<String>,
// ignored_file_names: Vec<String>, // TODO Regex Support
// allowed_file_names: Vec<String>, // TODO Regex Support
2020-08-27 06:50:07 +12:00
}
impl DuplicateFinder {
pub fn new ( ) -> DuplicateFinder {
DuplicateFinder {
number_of_checked_files : 0 ,
2020-08-28 04:57:56 +12:00
number_of_checked_folders : 0 ,
number_of_ignored_things : 0 ,
2020-08-27 06:50:07 +12:00
number_of_duplicated_files : 0 ,
2020-08-29 06:30:22 +12:00
files_size : Default ::default ( ) ,
// files_hashes: Default::default(),
2020-08-27 06:50:07 +12:00
excluded_directories : vec ! [ ] ,
included_directories : vec ! [ ] ,
2020-08-29 06:30:22 +12:00
// ignored_extensions: vec![],
// allowed_extensions: vec![],
// ignored_file_names: vec![],
// allowed_file_names: vec![]
2020-08-27 06:50:07 +12:00
}
}
2020-08-29 06:30:22 +12:00
// TODO - Not used yet, but it will probably be required by the GUI
2020-08-27 19:42:37 +12:00
// pub fn clear(&mut self) {
2020-08-29 06:30:22 +12:00
//
2020-08-27 19:42:37 +12:00
// self.number_of_checked_files = 0;
2020-08-28 04:57:56 +12:00
// self.number_of_checked_folders = 0;
// self.number_of_ignored_things = 0;
2020-08-27 19:42:37 +12:00
// self.number_of_files_which_has_duplicated_entries = 0;
// self.number_of_duplicated_files = 0;
2020-08-29 06:30:22 +12:00
// self.files_size.clear();
2020-08-27 19:42:37 +12:00
// self.excluded_directories.clear();
// self.included_directories.clear();
// }
2020-08-29 06:30:22 +12:00
pub fn find_duplicates_by_size ( & mut self ) {
// TODO add multithread checking for file hash
2020-08-28 04:57:56 +12:00
//let mut path;
let start_time : SystemTime = SystemTime ::now ( ) ;
let mut folders_to_check : Vec < String > = Vec ::with_capacity ( 1024 * 16 ) ; // This should be small enough too not see to big difference and big enough to store most of paths without needing to resize vector
// Add root folders for finding
for id in & self . included_directories {
folders_to_check . push ( id . to_string ( ) ) ;
}
let mut current_folder : String ;
let mut next_folder : String ;
while ! folders_to_check . is_empty ( ) {
current_folder = folders_to_check . pop ( ) . unwrap ( ) ;
let read_dir = fs ::read_dir ( & current_folder ) ;
let read_dir = match read_dir {
Ok ( t ) = > t ,
_ = > continue ,
} ;
for entry in read_dir {
let entry_data = entry . unwrap ( ) ;
let metadata : Metadata = entry_data . metadata ( ) . unwrap ( ) ;
if metadata . is_dir ( ) {
let mut is_excluded_dir = false ;
next_folder = " " . to_owned ( ) + & current_folder + & entry_data . file_name ( ) . into_string ( ) . unwrap ( ) + " / " ;
for ed in & self . excluded_directories {
if next_folder = = ed . to_string ( ) {
is_excluded_dir = true ;
break ;
}
}
if ! is_excluded_dir {
folders_to_check . push ( next_folder ) ;
}
self . number_of_checked_folders + = 1 ;
//println!("Directory\t - {:?}", next_folder); // DEBUG
} else if metadata . is_file ( ) {
let current_file_name = " " . to_owned ( ) + & current_folder + & entry_data . file_name ( ) . into_string ( ) . unwrap ( ) ;
2020-08-29 06:30:22 +12:00
// println!("File\t\t - {:?}", current_file_name); // DEBUG
2020-08-28 04:57:56 +12:00
//file_to_check
let fe : FileEntry = FileEntry {
path : current_file_name ,
size : metadata . len ( ) ,
created_date : metadata . created ( ) . unwrap ( ) ,
modified_date : metadata . modified ( ) . unwrap ( ) ,
} ;
2020-08-29 06:30:22 +12:00
if ! self . files_size . contains_key ( & metadata . len ( ) ) {
self . files_size . insert ( metadata . len ( ) , Vec ::new ( ) ) ;
}
self . files_size . get_mut ( & metadata . len ( ) ) . unwrap ( ) . push ( fe ) ;
2020-08-28 04:57:56 +12:00
self . number_of_checked_files + = 1 ;
} else {
// Probably this is symbolic links so we are free to ignore this
// println!("Found another type of file {} {:?}","".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap(), metadata) //DEBUG
self . number_of_ignored_things + = 1 ;
}
}
}
self . debug_print ( ) ;
2020-08-29 06:30:22 +12:00
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " find_duplicates " . to_string ( ) ) ;
//println!("Duration of finding duplicates {:?}", end_time.duration_since(start_time).expect("a"));
2020-08-28 04:57:56 +12:00
}
2020-08-27 19:42:37 +12:00
// pub fn save_to_file(&self) {}
2020-08-29 06:30:22 +12:00
/// Remove files which have unique size
pub fn remove_files_with_unique_size ( & mut self ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
self . debug_print ( ) ;
let mut new_hashmap : HashMap < u64 , Vec < FileEntry > > = Default ::default ( ) ;
self . number_of_duplicated_files = 0 ;
for entry in & self . files_size {
if entry . 1. len ( ) > 1 {
self . number_of_duplicated_files + = entry . 1. len ( ) - 1 ;
new_hashmap . insert ( * entry . 0 , entry . 1. clone ( ) ) ;
}
}
self . files_size = new_hashmap ;
self . debug_print ( ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " optimize_files " . to_string ( ) ) ;
}
/// Should be slower than checking in different ways, but still needs to be checked
pub fn find_duplicates_by_hashing ( mut self ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
let mut file_handler : File ;
for entry in self . files_size {
let mut hashes : Vec < String > = Vec ::new ( ) ;
if entry . 1. len ( ) > 5 {
println! ( " {} " , entry . 1. len ( ) ) ;
}
for file_entry in entry . 1. iter ( ) . enumerate ( ) {
file_handler = match File ::open ( & file_entry . 1. path ) {
Ok ( T ) = > T ,
Err ( _ ) = > {
// Removing File may happens,so we should handle this
hashes . push ( " " . to_owned ( ) ) ;
continue ;
}
} ;
let mut hasher : blake3 ::Hasher = blake3 ::Hasher ::new ( ) ;
let mut buffer = [ 0 u8 ; 16384 ] ;
loop {
let n = file_handler . read ( & mut buffer ) . unwrap ( ) ;
if n = = 0 {
break ;
}
hasher . update ( & buffer [ .. n ] ) ;
}
//println!("{}", hasher.finalize().to_hex().to_string());
}
}
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " find_duplicates_by_hashing " . to_string ( ) ) ;
}
// /// I'm not sure about performance, so maybe I
// pub fn find_small_duplicates_by_hashing(mut self){
// let start_time: SystemTime = SystemTime::now();
// let size_limit_for_small_files u64 = // 16 MB
// let mut new_hashmap
//
// DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates_by_comparting_begin_bytes_of_file".to_string());
// }
/// Prints how long the named function took.
///
/// * `start_time` / `end_time` - wall-clock instants taken around the work.
/// * `function_name` - label shown in the message.
///
/// `SystemTime` is not monotonic: if the clock was set back between the two
/// measurements, `end_time` is earlier than `start_time`. In that case a zero
/// duration is reported instead of panicking.
pub fn print_time(start_time: SystemTime, end_time: SystemTime, function_name: String) {
    let elapsed = end_time.duration_since(start_time).unwrap_or_default();
    println!(" Execution of function \" {} \" took {:?} ", function_name, elapsed);
}
2020-08-27 19:42:37 +12:00
/// Setting include directories, panics when there is not directories available
2020-08-27 06:50:07 +12:00
pub fn set_include_directory ( & mut self , mut include_directory : String ) {
2020-08-29 06:30:22 +12:00
let start_time : SystemTime = SystemTime ::now ( ) ;
if include_directory . is_empty ( ) {
2020-08-27 06:50:07 +12:00
println! ( " At least one directory must be provided " )
}
include_directory = include_directory . replace ( " \" " , " " ) ;
2020-08-29 06:30:22 +12:00
let directories : Vec < String > = include_directory . split ( ',' ) . map ( String ::from ) . collect ( ) ;
2020-08-27 06:50:07 +12:00
let mut checked_directories : Vec < String > = Vec ::new ( ) ;
for directory in directories {
if directory = = " / " {
println! ( " Using / is probably not good idea, you may go out of ram. " ) ;
}
2020-08-29 06:30:22 +12:00
if directory . contains ( '*' ) {
2020-08-27 06:50:07 +12:00
println! ( " Include Directory ERROR: Wildcards are not supported, please don't use it. " ) ;
process ::exit ( 1 ) ;
}
2020-08-29 06:30:22 +12:00
if directory . starts_with ( '~' ) {
2020-08-27 06:50:07 +12:00
println! ( " Include Directory ERROR: ~ in path isn't supported. " ) ;
process ::exit ( 1 ) ;
}
2020-08-29 06:30:22 +12:00
if ! directory . starts_with ( '/' ) {
2020-08-27 06:50:07 +12:00
println! ( " Include Directory ERROR: Relative path are not supported. " ) ;
process ::exit ( 1 ) ;
}
2020-08-28 04:57:56 +12:00
if ! Path ::new ( & directory ) . exists ( ) {
println! ( " Include Directory ERROR: Path {} doens't exists. " , directory ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( " Include Directory ERROR: {} isn't folder. " , directory ) ;
process ::exit ( 1 ) ;
}
2020-08-27 19:42:37 +12:00
// directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho
2020-08-29 06:30:22 +12:00
if ! directory . ends_with ( '/' ) {
2020-08-27 19:42:37 +12:00
checked_directories . push ( directory + " / " ) ;
} else {
checked_directories . push ( directory ) ;
}
2020-08-27 06:50:07 +12:00
}
2020-08-29 06:30:22 +12:00
if checked_directories . is_empty ( ) {
2020-08-27 06:50:07 +12:00
println! ( " Not found even one correct path to include. " ) ;
process ::exit ( 1 ) ;
}
self . included_directories = checked_directories ;
2020-08-29 06:30:22 +12:00
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " set_include_directory " . to_string ( ) ) ;
2020-08-27 06:50:07 +12:00
}
pub fn set_exclude_directory ( & mut self , mut exclude_directory : String ) {
2020-08-29 06:30:22 +12:00
let start_time : SystemTime = SystemTime ::now ( ) ;
if exclude_directory . is_empty ( ) {
2020-08-27 06:50:07 +12:00
return ;
}
exclude_directory = exclude_directory . replace ( " \" " , " " ) ;
2020-08-29 06:30:22 +12:00
let directories : Vec < String > = exclude_directory . split ( ',' ) . map ( String ::from ) . collect ( ) ;
2020-08-27 06:50:07 +12:00
let mut checked_directories : Vec < String > = Vec ::new ( ) ;
for directory in directories {
if directory = = " / " {
println! ( " Exclude Directory ERROR: Excluding / is pointless, because it means that no files will be scanned. " ) ;
}
2020-08-29 06:30:22 +12:00
if directory . contains ( '*' ) {
2020-08-27 08:24:02 +12:00
println! ( " Exclude Directory ERROR: Wildcards are not supported, please don't use it. " ) ;
2020-08-27 06:50:07 +12:00
process ::exit ( 1 ) ;
}
2020-08-29 06:30:22 +12:00
if directory . starts_with ( '~' ) {
2020-08-27 08:24:02 +12:00
println! ( " Exclude Directory ERROR: ~ in path isn't supported. " ) ;
2020-08-27 06:50:07 +12:00
process ::exit ( 1 ) ;
}
2020-08-29 06:30:22 +12:00
if ! directory . starts_with ( '/' ) {
2020-08-27 08:24:02 +12:00
println! ( " Exclude Directory ERROR: Relative path are not supported. " ) ;
2020-08-27 06:50:07 +12:00
process ::exit ( 1 ) ;
}
2020-08-28 04:57:56 +12:00
if ! Path ::new ( & directory ) . exists ( ) {
println! ( " Exclude Directory ERROR: Path {} doens't exists. " , directory ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( " Exclude Directory ERROR: {} isn't folder. " , directory ) ;
process ::exit ( 1 ) ;
}
2020-08-27 19:42:37 +12:00
// directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho
2020-08-29 06:30:22 +12:00
if ! directory . ends_with ( '/' ) {
2020-08-27 19:42:37 +12:00
checked_directories . push ( directory + " / " ) ;
} else {
checked_directories . push ( directory ) ;
}
2020-08-27 06:50:07 +12:00
}
2020-08-27 08:24:02 +12:00
self . excluded_directories = checked_directories ;
2020-08-27 06:50:07 +12:00
2020-08-29 06:30:22 +12:00
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " set_exclude_directory " . to_string ( ) ) ;
2020-08-27 06:50:07 +12:00
}
2020-08-27 08:24:02 +12:00
2020-08-27 06:50:07 +12:00
pub fn debug_print ( & self ) {
2020-08-27 08:24:02 +12:00
println! ( " ---------------DEBUG PRINT--------------- " ) ;
2020-08-27 06:50:07 +12:00
println! ( " Number of all checked files - {} " , self . number_of_checked_files ) ;
2020-08-28 04:57:56 +12:00
println! ( " Number of all checked folders - {} " , self . number_of_checked_folders ) ;
println! ( " Number of all ignored things - {} " , self . number_of_ignored_things ) ;
2020-08-27 06:50:07 +12:00
println! ( " Number of duplicated files - {} " , self . number_of_duplicated_files ) ;
2020-08-29 06:30:22 +12:00
println! ( " Files list - {} " , self . files_size . len ( ) ) ;
2020-08-27 08:24:02 +12:00
println! ( " Excluded directories - {:?} " , self . excluded_directories ) ;
println! ( " Included directories - {:?} " , self . included_directories ) ;
println! ( " ----------------------------------------- " ) ;
}
2020-08-27 19:42:37 +12:00
/// Remove unused entries when included or excluded overlaps with each other or are duplicated
/// ```
/// let df : DuplicateFinder = saf
/// ```
pub fn optimize_directories ( & mut self ) {
2020-08-29 06:30:22 +12:00
let start_time : SystemTime = SystemTime ::now ( ) ;
2020-08-27 08:24:02 +12:00
let mut optimized_included : Vec < String > = Vec ::< String > ::new ( ) ;
let mut optimized_excluded : Vec < String > = Vec ::< String > ::new ( ) ;
// Remove duplicated entries like: "/", "/"
self . excluded_directories . sort ( ) ;
self . included_directories . sort ( ) ;
self . excluded_directories . dedup ( ) ;
self . included_directories . dedup ( ) ;
// Optimize for duplicated included directories - "/", "/home". "/home/Pulpit" to "/"- TODO
2020-08-27 19:42:37 +12:00
let mut is_inside : bool ;
for ed_checked in & self . excluded_directories {
is_inside = false ;
for ed_help in & self . excluded_directories {
if ed_checked = = ed_help {
// We checking same element
continue ;
}
if ed_checked . starts_with ( ed_help ) {
is_inside = true ;
break ;
}
}
2020-08-29 06:30:22 +12:00
if ! is_inside {
2020-08-27 19:42:37 +12:00
optimized_excluded . push ( ed_checked . to_string ( ) ) ;
}
}
for id_checked in & self . included_directories {
is_inside = false ;
for id_help in & self . included_directories {
if id_checked = = id_help {
// We checking same element
continue ;
}
if id_checked . starts_with ( id_help ) {
is_inside = true ;
break ;
}
}
2020-08-29 06:30:22 +12:00
if ! is_inside {
2020-08-27 19:42:37 +12:00
optimized_included . push ( id_checked . to_string ( ) ) ;
}
}
self . included_directories = optimized_included ;
optimized_included = Vec ::< String > ::new ( ) ;
self . excluded_directories = optimized_excluded ;
2020-08-28 04:57:56 +12:00
optimized_excluded = Vec ::< String > ::new ( ) ;
2020-08-27 08:24:02 +12:00
// Remove include directories which are inside any exclude directory
2020-08-28 04:57:56 +12:00
for id in & self . included_directories {
let mut is_inside : bool = false ;
for ed in & self . excluded_directories {
2020-08-27 08:24:02 +12:00
if id . starts_with ( ed ) {
2020-08-28 04:57:56 +12:00
is_inside = true ;
break ;
2020-08-27 08:24:02 +12:00
}
2020-08-28 04:57:56 +12:00
}
if ! is_inside {
2020-08-27 08:24:02 +12:00
optimized_included . push ( id . to_string ( ) ) ;
}
}
2020-08-28 04:57:56 +12:00
self . included_directories = optimized_included ;
optimized_included = Vec ::< String > ::new ( ) ;
// Remove non existed directories
for id in & self . included_directories {
let path = Path ::new ( id ) ;
if path . exists ( ) {
optimized_included . push ( id . to_string ( ) ) ;
}
}
for ed in & self . excluded_directories {
let path = Path ::new ( ed ) ;
if path . exists ( ) {
optimized_excluded . push ( ed . to_string ( ) ) ;
}
}
2020-08-27 08:24:02 +12:00
self . included_directories = optimized_included ;
2020-08-27 19:42:37 +12:00
// optimized_included = Vec::<String>::new();
2020-08-28 04:57:56 +12:00
self . excluded_directories = optimized_excluded ;
optimized_excluded = Vec ::< String > ::new ( ) ;
// Excluded paths must are inside include path, because TODO
for ed in & self . excluded_directories {
let mut is_inside : bool = false ;
for id in & self . included_directories {
if ed . starts_with ( id ) {
is_inside = true ;
break ;
}
}
if is_inside {
optimized_excluded . push ( ed . to_string ( ) ) ;
}
}
self . excluded_directories = optimized_excluded ;
// optimized_excluded = Vec::<String>::new();
2020-08-27 08:24:02 +12:00
2020-08-29 06:30:22 +12:00
if self . included_directories . is_empty ( ) {
2020-08-27 08:24:02 +12:00
println! ( " Optimize Directories ERROR: Excluded directories overlaps all included directories. " ) ;
process ::exit ( 1 ) ;
}
2020-08-27 19:42:37 +12:00
// Not needed, but better is to have sorted everything
self . excluded_directories . sort ( ) ;
self . included_directories . sort ( ) ;
2020-08-29 06:30:22 +12:00
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " optimize_directories " . to_string ( ) ) ;
2020-08-27 06:50:07 +12:00
}
}
2020-08-29 06:30:22 +12:00
/// One regular file recorded during the scan.
#[derive(Clone)]
struct FileEntry {
    /// Path of the file, built from the scanned folder and the entry name.
    pub path: String,
    /// File size in bytes.
    pub size: u64,
    /// Creation timestamp taken from the file metadata.
    pub created_date: SystemTime,
    /// Last-modification timestamp taken from the file metadata.
    pub modified_date: SystemTime,
}
2020-08-29 06:30:22 +12:00
impl FileEntry {
// No methods are defined yet. What follows is a commented-out draft of a
// manual copy helper, kept for reference.
// NOTE(review): the draft builds `new_copy` but never returns it, so it would
// not compile as written.
// pub fn return_copy(&self) -> FileEntry {
// let new_copy : FileEntry = FileEntry{
// path: self.path.to_string(),
// size: self.size,
// created_date: self.created_date,
// modified_date: self.modified_date
// };
// }
}