1
0
Fork 0
mirror of synced 2024-04-30 18:43:25 +12:00

Added checking for duplicates by checking hash of first 1MB of file

This commit is contained in:
Rafał Mikrut 2020-09-26 20:46:35 +02:00
parent 4fb2a0cc92
commit 338352f384
5 changed files with 28 additions and 19 deletions

View file

@ -109,6 +109,8 @@ fn main() {
df.set_check_method(duplicate::CheckingMethod::Size); df.set_check_method(duplicate::CheckingMethod::Size);
} else if argument_name == "hash" { } else if argument_name == "hash" {
df.set_check_method(duplicate::CheckingMethod::Hash); df.set_check_method(duplicate::CheckingMethod::Hash);
} else if argument_name == "hashmb" {
df.set_check_method(duplicate::CheckingMethod::HashMB);
} else { } else {
println!("-l can only have values hash or size"); println!("-l can only have values hash or size");
process::exit(1); process::exit(1);
@ -272,7 +274,7 @@ Usage of Czkawka:
-f file_to_save - saves results to file -f file_to_save - saves results to file
-s min_size - minimum size of checked files in bytes, assigning bigger value may speed up searching. -s min_size - minimum size of checked files in bytes, assigning bigger value may speed up searching.
-x allowed_extension - list of checked extension, e.g. "jpg,mp4" will allow to check "book.jpg" and "car.mp4" but not roman.png. There are also helpful macros which allow to easy use a typcal extension like IMAGE("jpg,kra,gif,png,bmp,tiff,webp,hdr,svg") or TEXT("txt,doc,docx,odt,rtf") -x allowed_extension - list of checked extension, e.g. "jpg,mp4" will allow to check "book.jpg" and "car.mp4" but not roman.png. There are also helpful macros which allow to easy use a typcal extension like IMAGE("jpg,kra,gif,png,bmp,tiff,webp,hdr,svg") or TEXT("txt,doc,docx,odt,rtf")
-l type_of_search - allows to use fastest which takes into account only size, and more accurate which check if file contnet is same(hashes). -l type_of_search - allows to use fastest which takes into account only size(SIZE), more accurate which takes into account hash of only first 1MB of file(HASHMB) or fully accurate(the slowest solution) which check hash of all file(HASH).
-delete - delete found files, by default remove all except the most oldest one, it can take arguments: aen(All except newest one), aeo(All except oldest one), on(Only one newest), oo(Only one oldest) -delete - delete found files, by default remove all except the most oldest one, it can take arguments: aen(All except newest one), aeo(All except oldest one), on(Only one newest), oo(Only one oldest)
Usage example: Usage example:

View file

@ -87,7 +87,6 @@ impl BigFile {
&self.information &self.information
} }
pub fn set_recursive_search(&mut self, recursive_search: bool) { pub fn set_recursive_search(&mut self, recursive_search: bool) {
self.recursive_search = recursive_search; self.recursive_search = recursive_search;
} }
@ -208,14 +207,14 @@ impl BigFile {
Ok(t) => t, Ok(t) => t,
Err(_) => { Err(_) => {
self.text_messages.warnings.push("Unable to get creation date from file ".to_string() + current_file_name.as_str()); self.text_messages.warnings.push("Unable to get creation date from file ".to_string() + current_file_name.as_str());
continue continue;
} // Permissions Denied } // Permissions Denied
}, },
modified_date: match metadata.modified() { modified_date: match metadata.modified() {
Ok(t) => t, Ok(t) => t,
Err(_) => { Err(_) => {
self.text_messages.warnings.push("Unable to get modification date from file ".to_string() + current_file_name.as_str()); self.text_messages.warnings.push("Unable to get modification date from file ".to_string() + current_file_name.as_str());
continue continue;
} // Permissions Denied } // Permissions Denied
}, },
}; };
@ -234,8 +233,6 @@ impl BigFile {
} }
} }
// Extract n biggest files to new TreeMap // Extract n biggest files to new TreeMap
let mut new_map: BTreeMap<u64, Vec<FileEntry>> = Default::default(); let mut new_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
@ -376,7 +373,7 @@ impl PrintResults for BigFile {
for (size, vector) in self.big_files.iter().rev() { for (size, vector) in self.big_files.iter().rev() {
// TODO Align all to same width // TODO Align all to same width
for entry in vector { for entry in vector {
println!("{} ({}) - {}", size.file_size(options::BINARY).unwrap(), size, entry.path); println!("{} ({} bytes) - {}", size.file_size(options::BINARY).unwrap(), size, entry.path);
} }
} }
Common::print_time(start_time, SystemTime::now(), "print_duplicated_entries".to_string()); Common::print_time(start_time, SystemTime::now(), "print_duplicated_entries".to_string());

View file

@ -12,11 +12,14 @@ use crate::common_items::ExcludedItems;
use crate::common_messages::Messages; use crate::common_messages::Messages;
use crate::common_traits::*; use crate::common_traits::*;
const HASH_MB_LIMIT_BYTES: u64 = 1024 * 1024; // 1MB
#[derive(PartialEq, Eq, Clone, Debug)] #[derive(PartialEq, Eq, Clone, Debug)]
pub enum CheckingMethod { pub enum CheckingMethod {
None, None,
Size, Size,
Hash, Hash,
HashMB,
} }
#[derive(Eq, PartialEq, Clone, Debug)] #[derive(Eq, PartialEq, Clone, Debug)]
@ -116,7 +119,7 @@ impl DuplicateFinder {
pub fn find_duplicates(&mut self) { pub fn find_duplicates(&mut self) {
self.directories.optimize_directories(self.recursive_search, &mut self.text_messages); self.directories.optimize_directories(self.recursive_search, &mut self.text_messages);
self.check_files_size(); self.check_files_size();
if self.check_method == CheckingMethod::Hash { if self.check_method == CheckingMethod::Hash || self.check_method == CheckingMethod::HashMB {
self.check_files_hash(); self.check_files_hash();
} }
self.delete_files(); self.delete_files();
@ -354,6 +357,7 @@ impl DuplicateFinder {
let mut hasher: blake3::Hasher = blake3::Hasher::new(); let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 16384]; let mut buffer = [0u8; 16384];
let mut readed_bytes: u64 = 0;
loop { loop {
let n = match file_handler.read(&mut buffer) { let n = match file_handler.read(&mut buffer) {
Ok(t) => t, Ok(t) => t,
@ -366,8 +370,14 @@ impl DuplicateFinder {
if n == 0 { if n == 0 {
break; break;
} }
readed_bytes += n as u64;
self.information.bytes_read_when_hashing += n as u64; self.information.bytes_read_when_hashing += n as u64;
hasher.update(&buffer[..n]); hasher.update(&buffer[..n]);
if self.check_method == CheckingMethod::HashMB && readed_bytes >= HASH_MB_LIMIT_BYTES {
break;
}
} }
if !error_reading_file { if !error_reading_file {
let hash_string: String = hasher.finalize().to_hex().to_string(); let hash_string: String = hasher.finalize().to_hex().to_string();
@ -400,8 +410,8 @@ impl DuplicateFinder {
let start_time: SystemTime = SystemTime::now(); let start_time: SystemTime = SystemTime::now();
match self.check_method { match self.check_method {
CheckingMethod::Hash => { CheckingMethod::Hash | CheckingMethod::HashMB => {
for (_size, vector_vectors) in &self.files_with_identical_hashes { for vector_vectors in self.files_with_identical_hashes.values() {
for vector in vector_vectors.iter() { for vector in vector_vectors.iter() {
let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages.warnings); let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages.warnings);
self.information.gained_space += tuple.0; self.information.gained_space += tuple.0;
@ -411,7 +421,7 @@ impl DuplicateFinder {
} }
} }
CheckingMethod::Size => { CheckingMethod::Size => {
for (_size, vector) in &self.files_with_identical_size { for vector in self.files_with_identical_size.values() {
let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages.warnings); let tuple: (u64, usize, usize) = delete_files(vector, &self.delete_method, &mut self.text_messages.warnings);
self.information.gained_space += tuple.0; self.information.gained_space += tuple.0;
self.information.number_of_removed_files += tuple.1; self.information.number_of_removed_files += tuple.1;
@ -570,7 +580,7 @@ impl PrintResults for DuplicateFinder {
let mut number_of_groups: u64 = 0; let mut number_of_groups: u64 = 0;
match self.check_method { match self.check_method {
CheckingMethod::Hash => { CheckingMethod::Hash | CheckingMethod::HashMB => {
for (_size, vector) in self.files_with_identical_hashes.iter() { for (_size, vector) in self.files_with_identical_hashes.iter() {
for j in vector { for j in vector {
number_of_files += j.len() as u64; number_of_files += j.len() as u64;

View file

@ -97,7 +97,7 @@ impl EmptyFolder {
fn optimize_folders(&mut self) { fn optimize_folders(&mut self) {
let mut new_directory_folders: BTreeMap<String, FolderEntry> = Default::default(); let mut new_directory_folders: BTreeMap<String, FolderEntry> = Default::default();
for (name,folder_entry) in &self.empty_folder_list { for (name, folder_entry) in &self.empty_folder_list {
match &folder_entry.parent_path { match &folder_entry.parent_path {
Some(t) => { Some(t) => {
if !self.empty_folder_list.contains_key(t) { if !self.empty_folder_list.contains_key(t) {
@ -212,7 +212,7 @@ impl EmptyFolder {
// Now we check if checked folders are really empty, and if are, then // Now we check if checked folders are really empty, and if are, then
if initial_checking { if initial_checking {
// We need to set empty folder list // We need to set empty folder list
for (name,folder_entry) in folders_checked { for (name, folder_entry) in folders_checked {
if folder_entry.is_empty != FolderEmptiness::No { if folder_entry.is_empty != FolderEmptiness::No {
self.empty_folder_list.insert(name, folder_entry); self.empty_folder_list.insert(name, folder_entry);
} }
@ -220,7 +220,7 @@ impl EmptyFolder {
} else { } else {
// We need to check if parent of folder isn't also empty, because we want to delete only parent with two empty folders except this folders and at the end parent folder // We need to check if parent of folder isn't also empty, because we want to delete only parent with two empty folders except this folders and at the end parent folder
let mut new_folders_list: BTreeMap<String, FolderEntry> = Default::default(); let mut new_folders_list: BTreeMap<String, FolderEntry> = Default::default();
for (name,folder_entry) in folders_checked { for (name, folder_entry) in folders_checked {
if folder_entry.is_empty != FolderEmptiness::No && self.empty_folder_list.contains_key(&name) { if folder_entry.is_empty != FolderEmptiness::No && self.empty_folder_list.contains_key(&name) {
new_folders_list.insert(name, folder_entry); new_folders_list.insert(name, folder_entry);
} }
@ -235,10 +235,10 @@ impl EmptyFolder {
fn delete_empty_folders(&mut self) { fn delete_empty_folders(&mut self) {
let start_time: SystemTime = SystemTime::now(); let start_time: SystemTime = SystemTime::now();
// Folders may already be deleted or may require higher privileges // Folders may already be deleted or may require higher privileges
for (name,_folder_entry) in &self.empty_folder_list { for name in self.empty_folder_list.keys() {
match fs::remove_dir_all(name) { match fs::remove_dir_all(name) {
Ok(_) => (), Ok(_) => (),
Err(_) => self.text_messages.warnings.push(format!("Failed to remove folder {}",name)), Err(_) => self.text_messages.warnings.push(format!("Failed to remove folder {}", name)),
}; };
} }

View file

@ -279,7 +279,7 @@ fn main() {
let duplicates_group: usize; let duplicates_group: usize;
match check_method { match check_method {
CheckingMethod::Hash => { CheckingMethod::Hash | CheckingMethod::HashMB => {
duplicates_number = information.number_of_duplicated_files_by_hash; duplicates_number = information.number_of_duplicated_files_by_hash;
duplicates_size = information.lost_space_by_hash; duplicates_size = information.lost_space_by_hash;
duplicates_group = information.number_of_groups_by_hash; duplicates_group = information.number_of_groups_by_hash;
@ -315,7 +315,7 @@ fn main() {
let col_indices = [0, 1, 2, 3]; let col_indices = [0, 1, 2, 3];
match check_method { match check_method {
CheckingMethod::Hash => { CheckingMethod::Hash | CheckingMethod::HashMB => {
let btreemap = df.get_files_sorted_by_hash(); let btreemap = df.get_files_sorted_by_hash();
for (size, vectors_vector) in btreemap.iter().rev() { for (size, vectors_vector) in btreemap.iter().rev() {