1
0
Fork 0
mirror of synced 2024-04-27 17:22:13 +12:00

Add pre hash check (#83)

This commit is contained in:
Rafał Mikrut 2020-10-24 00:56:39 -04:00 committed by GitHub
parent 8ecde0fc9a
commit d996c3c46b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 69 additions and 9 deletions

View file

@@ -33,7 +33,7 @@ pub enum DeleteMethod {
OneNewest,
}
#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct FileEntry {
pub path: PathBuf,
pub size: u64,
@@ -51,7 +51,10 @@ pub struct Info {
pub number_of_duplicated_files_by_size: usize,
pub number_of_groups_by_hash: usize,
pub number_of_duplicated_files_by_hash: usize,
pub number_of_duplicated_files_after_pre_hash: usize,
pub number_of_groups_after_pre_hash: usize,
pub lost_space_by_size: u64,
pub lost_space_after_pre_hash: u64,
pub lost_space_by_hash: u64,
pub bytes_read_when_hashing: u64,
pub number_of_removed_files: usize,
@@ -208,21 +211,21 @@ impl DuplicateFinder {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Cannot read entry in dir {}", current_folder.display()));
continue;
continue 'dir;
} //Permissions denied
};
let metadata: Metadata = match entry_data.metadata() {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Cannot read metadata in dir {}", current_folder.display()));
continue;
continue 'dir;
} //Permissions denied
};
if metadata.is_dir() {
self.information.number_of_checked_folders += 1;
if !self.recursive_search {
continue;
continue 'dir;
}
let next_folder = current_folder.join(entry_data.file_name());
@@ -239,7 +242,7 @@ impl DuplicateFinder {
// let mut have_valid_extension: bool;
let file_name_lowercase: String = match entry_data.file_name().into_string() {
Ok(t) => t,
Err(_) => continue,
Err(_) => continue 'dir,
}
.to_lowercase();
@@ -273,7 +276,7 @@ impl DuplicateFinder {
},
Err(_) => {
self.text_messages.warnings.push(format!("Unable to get modification date from file {}", current_file_name.display()));
continue;
continue 'dir;
} // Permissions Denied
},
};
@@ -318,10 +321,58 @@ impl DuplicateFinder {
let start_time: SystemTime = SystemTime::now();
let mut file_handler: File;
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>>;
let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
// 1 step - check only small part of file hash
for (size, vector) in &self.files_with_identical_size {
hashmap_with_hash = Default::default();
for file_entry in vector {
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
return false;
}
file_handler = match File::open(&file_entry.path) {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Unable to check hash of file {}", file_entry.path.display()));
continue;
}
};
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 1024 * 2];
let n = match file_handler.read(&mut buffer) {
Ok(t) => t,
Err(_) => {
self.text_messages.warnings.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
continue;
}
};
self.information.bytes_read_when_hashing += n as u64;
hasher.update(&buffer[..n]);
let hash_string: String = hasher.finalize().to_hex().to_string();
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
}
for (_string, mut vector) in hashmap_with_hash {
if vector.len() > 1 {
pre_checked_map.entry(*size).or_insert_with(Vec::new);
pre_checked_map.get_mut(size).unwrap().append(&mut vector);
}
}
}
for (size, vector) in pre_checked_map.iter() {
self.information.number_of_duplicated_files_after_pre_hash += vector.len() - 1;
self.information.number_of_groups_after_pre_hash += 1;
self.information.lost_space_after_pre_hash += (vector.len() as u64 - 1) * size;
}
// 2 step - Check full file hash
for (size, vector) in &pre_checked_map {
hashmap_with_hash = Default::default();
for file_entry in vector {
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
return false;
@@ -337,7 +388,7 @@ impl DuplicateFinder {
let mut error_reading_file: bool = false;
let mut hasher: blake3::Hasher = blake3::Hasher::new();
let mut buffer = [0u8; 16384];
let mut buffer = [0u8; 32 * 1024];
let mut read_bytes: u64 = 0;
loop {
let n = match file_handler.read(&mut buffer) {
@@ -448,11 +499,20 @@ impl DebugPrint for DuplicateFinder {
"Number of duplicated files by size(in groups) - {} ({})",
self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size
);
println!(
"Number of duplicated files after pre hash(in groups) - {} ({})",
self.information.number_of_duplicated_files_after_pre_hash, self.information.number_of_groups_after_pre_hash
);
println!(
"Number of duplicated files by hash(in groups) - {} ({})",
self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash
);
println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size);
println!(
"Lost space after pre hash - {} ({} bytes)",
self.information.lost_space_after_pre_hash.file_size(options::BINARY).unwrap(),
self.information.lost_space_after_pre_hash
);
println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash);
println!(
"Gained space by removing duplicated entries - {} ({} bytes)",

View file

@@ -1583,7 +1583,7 @@ fn main() {
for (size, vectors_vector) in btreemap.iter().rev() {
for vector in vectors_vector {
let values: [&dyn ToValue; 6] = [
&(vector.len().to_string() + " x " + size.to_string().as_str()),
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
&"".to_string(), // No text in 3 column
&(0), // Not used here
@@ -1611,7 +1611,7 @@ fn main() {
for (size, vector) in btreemap.iter().rev() {
let values: [&dyn ToValue; 6] = [
&(vector.len().to_string() + " x " + size.to_string().as_str()),
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
&"".to_string(), // No text in 3 column
&(0), // Not used here