Add pre hash check (#83)
This commit is contained in:
parent
8ecde0fc9a
commit
d996c3c46b
|
@ -33,7 +33,7 @@ pub enum DeleteMethod {
|
||||||
OneNewest,
|
OneNewest,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct FileEntry {
|
pub struct FileEntry {
|
||||||
pub path: PathBuf,
|
pub path: PathBuf,
|
||||||
pub size: u64,
|
pub size: u64,
|
||||||
|
@ -51,7 +51,10 @@ pub struct Info {
|
||||||
pub number_of_duplicated_files_by_size: usize,
|
pub number_of_duplicated_files_by_size: usize,
|
||||||
pub number_of_groups_by_hash: usize,
|
pub number_of_groups_by_hash: usize,
|
||||||
pub number_of_duplicated_files_by_hash: usize,
|
pub number_of_duplicated_files_by_hash: usize,
|
||||||
|
pub number_of_duplicated_files_after_pre_hash: usize,
|
||||||
|
pub number_of_groups_after_pre_hash: usize,
|
||||||
pub lost_space_by_size: u64,
|
pub lost_space_by_size: u64,
|
||||||
|
pub lost_space_after_pre_hash: u64,
|
||||||
pub lost_space_by_hash: u64,
|
pub lost_space_by_hash: u64,
|
||||||
pub bytes_read_when_hashing: u64,
|
pub bytes_read_when_hashing: u64,
|
||||||
pub number_of_removed_files: usize,
|
pub number_of_removed_files: usize,
|
||||||
|
@ -208,21 +211,21 @@ impl DuplicateFinder {
|
||||||
Ok(t) => t,
|
Ok(t) => t,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
self.text_messages.warnings.push(format!("Cannot read entry in dir {}", current_folder.display()));
|
self.text_messages.warnings.push(format!("Cannot read entry in dir {}", current_folder.display()));
|
||||||
continue;
|
continue 'dir;
|
||||||
} //Permissions denied
|
} //Permissions denied
|
||||||
};
|
};
|
||||||
let metadata: Metadata = match entry_data.metadata() {
|
let metadata: Metadata = match entry_data.metadata() {
|
||||||
Ok(t) => t,
|
Ok(t) => t,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
self.text_messages.warnings.push(format!("Cannot read metadata in dir {}", current_folder.display()));
|
self.text_messages.warnings.push(format!("Cannot read metadata in dir {}", current_folder.display()));
|
||||||
continue;
|
continue 'dir;
|
||||||
} //Permissions denied
|
} //Permissions denied
|
||||||
};
|
};
|
||||||
if metadata.is_dir() {
|
if metadata.is_dir() {
|
||||||
self.information.number_of_checked_folders += 1;
|
self.information.number_of_checked_folders += 1;
|
||||||
|
|
||||||
if !self.recursive_search {
|
if !self.recursive_search {
|
||||||
continue;
|
continue 'dir;
|
||||||
}
|
}
|
||||||
|
|
||||||
let next_folder = current_folder.join(entry_data.file_name());
|
let next_folder = current_folder.join(entry_data.file_name());
|
||||||
|
@ -239,7 +242,7 @@ impl DuplicateFinder {
|
||||||
// let mut have_valid_extension: bool;
|
// let mut have_valid_extension: bool;
|
||||||
let file_name_lowercase: String = match entry_data.file_name().into_string() {
|
let file_name_lowercase: String = match entry_data.file_name().into_string() {
|
||||||
Ok(t) => t,
|
Ok(t) => t,
|
||||||
Err(_) => continue,
|
Err(_) => continue 'dir,
|
||||||
}
|
}
|
||||||
.to_lowercase();
|
.to_lowercase();
|
||||||
|
|
||||||
|
@ -273,7 +276,7 @@ impl DuplicateFinder {
|
||||||
},
|
},
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
self.text_messages.warnings.push(format!("Unable to get modification date from file {}", current_file_name.display()));
|
self.text_messages.warnings.push(format!("Unable to get modification date from file {}", current_file_name.display()));
|
||||||
continue;
|
continue 'dir;
|
||||||
} // Permissions Denied
|
} // Permissions Denied
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
@ -318,10 +321,58 @@ impl DuplicateFinder {
|
||||||
let start_time: SystemTime = SystemTime::now();
|
let start_time: SystemTime = SystemTime::now();
|
||||||
let mut file_handler: File;
|
let mut file_handler: File;
|
||||||
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>>;
|
let mut hashmap_with_hash: HashMap<String, Vec<FileEntry>>;
|
||||||
|
let mut pre_checked_map: BTreeMap<u64, Vec<FileEntry>> = Default::default();
|
||||||
|
|
||||||
|
// 1 step - check only small part of file hash
|
||||||
for (size, vector) in &self.files_with_identical_size {
|
for (size, vector) in &self.files_with_identical_size {
|
||||||
hashmap_with_hash = Default::default();
|
hashmap_with_hash = Default::default();
|
||||||
|
|
||||||
|
for file_entry in vector {
|
||||||
|
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
file_handler = match File::open(&file_entry.path) {
|
||||||
|
Ok(t) => t,
|
||||||
|
Err(_) => {
|
||||||
|
self.text_messages.warnings.push(format!("Unable to check hash of file {}", file_entry.path.display()));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut hasher: blake3::Hasher = blake3::Hasher::new();
|
||||||
|
let mut buffer = [0u8; 1024 * 2];
|
||||||
|
let n = match file_handler.read(&mut buffer) {
|
||||||
|
Ok(t) => t,
|
||||||
|
Err(_) => {
|
||||||
|
self.text_messages.warnings.push(format!("Error happened when checking hash of file {}", file_entry.path.display()));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
self.information.bytes_read_when_hashing += n as u64;
|
||||||
|
hasher.update(&buffer[..n]);
|
||||||
|
|
||||||
|
let hash_string: String = hasher.finalize().to_hex().to_string();
|
||||||
|
hashmap_with_hash.entry(hash_string.to_string()).or_insert_with(Vec::new);
|
||||||
|
hashmap_with_hash.get_mut(hash_string.as_str()).unwrap().push(file_entry.to_owned());
|
||||||
|
}
|
||||||
|
for (_string, mut vector) in hashmap_with_hash {
|
||||||
|
if vector.len() > 1 {
|
||||||
|
pre_checked_map.entry(*size).or_insert_with(Vec::new);
|
||||||
|
pre_checked_map.get_mut(size).unwrap().append(&mut vector);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (size, vector) in pre_checked_map.iter() {
|
||||||
|
self.information.number_of_duplicated_files_after_pre_hash += vector.len() - 1;
|
||||||
|
self.information.number_of_groups_after_pre_hash += 1;
|
||||||
|
self.information.lost_space_after_pre_hash += (vector.len() as u64 - 1) * size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2 step - Check full file hash
|
||||||
|
for (size, vector) in &pre_checked_map {
|
||||||
|
hashmap_with_hash = Default::default();
|
||||||
|
|
||||||
for file_entry in vector {
|
for file_entry in vector {
|
||||||
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
|
if rx.is_some() && rx.unwrap().try_recv().is_ok() {
|
||||||
return false;
|
return false;
|
||||||
|
@ -337,7 +388,7 @@ impl DuplicateFinder {
|
||||||
let mut error_reading_file: bool = false;
|
let mut error_reading_file: bool = false;
|
||||||
|
|
||||||
let mut hasher: blake3::Hasher = blake3::Hasher::new();
|
let mut hasher: blake3::Hasher = blake3::Hasher::new();
|
||||||
let mut buffer = [0u8; 16384];
|
let mut buffer = [0u8; 32 * 1024];
|
||||||
let mut read_bytes: u64 = 0;
|
let mut read_bytes: u64 = 0;
|
||||||
loop {
|
loop {
|
||||||
let n = match file_handler.read(&mut buffer) {
|
let n = match file_handler.read(&mut buffer) {
|
||||||
|
@ -448,11 +499,20 @@ impl DebugPrint for DuplicateFinder {
|
||||||
"Number of duplicated files by size(in groups) - {} ({})",
|
"Number of duplicated files by size(in groups) - {} ({})",
|
||||||
self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size
|
self.information.number_of_duplicated_files_by_size, self.information.number_of_groups_by_size
|
||||||
);
|
);
|
||||||
|
println!(
|
||||||
|
"Number of duplicated files after pre hash(in groups) - {} ({})",
|
||||||
|
self.information.number_of_duplicated_files_after_pre_hash, self.information.number_of_groups_after_pre_hash
|
||||||
|
);
|
||||||
println!(
|
println!(
|
||||||
"Number of duplicated files by hash(in groups) - {} ({})",
|
"Number of duplicated files by hash(in groups) - {} ({})",
|
||||||
self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash
|
self.information.number_of_duplicated_files_by_hash, self.information.number_of_groups_by_hash
|
||||||
);
|
);
|
||||||
println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size);
|
println!("Lost space by size - {} ({} bytes)", self.information.lost_space_by_size.file_size(options::BINARY).unwrap(), self.information.lost_space_by_size);
|
||||||
|
println!(
|
||||||
|
"Lost space after pre hash - {} ({} bytes)",
|
||||||
|
self.information.lost_space_after_pre_hash.file_size(options::BINARY).unwrap(),
|
||||||
|
self.information.lost_space_after_pre_hash
|
||||||
|
);
|
||||||
println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash);
|
println!("Lost space by hash - {} ({} bytes)", self.information.lost_space_by_hash.file_size(options::BINARY).unwrap(), self.information.lost_space_by_hash);
|
||||||
println!(
|
println!(
|
||||||
"Gained space by removing duplicated entries - {} ({} bytes)",
|
"Gained space by removing duplicated entries - {} ({} bytes)",
|
||||||
|
|
|
@ -1583,7 +1583,7 @@ fn main() {
|
||||||
for (size, vectors_vector) in btreemap.iter().rev() {
|
for (size, vectors_vector) in btreemap.iter().rev() {
|
||||||
for vector in vectors_vector {
|
for vector in vectors_vector {
|
||||||
let values: [&dyn ToValue; 6] = [
|
let values: [&dyn ToValue; 6] = [
|
||||||
&(vector.len().to_string() + " x " + size.to_string().as_str()),
|
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
|
||||||
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
|
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
|
||||||
&"".to_string(), // No text in 3 column
|
&"".to_string(), // No text in 3 column
|
||||||
&(0), // Not used here
|
&(0), // Not used here
|
||||||
|
@ -1611,7 +1611,7 @@ fn main() {
|
||||||
|
|
||||||
for (size, vector) in btreemap.iter().rev() {
|
for (size, vector) in btreemap.iter().rev() {
|
||||||
let values: [&dyn ToValue; 6] = [
|
let values: [&dyn ToValue; 6] = [
|
||||||
&(vector.len().to_string() + " x " + size.to_string().as_str()),
|
&(format!("{} x {} ({} bytes)", vector.len(), size.file_size(options::BINARY).unwrap(), size)),
|
||||||
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
|
&(format!("{} ({} bytes) lost", ((vector.len() - 1) as u64 * *size as u64).file_size(options::BINARY).unwrap(), (vector.len() - 1) as u64 * *size as u64)),
|
||||||
&"".to_string(), // No text in 3 column
|
&"".to_string(), // No text in 3 column
|
||||||
&(0), // Not used here
|
&(0), // Not used here
|
||||||
|
|
Loading…
Reference in a new issue