1
0
Fork 0
mirror of synced 2024-04-25 00:02:07 +12:00

Save to cache info about broken, too short etc. images and videos (#471)

This commit is contained in:
Rafał Mikrut 2021-11-29 13:53:09 +01:00 committed by GitHub
parent f14101139d
commit f2333b9df2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 57 deletions

View file

@ -451,7 +451,17 @@ impl SimilarImages {
let image = match image::open(file_entry.path.clone()) {
Ok(t) => t,
Err(_inspected) => return Some(None), // Something is wrong with image
// Err(_inspected) => return Some(None), // Something is wrong with image,
// For broken images an empty hash is used, because without it the files would be rescanned each time this is called (the missing cache file is responsible for it)
// This may (very rarely) cause problems, e.g. when a file was not available due to a lack of permissions but is available now
Err(_inspected) => {
let mut buf = Vec::new();
for _i in 0..(self.hash_size * self.hash_size / 8) {
buf.push(0);
}
file_entry.hash = buf.clone();
return Some(Some((file_entry, buf)));
}
};
let dimensions = image.dimensions();
@ -463,16 +473,6 @@ impl SimilarImages {
let hash = hasher.hash_image(&image);
let buf: Vec<u8> = hash.as_bytes().to_vec();
// Images whose hashes are full of 0 or 255 usually mean that the algorithm failed to decode them, e.g. because they contain a lot of alpha channel
{
if buf.iter().all(|e| *e == 0) {
return Some(None);
}
if buf.iter().all(|e| *e == 255) {
return Some(None);
}
}
file_entry.hash = buf.clone();
Some(Some((file_entry, buf)))
@ -495,9 +495,12 @@ impl SimilarImages {
}
for (file_entry, buf) in &vec_file_entry {
self.bktree.add(buf.clone());
self.image_hashes.entry(buf.clone()).or_insert_with(Vec::<FileEntry>::new);
self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone());
// Only use non-broken hashes for comparing (hashes of all 0 or all 255 mean that the algorithm failed to decode the image, e.g. because it contains a lot of alpha channel)
if !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255)) {
self.bktree.add(buf.clone());
self.image_hashes.entry(buf.clone()).or_insert_with(Vec::<FileEntry>::new);
self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone());
}
}
if self.use_cache {
@ -753,9 +756,9 @@ fn save_hashes_to_file(hashmap: &BTreeMap<String, FileEntry>, text_messages: &mu
string += format!("{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.dimensions, file_entry.modified_date).as_str();
for i in 0..file_entry.hash.len() - 1 {
for hash in &file_entry.hash {
string.push_str("//");
string.push_str(file_entry.hash[i].to_string().as_str());
string.push_str(hash.to_string().as_str());
}
if let Err(e) = writeln!(writer, "{}", string) {
@ -800,8 +803,8 @@ fn load_hashes_from_file(text_messages: &mut Messages, hash_size: u8, hash_alg:
index + 1,
line,
cache_file.display(),
uuu.len(),
number_of_results + 4
number_of_results + 4,
uuu.len()
));
continue;
}
@ -820,22 +823,6 @@ fn load_hashes_from_file(text_messages: &mut Messages, hash_size: u8, hash_alg:
});
}
#[cfg(debug_assertions)]
{
let mut have_at_least: u8 = 0;
for i in hash.iter() {
if *i == 0 {
have_at_least += 1;
}
}
if have_at_least == hash.len() as u8 {
println!("ERROR START - {}", line);
println!("have_at_least == hash.len() as u8");
println!("ERROR END hash.len() - {} == have_at_least - {}", hash.len(), have_at_least);
continue; // Just skip this entry, it is very very unlikelly that something have this hash, but if it has, then just ignore it
}
}
hashmap_loaded_entries.insert(
uuu[0].to_string(),
FileEntry {

View file

@ -27,6 +27,8 @@ use crate::common_traits::{DebugPrint, PrintResults, SaveResults};
pub const MAX_TOLERANCE: i32 = 20;
const HASH_SIZE: usize = 19;
#[derive(Debug)]
pub struct ProgressData {
pub current_stage: u8,
@ -41,6 +43,7 @@ pub struct FileEntry {
pub size: u64,
pub modified_date: u64,
pub vhash: VideoHash,
pub error: String,
}
/// Distance metric to use with the BK-tree.
@ -321,6 +324,7 @@ impl SimilarVideos {
} // Permissions Denied
},
vhash: Default::default(),
error: "".to_string(),
};
self.videos_to_check.insert(current_file_name.to_string_lossy().to_string(), fe);
@ -400,7 +404,7 @@ impl SimilarVideos {
progress_thread_handle = thread::spawn(|| {});
}
//// PROGRESS THREAD END
let old_vec_file_entry: Vec<std::result::Result<FileEntry, String>> = non_cached_files_to_check
let mut vec_file_entry: Vec<FileEntry> = non_cached_files_to_check
.par_iter()
.map(|file_entry| {
atomic_file_counter.fetch_add(1, Ordering::Relaxed);
@ -412,30 +416,25 @@ impl SimilarVideos {
let vhash = match VideoHash::from_path(&file_entry.path) {
Ok(t) => t,
Err(e) => return Some(Err(format!("Failed to hash file, {}", e))),
Err(e) => {
return {
file_entry.error = format!("Failed to hash file, {}", e);
Some(file_entry)
}
}
};
file_entry.vhash = vhash;
Some(Ok(file_entry))
Some(file_entry)
})
.while_some()
.collect::<Vec<std::result::Result<FileEntry, String>>>();
.collect::<Vec<FileEntry>>();
// End thread which send info to gui
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
let mut vec_file_entry = Vec::new();
for result in old_vec_file_entry {
match result {
Ok(t) => vec_file_entry.push(t),
Err(e) => {
self.text_messages.errors.push(e);
}
}
}
Common::print_time(hash_map_modification, SystemTime::now(), "sort_videos - reading data from files in parallel".to_string());
let hash_map_modification = SystemTime::now();
@ -446,9 +445,14 @@ impl SimilarVideos {
let mut hashmap_with_file_entries: HashMap<String, FileEntry> = Default::default();
let mut vector_of_hashes: Vec<VideoHash> = Vec::new();
for i in &vec_file_entry {
hashmap_with_file_entries.insert(i.vhash.src_path().to_string_lossy().to_string(), i.clone());
vector_of_hashes.push(i.vhash.clone());
for file_entry in &vec_file_entry {
// An empty `error` means that the video was hashed correctly; entries with a non-empty `error` are only added to the error messages
if file_entry.error.is_empty() {
hashmap_with_file_entries.insert(file_entry.vhash.src_path().to_string_lossy().to_string(), file_entry.clone());
vector_of_hashes.push(file_entry.vhash.clone());
} else {
self.text_messages.errors.push(file_entry.error.clone());
}
}
if self.use_cache {
@ -614,7 +618,16 @@ fn save_hashes_to_file(hashmap: &BTreeMap<String, FileEntry>, text_messages: &mu
for file_entry in hashmap.values() {
let mut string: String = String::with_capacity(256);
string += format!("{}//{}//{}//{}//{}", file_entry.path.display(), file_entry.size, file_entry.modified_date, file_entry.vhash.num_frames(), file_entry.vhash.duration()).as_str();
string += format!(
"{}//{}//{}//{}//{}//{}",
file_entry.path.display(),
file_entry.size,
file_entry.modified_date,
file_entry.vhash.num_frames(),
file_entry.vhash.duration(),
file_entry.error
)
.as_str();
for i in file_entry.vhash.hash() {
string.push_str("//");
@ -655,15 +668,14 @@ fn load_hashes_from_file(text_messages: &mut Messages) -> Option<BTreeMap<String
}
};
let uuu = line.split("//").collect::<Vec<&str>>();
let hash_size = 19;
// Hash size + other things
if uuu.len() != (hash_size + 5) {
if uuu.len() != (HASH_SIZE + 6) {
text_messages.warnings.push(format!(
"Found invalid data in line {} - ({}) in cache file {}, expected {} values, found {}",
index + 1,
line,
cache_file.display(),
hash_size + 5,
HASH_SIZE + 6,
uuu.len(),
));
continue;
@ -671,8 +683,8 @@ fn load_hashes_from_file(text_messages: &mut Messages) -> Option<BTreeMap<String
// Don't load cache data if destination file not exists
if Path::new(uuu[0]).exists() {
let mut hash: [u64; 19] = [0; 19];
for i in 0..hash_size {
hash[i] = match uuu[5 + i as usize].parse::<u64>() {
for i in 0..HASH_SIZE {
hash[i] = match uuu[6 + i as usize].parse::<u64>() {
Ok(t) => t,
Err(e) => {
text_messages
@ -706,6 +718,7 @@ fn load_hashes_from_file(text_messages: &mut Messages) -> Option<BTreeMap<String
}
},
vhash: VideoHash::with_start_data(uuu[4].parse::<u32>().unwrap_or(0), uuu[0], hash, uuu[3].parse::<u32>().unwrap_or(10)),
error: uuu[5].to_string(),
},
);
}