1
0
Fork 0
mirror of synced 2024-05-02 19:44:09 +12:00

Fix similar images algorithm (#799)

* Fixed missing images with similarity equal to 0

* Checking

* Unify algorithm between two functions

* Finally fix problem with missing images

* Lock and debug

* No comment
This commit is contained in:
Rafał Mikrut 2022-08-29 21:37:31 +02:00 committed by GitHub
parent 24b64a32c6
commit d29935d208
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 155 additions and 149 deletions

81
Cargo.lock generated
View file

@ -352,9 +352,9 @@ dependencies = [
[[package]]
name = "clap"
version = "3.2.17"
version = "3.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29e724a68d9319343bb3328c9cc2dfde263f4b3142ee1059a9980580171c954b"
checksum = "b15f2ea93df33549dbe2e8eecd1ca55269d63ae0b3ba1f55db030817d1c2867f"
dependencies = [
"atty",
"bitflags",
@ -369,9 +369,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "3.2.17"
version = "3.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13547f7012c01ab4a0e8f8967730ada8f9fdf419e8b6c792788f39cf4e46eefa"
checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65"
dependencies = [
"heck",
"proc-macro-error",
@ -409,9 +409,9 @@ checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
[[package]]
name = "cpufeatures"
version = "0.2.3"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1079fb8528d9f9c888b1e8aa651e6e079ade467323d58f75faf1d30b1808f540"
checksum = "dc948ebb96241bb40ab73effeb80d9f93afaad49359d159a5e61be51619fe813"
dependencies = [
"libc",
]
@ -660,16 +660,15 @@ dependencies = [
[[package]]
name = "exr"
version = "1.4.2"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14cc0e06fb5f67e5d6beadf3a382fec9baca1aa751c6d5368fdeee7e5932c215"
checksum = "78c26a90d9dd411a3d119d6f55752fb4c134ca243250c32fb9cab7b2561638d2"
dependencies = [
"bit_field",
"deflate 1.0.0",
"flume",
"half",
"inflate",
"lebe",
"miniz_oxide",
"smallvec",
"threadpool",
]
@ -827,9 +826,9 @@ checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394"
[[package]]
name = "futures"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab30e97ab6aacfe635fad58f22c2bb06c8b685f7421eb1e064a729e2a5f481fa"
checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c"
dependencies = [
"futures-channel",
"futures-core",
@ -842,9 +841,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bfc52cbddcfd745bf1740338492bb0bd83d76c67b445f91c5fb29fae29ecaa1"
checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050"
dependencies = [
"futures-core",
"futures-sink",
@ -852,15 +851,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2acedae88d38235936c3922476b10fced7b2b68136f5e3c03c2d5be348a1115"
checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf"
[[package]]
name = "futures-executor"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d11aa21b5b587a64682c0094c2bdd4df0076c5324961a40cc3abd7f37930528"
checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab"
dependencies = [
"futures-core",
"futures-task",
@ -869,15 +868,15 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93a66fc6d035a26a3ae255a6d2bca35eda63ae4c5512bef54449113f7a1228e5"
checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68"
[[package]]
name = "futures-macro"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0db9cce532b0eae2ccf2766ab246f114b56b9cf6d445e00c2549fbc100ca045d"
checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17"
dependencies = [
"proc-macro2",
"quote",
@ -886,21 +885,21 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca0bae1fe9752cf7fd9b0064c674ae63f97b37bc714d745cbde0afb7ec4e6765"
checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56"
[[package]]
name = "futures-task"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "842fc63b931f4056a24d59de13fb1272134ce261816e063e634ad0c15cdc5306"
checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1"
[[package]]
name = "futures-util"
version = "0.3.23"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0828a5471e340229c11c77ca80017937ce3c58cb788a17e5f1c2d5c485a9577"
checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90"
dependencies = [
"futures-channel",
"futures-core",
@ -1557,9 +1556,9 @@ dependencies = [
[[package]]
name = "lock_api"
version = "0.4.7"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53"
checksum = "9f80bf5aacaf25cbfc8210d1cfb718f2bf3b11c4c54e5afe36c236853a8ec390"
dependencies = [
"autocfg",
"scopeguard",
@ -1940,9 +1939,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
[[package]]
name = "pest"
version = "2.2.1"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69486e2b8c2d2aeb9762db7b4e00b0331156393555cff467f4163ff06821eef8"
checksum = "4b0560d531d1febc25a3c9398a62a71256c0178f2e3443baedd9ad4bb8c9deb4"
dependencies = [
"thiserror",
"ucd-trie",
@ -2352,18 +2351,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.143"
version = "1.0.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53e8e5d5b70924f74ff5c6d64d9a5acd91422117c60f48c4e07855238a254553"
checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.143"
version = "1.0.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3d8e8de557aee63c26b85b947f5e59b690d0454c753f3adeb5cd7835ab88391"
checksum = "94ed3a816fb1d101812f83e789f888322c34e291f894f19590dc310963e87a00"
dependencies = [
"proc-macro2",
"quote",
@ -2372,9 +2371,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.83"
version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7"
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
dependencies = [
"itoa",
"ryu",
@ -2779,9 +2778,9 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db76ff9fa4b1458b3c7f077f3ff9887394058460d21e634355b273aaf11eea45"
checksum = "3c3f9a28b618c3a6b9251b6908e9c99e04b9e5c02e6581ccbb67d59c34ef7f9b"
dependencies = [
"itoa",
"libc",
@ -3205,5 +3204,5 @@ dependencies = [
"hmac",
"pbkdf2",
"sha1",
"time 0.3.13",
"time 0.3.14",
]

View file

@ -729,19 +729,28 @@ impl SimilarImages {
};
//// PROGRESS THREAD END
for hash in &all_hashes {
self.bktree.add(hash.to_vec());
// Don't put hashes with multiple images into the bktree, because they will always be the master of a group and cannot be found by other hashes
let mut additional_chunk_to_check: Vec<_> = Default::default();
let mut hashes_with_multiple_images: HashSet<_> = Default::default(); // Fast way to check if hash have multiple imaages
for (hash, vec_files) in &all_hashed_images {
if vec_files.len() >= 2 {
additional_chunk_to_check.push(hash);
hashes_with_multiple_images.insert(hash);
} else {
self.bktree.add(hash.to_vec());
}
}
let number_of_processors = num_cpus::get();
let chunk_size = all_hashes.len() / number_of_processors;
let chunks: Vec<_> = if chunk_size > 0 { all_hashes.chunks(chunk_size).collect() } else { vec![&all_hashes] };
let mut chunks: Vec<_> = if chunk_size > 0 { all_hashes.chunks(chunk_size).collect() } else { vec![&all_hashes] };
chunks.push(&additional_chunk_to_check);
let parts: Vec<_> = chunks
.into_par_iter()
.map(|hashes_to_check| {
let mut hashes_parents: HashMap<&Vec<u8>, u32> = Default::default(); // Hash used as parent, childrens
let mut hashes_similarity: HashMap<&Vec<u8>, (&Vec<u8>, u32)> = Default::default(); // Hash used as child, (parent_hash,similarity)
let mut hashes_parents: HashMap<&Vec<u8>, u32> = Default::default(); // Hashes used as parent (hash, children_number_of_hash)
let mut hashes_similarity: HashMap<&Vec<u8>, (&Vec<u8>, u32)> = Default::default(); // Hashes used as child, (parent_hash, similarity)
// Check whether the hash is not already used as a master somewhere else
// If it is, move on to checking the next element
@ -752,15 +761,16 @@ impl SimilarImages {
for (index, hash_to_check) in hashes_to_check.iter().enumerate() {
// Don't check for user stop too often
// Also don't add too ofter data to variables
const CYCLES_COUNTER: usize = 50;
if index % CYCLES_COUNTER == 0 && index != 0 {
// Also don't add too often data to atomic variable
const CYCLES_COUNTER: usize = 0b111111;
if ((index & CYCLES_COUNTER) == CYCLES_COUNTER) && index != 0 {
atomic_mode_counter.fetch_add(CYCLES_COUNTER, Ordering::Relaxed);
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_stopped.store(true, Ordering::Relaxed);
return None;
}
}
hashes_parents.insert(hash_to_check, 0);
let mut found_items = self
.bktree
@ -770,55 +780,15 @@ impl SimilarImages {
found_items.sort_unstable_by_key(|f| f.0);
for (similarity, other_hash) in found_items {
// SSSTART
// Cannot use hash if already is used as master record(have more than 0 children)
if let Some(children_number) = hashes_parents.get(other_hash) {
if *children_number > 0 {
continue;
}
}
// If there is already record, with smaller sensitivity, then replace it
let mut need_to_add = false;
let mut need_to_check = false;
// TODO replace variables from above with closures
// If current checked hash, have parent, first we must check if similarity between them is lower than checked item
if let Some((current_parent_hash, current_similarity_with_parent)) = hashes_similarity.get(hash_to_check) {
if *current_similarity_with_parent > similarity {
need_to_check = true;
*hashes_parents.get_mut(current_parent_hash).unwrap() -= 1;
hashes_similarity.remove(hash_to_check).unwrap();
}
} else {
need_to_check = true;
}
if need_to_check {
if let Some((other_parent_hash, other_similarity)) = hashes_similarity.get(other_hash) {
if *other_similarity > similarity {
need_to_add = true;
*hashes_parents.get_mut(other_parent_hash).unwrap() -= 1;
}
}
// But when there is no record, just add it
else {
need_to_add = true
}
}
if need_to_add {
hashes_similarity.insert(other_hash, (hash_to_check, similarity));
if let Some(number_of_children) = hashes_parents.get_mut(hash_to_check) {
*number_of_children += 1;
} else {
hashes_parents.insert(hash_to_check, 1);
}
}
// ENND
for (similarity, compared_hash) in found_items {
image_to_check(
&mut hashes_parents,
&mut hashes_similarity,
&hashes_with_multiple_images,
hash_to_check,
compared_hash,
similarity,
);
}
}
@ -849,66 +819,39 @@ impl SimilarImages {
hashes_similarity = first_hashes_similarity;
}
for (_partial_hashes_with_parents, partial_hashes_with_similarity) in iter {
for (hash_to_check, (other_hash, similarity)) in partial_hashes_with_similarity {
// SSSTART
// Cannot use hash if already is used as master record(have more than 0 children)
if let Some(children_number) = hashes_parents.get(other_hash) {
if *children_number > 0 {
continue;
}
for (partial_hashes_with_parents, partial_hashes_with_similarity) in iter {
for (parent_hash, _child_number) in partial_hashes_with_parents {
if !hashes_parents.contains_key(parent_hash) && !hashes_similarity.contains_key(parent_hash) {
hashes_parents.insert(parent_hash, 0);
}
}
// If there is already record, with smaller sensitivity, then replace it
let mut need_to_add = false;
let mut need_to_check = false;
// TODO replace variables from above with closures
// If current checked hash, have parent, first we must check if similarity between them is lower than checked item
if let Some((current_parent_hash, current_similarity_with_parent)) = hashes_similarity.get(hash_to_check) {
if *current_similarity_with_parent > similarity {
need_to_check = true;
*hashes_parents.get_mut(current_parent_hash).unwrap() -= 1;
hashes_similarity.remove(hash_to_check).unwrap();
}
} else {
need_to_check = true;
}
if need_to_check {
if let Some((other_parent_hash, other_similarity)) = hashes_similarity.get(other_hash) {
if *other_similarity > similarity {
need_to_add = true;
*hashes_parents.get_mut(other_parent_hash).unwrap() -= 1;
}
}
// But when there is no record, just add it
else {
need_to_add = true
}
}
if need_to_add {
hashes_similarity.insert(other_hash, (hash_to_check, similarity));
if let Some(number_of_children) = hashes_parents.get_mut(hash_to_check) {
*number_of_children += 1;
} else {
hashes_parents.insert(hash_to_check, 1);
}
}
// ENND
for (hash_to_check, (compared_hash, similarity)) in partial_hashes_with_similarity {
image_to_check(
&mut hashes_parents,
&mut hashes_similarity,
&hashes_with_multiple_images,
hash_to_check,
compared_hash,
similarity,
);
}
}
#[cfg(debug_assertions)]
debug_check_for_duplicated_things(hashes_parents.clone(), hashes_similarity.clone(), all_hashed_images.clone(), "LATTER");
// Collecting results
// Just simple check if all original hashes with multiple entries are available in end results
let original_hashes_at_start = hashes_with_multiple_images.len();
let original_hashes_in_end_results = hashes_parents
.iter()
.filter(|(parent_hash, _child_number)| hashes_with_multiple_images.contains(*parent_hash))
.count();
assert_eq!(original_hashes_at_start, original_hashes_in_end_results);
// Collecting results to vector
for (parent_hash, child_number) in hashes_parents {
if child_number > 0 {
// If the hash has other hashes as children OR multiple images are available for the checked hash
if child_number > 0 || hashes_with_multiple_images.contains(parent_hash) {
let vec_fe = all_hashed_images.get(parent_hash).unwrap().clone();
collected_similar_images.insert(parent_hash.clone(), vec_fe);
}
@ -1040,6 +983,61 @@ impl SimilarImages {
}
}
/// Tries to register `compared_hash` as a child of `hash_to_check`.
///
/// `hashes_parents` maps a hash used as a parent to the number of children it
/// currently has; `hashes_similarity` maps a hash used as a child to its
/// `(parent_hash, similarity)` pair. A smaller `similarity` value wins: an
/// existing link is only replaced when its stored value is strictly greater
/// than the one offered here.
///
/// Nothing is changed when `compared_hash` already has an entry in
/// `hashes_parents` and either has at least one child or is listed in
/// `hashes_with_multiple_images`.
///
/// # Panics
///
/// Panics if a parent referenced from `hashes_similarity` has no entry in
/// `hashes_parents`.
fn image_to_check<'a>(
    hashes_parents: &mut HashMap<&'a Vec<u8>, u32>,
    hashes_similarity: &mut HashMap<&'a Vec<u8>, (&'a Vec<u8>, u32)>,
    hashes_with_multiple_images: &HashSet<&'a Vec<u8>>,
    hash_to_check: &'a Vec<u8>,
    compared_hash: &'a Vec<u8>,
    similarity: u32,
) {
    // A hash that is already known as a parent and either owns children or
    // carries several images must stay a parent, so it is never turned into a child.
    let compared_is_fixed_parent = match hashes_parents.get(compared_hash) {
        Some(&children) => children > 0 || hashes_with_multiple_images.contains(compared_hash),
        None => false,
    };
    if compared_is_fixed_parent {
        return;
    }

    // When the checked hash is itself somebody's child, it may only go on if the
    // offered link is strictly closer than the one to its current parent; in that
    // case it is detached from that parent first.
    if let Some((previous_parent, previous_similarity)) = hashes_similarity.get(hash_to_check).copied() {
        if previous_similarity <= similarity {
            return;
        }
        *hashes_parents.get_mut(previous_parent).unwrap() -= 1;
        hashes_similarity.remove(hash_to_check).unwrap();
    }

    // The compared hash keeps its current parent unless the offered link is
    // strictly closer. Note that the detachment above is not rolled back when
    // we return here.
    if let Some((rival_parent, rival_similarity)) = hashes_similarity.get(compared_hash).copied() {
        if rival_similarity <= similarity {
            return;
        }
        *hashes_parents.get_mut(rival_parent).unwrap() -= 1;
    }

    // Record the new link (overwriting any previous one) and count the child.
    hashes_similarity.insert(compared_hash, (hash_to_check, similarity));
    *hashes_parents.entry(hash_to_check).or_insert(0) += 1;
}
impl Default for SimilarImages {
fn default() -> Self {
Self::new()
@ -1368,12 +1366,14 @@ fn debug_check_for_duplicated_things(
all_hashed_images: HashMap<Vec<u8>, Vec<FileEntry>>,
numm: &str,
) {
let mut found_broken_thing = false;
let mut hashmap_hashes: HashSet<_> = Default::default();
let mut hashmap_names: HashSet<_> = Default::default();
for (hash, number_of_children) in &hashes_parents {
if *number_of_children > 0 {
if hashmap_hashes.contains(*hash) {
println!("------1--HASH--{} {:?}", numm, all_hashed_images.get(*hash).unwrap());
found_broken_thing = true;
}
hashmap_hashes.insert(hash.to_vec());
@ -1381,6 +1381,7 @@ fn debug_check_for_duplicated_things(
let name = i.path.to_string_lossy().to_string();
if hashmap_names.contains(&name) {
println!("------1--NAME--{} {:?}", numm, name);
found_broken_thing = true;
}
hashmap_names.insert(name);
}
@ -1389,6 +1390,7 @@ fn debug_check_for_duplicated_things(
for hash in hashes_similarity.keys() {
if hashmap_hashes.contains(*hash) {
println!("------2--HASH--{} {:?}", numm, all_hashed_images.get(*hash).unwrap());
found_broken_thing = true;
}
hashmap_hashes.insert(hash.to_vec());
@ -1396,8 +1398,13 @@ fn debug_check_for_duplicated_things(
let name = i.path.to_string_lossy().to_string();
if hashmap_names.contains(&name) {
println!("------2--NAME--{} {:?}", numm, name);
found_broken_thing = true;
}
hashmap_names.insert(name);
}
}
if found_broken_thing {
panic!();
}
}