1
0
Fork 0
mirror of synced 2024-05-21 04:43:50 +12:00
czkawka/czkawka_core/src/bad_extensions.rs

578 lines
21 KiB
Rust
Raw Normal View History

2022-05-31 08:02:39 +12:00
use std::collections::{BTreeSet, HashMap};
use std::fs::File;
use std::io::prelude::*;
use std::io::BufWriter;
use std::mem;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::SystemTime;
use crossbeam_channel::Receiver;
use futures::channel::mpsc::UnboundedSender;
use mime_guess::get_mime_extensions;
use rayon::prelude::*;
use crate::common::{prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads, Common};
use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData};
use crate::common_directory::Directories;
use crate::common_extensions::Extensions;
use crate::common_items::ExcludedItems;
use crate::common_messages::Messages;
use crate::common_traits::*;
static DISABLED_EXTENSIONS: &[&str] = &["file", "cache", "bak", "data"]; // Such files can have any type inside
// This adds several workarounds for bugs/invalid recognizing types by external libraries
// ("real_content_extension", "current_file_extension")
const WORKAROUNDS: &[(&str, &str)] = &[
// Wine/Windows
2022-05-31 08:02:39 +12:00
("der", "cat"),
("exe", "acm"),
("exe", "ax"),
("exe", "bck"),
("exe", "com"),
("exe", "cpl"),
("exe", "dll16"),
("exe", "dll"),
("exe", "drv16"),
("exe", "drv"),
("exe", "ds"),
2022-05-31 08:02:39 +12:00
("exe", "efi"),
("exe", "exe16"),
("exe", "fon"), // Type of font or something else
2022-05-31 08:02:39 +12:00
("exe", "mod16"),
("exe", "msstyles"),
2022-05-31 08:02:39 +12:00
("exe", "mui"),
("exe", "mun"),
("exe", "orig"),
("exe", "ps1xml"),
("exe", "rll"),
("exe", "rs"),
("exe", "scr"),
2022-05-31 08:02:39 +12:00
("exe", "signed"),
("exe", "sys"),
("exe", "tlb"),
("exe", "tsp"),
("exe", "vdm"),
("exe", "vxd"),
2022-05-31 08:02:39 +12:00
("exe", "winmd"),
("gz", "loggz"),
2022-05-31 08:02:39 +12:00
("xml", "adml"),
("xml", "admx"),
("xml", "camp"),
("xml", "cdmp"),
("xml", "cdxml"),
("xml", "dgml"),
("xml", "diagpkg"),
("xml", "gmmp"),
("xml", "library-ms"),
("xml", "man"),
2022-05-31 08:02:39 +12:00
("xml", "manifest"),
("xml", "msc"),
2022-05-31 08:02:39 +12:00
("xml", "mum"),
("xml", "resx"),
("zip", "wmz"),
// Other
("exe", "pyd"), // Python/Mingw
2022-05-31 08:02:39 +12:00
("gz", "blend"), // Blender
("gz", "crate"), // Cargo
("gz", "svgz"), // Archive svg
("gz", "tgz"), // Archive
("html", "dtd"), // Mingw
("html", "ent"), // Mingw
2022-05-31 08:02:39 +12:00
("html", "md"), // Markdown
("jpg", "jfif"), // Photo format
("m4v", "mp4"), // m4v and mp4 are interchangeable
2022-05-31 08:02:39 +12:00
("mobi", "azw3"), // Ebook format
("mpg", "vob"), // Weddings in parts have usually vob extension
("obj", "bin"), // Multiple apps, Czkawka, Nvidia, Windows
("obj", "o"), // Compilators
("odp", "otp"), // LibreOffice
("ods", "ots"), // Libreoffice
("odt", "ott"), // Libreoffice
("ogg", "ogv"), // Audio format
("pptx", "ppsx"), // Powerpoint
("sh", "bash"), // Linux
("sh", "guess"), // GNU
2022-05-31 08:02:39 +12:00
("sh", "pl"), // Gnome/Linux
("sh", "pm"), // Gnome/Linux
("sh", "py"), // Python
("sh", "pyx"), // Python
("sh", "rs"), // Rust
("sh", "sample"), // Git
("xml", "bsp"), // Quartus
("xml", "cbp"), // CodeBlocks config
("xml", "cfg"), // Multiple apps - Godot
("xml", "cmb"), // Cambalache
("xml", "conf"), // Multiple apps - Python
("xml", "config"), // Multiple apps - QT Creator
("xml", "dae"), // 3D models
("xml", "docbook"), //
("xml", "fb2"), //
("xml", "gir"), // GTK
("xml", "glade"), // Glade
("xml", "iml"), // Intelij Idea
("xml", "kdenlive"), // KDenLive
("xml", "lang"), // ?
("xml", "nuspec"), // Nuget
2022-05-31 08:02:39 +12:00
("xml", "policy"), // SystemD
("xml", "qsys"), // Quartus
("xml", "sopcinfo"), // Quartus
("xml", "svg"), // SVG
("xml", "ui"), // Cambalache, Glade
("xml", "user"), // Qtcreator
("xml", "vbox"), // VirtualBox
("xml", "vbox-prev"), // VirtualBox
("xml", "vcproj"), // VisualStudio
("xml", "xba"), // Libreoffice
("xml", "xcd"), // Libreoffice files
("zip", "apk"), // Android apk
2022-06-01 03:52:55 +12:00
("zip", "cbr"), // Comics
2022-05-31 08:02:39 +12:00
("zip", "dat"), // Multiple - python, brave
("zip", "doc"), // Word
("zip", "docx"), // Word
("zip", "jar"), // Java
("zip", "kra"), // Krita
("zip", "nupkg"), // Nuget packages
("zip", "odg"), // Libreoffice
("zip", "pptx"), // Powerpoint
("zip", "whl"), // Python packages
("zip", "xlsx"), // Excel
2022-05-31 08:02:39 +12:00
("zip", "xpi"), // Firefox extensions
("zip", "zcos"), // Scilab
// Probably invalid
("html", "svg"),
("xml", "html"),
// Probably bug in external library
("msi", "ppt"), // Not sure why ppt is not recognized
("msi", "doc"), // Not sure why doc is not recognized
("exe", "xls"), // Not sure why xls is not recognized
];
#[derive(Clone)]
pub struct BadFileEntry {
pub path: PathBuf,
pub modified_date: u64,
pub size: u64,
pub current_extension: String,
pub proper_extensions: String,
}
2022-06-01 03:52:55 +12:00
/// Info struck with helpful information's about results
#[derive(Default)]
pub struct Info {
pub number_of_files_with_bad_extension: usize,
}
impl Info {
#[must_use]
pub fn new() -> Self {
Default::default()
}
}
pub struct BadExtensions {
text_messages: Messages,
information: Info,
files_to_check: Vec<FileEntry>,
bad_extensions_files: Vec<BadFileEntry>,
directories: Directories,
allowed_extensions: Extensions,
excluded_items: ExcludedItems,
minimal_file_size: u64,
maximal_file_size: u64,
recursive_search: bool,
stopped_search: bool,
save_also_as_json: bool,
include_files_without_extension: bool,
}
impl BadExtensions {
#[must_use]
pub fn new() -> Self {
Self {
text_messages: Messages::new(),
information: Info::new(),
recursive_search: true,
allowed_extensions: Extensions::new(),
directories: Directories::new(),
excluded_items: ExcludedItems::new(),
files_to_check: Default::default(),
stopped_search: false,
minimal_file_size: 8192,
maximal_file_size: u64::MAX,
bad_extensions_files: Default::default(),
save_also_as_json: false,
include_files_without_extension: true,
}
}
pub fn find_bad_extensions_files(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&UnboundedSender<ProgressData>>) {
self.directories.optimize_directories(self.recursive_search, &mut self.text_messages);
if !self.check_files(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
if !self.look_for_bad_extensions_files(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
self.debug_print();
}
#[must_use]
pub fn get_stopped_search(&self) -> bool {
self.stopped_search
}
#[must_use]
pub const fn get_bad_extensions_files(&self) -> &Vec<BadFileEntry> {
&self.bad_extensions_files
}
pub fn set_maximal_file_size(&mut self, maximal_file_size: u64) {
self.maximal_file_size = match maximal_file_size {
0 => 1,
t => t,
};
}
pub fn set_minimal_file_size(&mut self, minimal_file_size: u64) {
self.minimal_file_size = match minimal_file_size {
0 => 1,
t => t,
};
}
#[cfg(target_family = "unix")]
pub fn set_exclude_other_filesystems(&mut self, exclude_other_filesystems: bool) {
self.directories.set_exclude_other_filesystems(exclude_other_filesystems);
}
#[cfg(not(target_family = "unix"))]
pub fn set_exclude_other_filesystems(&mut self, _exclude_other_filesystems: bool) {}
#[must_use]
pub const fn get_text_messages(&self) -> &Messages {
&self.text_messages
}
#[must_use]
pub const fn get_information(&self) -> &Info {
&self.information
}
pub fn set_save_also_as_json(&mut self, save_also_as_json: bool) {
self.save_also_as_json = save_also_as_json;
}
pub fn set_recursive_search(&mut self, recursive_search: bool) {
self.recursive_search = recursive_search;
}
pub fn set_included_directory(&mut self, included_directory: Vec<PathBuf>) -> bool {
self.directories.set_included_directory(included_directory, &mut self.text_messages)
}
pub fn set_excluded_directory(&mut self, excluded_directory: Vec<PathBuf>) {
self.directories.set_excluded_directory(excluded_directory, &mut self.text_messages);
}
pub fn set_allowed_extensions(&mut self, allowed_extensions: String) {
self.allowed_extensions.set_allowed_extensions(allowed_extensions, &mut self.text_messages);
}
pub fn set_excluded_items(&mut self, excluded_items: Vec<String>) {
self.excluded_items.set_excluded_items(excluded_items, &mut self.text_messages);
}
fn check_files(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&UnboundedSender<ProgressData>>) -> bool {
let result = DirTraversalBuilder::new()
.root_dirs(self.directories.included_directories.clone())
.group_by(|_fe| ())
.stop_receiver(stop_receiver)
.progress_sender(progress_sender)
.minimal_file_size(self.minimal_file_size)
.maximal_file_size(self.maximal_file_size)
.directories(self.directories.clone())
.allowed_extensions(self.allowed_extensions.clone())
.excluded_items(self.excluded_items.clone())
.recursive_search(self.recursive_search)
.build()
.run();
match result {
DirTraversalResult::SuccessFiles {
start_time,
grouped_file_entries,
warnings,
} => {
if let Some(files_to_check) = grouped_file_entries.get(&()) {
self.files_to_check = files_to_check.clone();
}
self.text_messages.warnings.extend(warnings);
Common::print_time(start_time, SystemTime::now(), "check_files");
true
}
DirTraversalResult::SuccessFolders { .. } => {
unreachable!()
}
DirTraversalResult::Stopped => false,
}
}
fn look_for_bad_extensions_files(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&UnboundedSender<ProgressData>>) -> bool {
let system_time = SystemTime::now();
let check_was_stopped = AtomicBool::new(false); // Used for breaking from GUI and ending check thread
let progress_thread_run = Arc::new(AtomicBool::new(true));
let atomic_counter = Arc::new(AtomicUsize::new(0));
let progress_thread_handle = prepare_thread_handler_common(
progress_sender,
&progress_thread_run,
&atomic_counter,
1,
1,
self.files_to_check.len(),
CheckingMethod::None,
);
let mut files_to_check = Default::default();
mem::swap(&mut files_to_check, &mut self.files_to_check);
2022-05-31 08:02:39 +12:00
let mut hashmap_workarounds: HashMap<&str, Vec<&str>> = Default::default();
2022-05-31 08:02:39 +12:00
for (proper, found) in WORKAROUNDS {
// This should be enabled when items will have only 1 possible workaround items, but looks that some have 2 or even more, so at least for now this is disabled
// if hashmap_workarounds.contains_key(found) {
// panic!("Already have {} key", found);
// }
hashmap_workarounds.entry(found).or_insert_with(Vec::new).push(proper);
2022-05-31 08:02:39 +12:00
}
self.bad_extensions_files = self.verify_extensions(files_to_check, &atomic_counter, stop_receiver, &check_was_stopped, &hashmap_workarounds);
send_info_and_wait_for_ending_all_threads(&progress_thread_run, progress_thread_handle);
// Break if stop was clicked
if check_was_stopped.load(Ordering::Relaxed) {
return false;
}
self.information.number_of_files_with_bad_extension = self.bad_extensions_files.len();
Common::print_time(system_time, SystemTime::now(), "bad extension finding");
// Clean unused data
self.files_to_check = Default::default();
true
}
fn verify_extensions(
&self,
files_to_check: Vec<FileEntry>,
atomic_counter: &Arc<AtomicUsize>,
stop_receiver: Option<&Receiver<()>>,
check_was_stopped: &AtomicBool,
hashmap_workarounds: &HashMap<&str, Vec<&str>>,
) -> Vec<BadFileEntry> {
files_to_check
2022-05-31 08:02:39 +12:00
.into_par_iter()
.map(|file_entry| {
atomic_counter.fetch_add(1, Ordering::Relaxed);
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
check_was_stopped.store(true, Ordering::Relaxed);
return None;
}
// Check what exactly content file contains
let kind = match infer::get_from_path(&file_entry.path) {
Ok(k) => match k {
Some(t) => t,
None => return Some(None),
},
Err(_) => return Some(None),
};
let proper_extension = kind.extension();
let Some(current_extension) = self.get_and_validate_extension(&file_entry, proper_extension) else {
return Some(None);
};
// Check for all extensions that file can use(not sure if it is worth to do it)
let (mut all_available_extensions, valid_extensions) = self.check_for_all_extensions_that_file_can_use(hashmap_workarounds, &current_extension, proper_extension);
if all_available_extensions.is_empty() {
// Not found any extension
return Some(None);
} else if current_extension.is_empty() {
if !self.include_files_without_extension {
return Some(None);
}
} else if all_available_extensions.take(&current_extension).is_some() {
// Found proper extension
return Some(None);
}
Some(Some(BadFileEntry {
path: file_entry.path,
modified_date: file_entry.modified_date,
size: file_entry.size,
current_extension,
proper_extensions: valid_extensions,
}))
})
.while_some()
.filter(Option::is_some)
.map(Option::unwrap)
.collect::<Vec<_>>()
}
fn get_and_validate_extension(&self, file_entry: &FileEntry, proper_extension: &str) -> Option<String> {
let current_extension;
// Extract current extension from file
if let Some(extension) = file_entry.path.extension() {
let extension = extension.to_string_lossy().to_lowercase();
if DISABLED_EXTENSIONS.contains(&extension.as_str()) {
return None;
}
// Text longer than 10 characters is not considered as extension
if extension.len() > 10 {
current_extension = String::new();
} else {
current_extension = extension;
}
} else {
current_extension = String::new();
}
// Already have proper extension, no need to do more things
if current_extension == proper_extension {
return None;
}
Some(current_extension)
}
fn check_for_all_extensions_that_file_can_use(
&self,
hashmap_workarounds: &HashMap<&str, Vec<&str>>,
current_extension: &str,
proper_extension: &str,
) -> (BTreeSet<String>, String) {
let mut all_available_extensions: BTreeSet<String> = Default::default();
let valid_extensions = if current_extension.is_empty() {
String::new()
} else {
for mim in mime_guess::from_ext(proper_extension) {
if let Some(all_ext) = get_mime_extensions(&mim) {
for ext in all_ext {
all_available_extensions.insert((*ext).to_string());
}
}
}
// Workarounds
if let Some(vec_pre) = hashmap_workarounds.get(current_extension) {
for pre in vec_pre {
if all_available_extensions.contains(*pre) {
all_available_extensions.insert(current_extension.to_string());
break;
}
}
}
let mut guessed_multiple_extensions = format!("({proper_extension}) - ");
for ext in &all_available_extensions {
guessed_multiple_extensions.push_str(ext);
guessed_multiple_extensions.push(',');
}
guessed_multiple_extensions.pop();
guessed_multiple_extensions
};
(all_available_extensions, valid_extensions)
}
}
impl Default for BadExtensions {
fn default() -> Self {
Self::new()
}
}
impl DebugPrint for BadExtensions {
#[allow(dead_code)]
#[allow(unreachable_code)]
/// Debugging printing - only available on debug build
fn debug_print(&self) {
#[cfg(not(debug_assertions))]
{
return;
}
println!("---------------DEBUG PRINT---------------");
println!("### Information's");
println!("Errors size - {}", self.text_messages.errors.len());
println!("Warnings size - {}", self.text_messages.warnings.len());
println!("Messages size - {}", self.text_messages.messages.len());
println!("### Other");
println!("Excluded items - {:?}", self.excluded_items.items);
println!("Included directories - {:?}", self.directories.included_directories);
println!("Excluded directories - {:?}", self.directories.excluded_directories);
println!("Recursive search - {}", self.recursive_search);
println!("-----------------------------------------");
}
}
impl SaveResults for BadExtensions {
fn save_results_to_file(&mut self, file_name: &str) -> bool {
let start_time: SystemTime = SystemTime::now();
let file_name: String = match file_name {
"" => "results.txt".to_string(),
k => k.to_string(),
};
let file_handler = match File::create(&file_name) {
Ok(t) => t,
Err(e) => {
self.text_messages.errors.push(format!("Failed to create file {file_name}, reason {e}"));
return false;
}
};
let mut writer = BufWriter::new(file_handler);
if let Err(e) = writeln!(
writer,
"Results of searching {:?} with excluded directories {:?} and excluded items {:?}",
self.directories.included_directories, self.directories.excluded_directories, self.excluded_items.items
) {
self.text_messages.errors.push(format!("Failed to save results to file {file_name}, reason {e}"));
return false;
}
if !self.bad_extensions_files.is_empty() {
writeln!(writer, "Found {} files with invalid extension.", self.information.number_of_files_with_bad_extension).unwrap();
for file_entry in &self.bad_extensions_files {
writeln!(writer, "{} ----- {}", file_entry.path.display(), file_entry.proper_extensions).unwrap();
}
} else {
write!(writer, "Not found any files with invalid extension.").unwrap();
}
Common::print_time(start_time, SystemTime::now(), "save_results_to_file");
true
}
}
impl PrintResults for BadExtensions {
/// Print information's about duplicated entries
/// Only needed for CLI
fn print_results(&self) {
let start_time: SystemTime = SystemTime::now();
println!("Found {} files with invalid extension.\n", self.information.number_of_files_with_bad_extension);
for file_entry in &self.bad_extensions_files {
println!("{} ----- {}", file_entry.path.display(), file_entry.proper_extensions);
}
Common::print_time(start_time, SystemTime::now(), "print_entries");
}
}