From 4b683303934bd656d913c42bc5eb59b7a178925b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <mikrutrafal54@gmail.com>
Date: Fri, 11 Sep 2020 22:32:17 +0200
Subject: [PATCH] Added support for excluded items with wildcard *

---
 czkawka_cli/src/main.rs       | 71 ++++++++++++++++---------------
 czkawka_core/Cargo.toml       |  5 +--
 czkawka_core/src/common.rs    | 78 ++++++++++++++++++++++++++++++++++-
 czkawka_core/src/duplicate.rs | 65 ++++++++++++++++++++++++-----
 4 files changed, 172 insertions(+), 47 deletions(-)

diff --git a/czkawka_cli/src/main.rs b/czkawka_cli/src/main.rs
index 0c65dba..da1939f 100644
--- a/czkawka_cli/src/main.rs
+++ b/czkawka_cli/src/main.rs
@@ -1,3 +1,4 @@
+use czkawka_core::duplicate::Info;
 use czkawka_core::{duplicate, empty_folder};
 use std::{env, process};
 
@@ -132,37 +133,7 @@ fn main() {
 
             df.find_duplicates(&check_method, &delete_method);
 
-            let info = df.get_infos();
-
-            if !info.messages.is_empty() {
-                println!("-------------------------------MESSAGES--------------------------------");
-            }
-            for i in &info.messages {
-                println!("{}", i);
-            }
-            if !info.messages.is_empty() {
-                println!("---------------------------END OF MESSAGES-----------------------------");
-            }
-
-            if !info.warnings.is_empty() {
-                println!("-------------------------------WARNINGS--------------------------------");
-            }
-            for i in &info.warnings {
-                println!("{}", i);
-            }
-            if !info.warnings.is_empty() {
-                println!("---------------------------END OF WARNINGS-----------------------------");
-            }
-
-            if !info.errors.is_empty() {
-                println!("--------------------------------ERRORS---------------------------------");
-            }
-            for i in &info.errors {
-                println!("{}", i);
-            }
-            if !info.errors.is_empty() {
-                println!("----------------------------END OF ERRORS------------------------------");
-            }
+            print_infos(df.get_infos());
         }
         "--h" | "--help" => {
             print_help();
@@ -205,16 +176,18 @@ Usage of Czkawka:
       czkawka --help
       czkawka
 
-  --d <-i directory_to_search> [-e exclude_directories = ""] [-s min_size = 1024] [-x allowed_extension = ""] [-l type_of_search = "hash"] [-delete = "aeo"] - search for duplicates files
+  --d <-i directory_to_search> [-e exclude_directories = ""] [-k excluded_items = ""] [-s min_size = 1024] [-x allowed_extension = ""] [-l type_of_search = "hash"] [-delete = "aeo"] - search for duplicates files
     -i directory_to_search - list of directories which should will be searched like /home/rafal
     -e exclude_directories - list of directories which will be excluded from search.
+    -k excluded_items - list of excluded items, each of which must contain the * wildcard (may be slow)
     -s min_size - minimum size of checked files in bytes, assigning bigger value may speed up searching.
-    -x allowed_extension - list of checked extension, e.g. "jpg,mp4" will allow to check "book.jpg" and "car.mp4" but not roman.png.There are also helpful macros which allow to easy use a typcal extension like IMAGE("jpg,kra,gif,png,bmp,tiff,webp,hdr,svg") or TEXT("txt,doc,docx,odt,rtf")
+    -x allowed_extension - list of checked extension, e.g. "jpg,mp4" will allow to check "book.jpg" and "car.mp4" but not roman.png. There are also helpful macros which allow easy use of a typical extension like IMAGE("jpg,kra,gif,png,bmp,tiff,webp,hdr,svg") or TEXT("txt,doc,docx,odt,rtf")
     -l type_of_search - allows to use fastest which takes into account only size, and more accurate which check if file contnet is same(hashes).
     -delete - delete found files, by default remove all except the most oldest one, it can take arguments: aen(All except newest one), aeo(All except oldest one), on(Only one newest), oo(Only one oldest)
     Usage example:
       czkawka --d -i "/home/rafal/,/home/szczekacz" -e "/home/rafal/Pulpit,/home/rafal/Obrazy" -s 25 -x "7z,rar,IMAGE" -l "size" -delete
       czkawka --d -i "/etc/,/mnt/Miecz" -s 1000 -x "VIDEO" -l "hash"
+      czkawka --d -i "/var/" -k "/var/l*b/,/var/lo*,*tmp"
       czkawka --d -i "/etc/" -delete "aeo"
 
   --e <-i directory_to_search> [-e exclude_directories = ""] [-delete] - option to find and delete empty folders
@@ -225,6 +198,38 @@ Usage of Czkawka:
     "###
     );
 }
+/// Prints the messages, warnings and errors gathered in `infos` to standard output.
+///
+/// Groups are printed in that order, one entry per line. A non-empty group is framed
+/// by a header and a footer line, an empty group prints nothing at all.
+fn print_infos(infos: &Info) {
+    // Informational messages
+    if !infos.messages.is_empty() {
+        println!("-------------------------------MESSAGES--------------------------------");
+        for message in &infos.messages {
+            println!("{}", message);
+        }
+        println!("---------------------------END OF MESSAGES-----------------------------");
+    }
+
+    // Warnings
+    if !infos.warnings.is_empty() {
+        println!("-------------------------------WARNINGS--------------------------------");
+        for warning in &infos.warnings {
+            println!("{}", warning);
+        }
+        println!("---------------------------END OF WARNINGS-----------------------------");
+    }
+
+    // Errors
+    if !infos.errors.is_empty() {
+        println!("--------------------------------ERRORS---------------------------------");
+        for error in &infos.errors {
+            println!("{}", error);
+        }
+        println!("----------------------------END OF ERRORS------------------------------");
+    }
+}
 
 struct ArgumentsPair {
     command: String,
diff --git a/czkawka_core/Cargo.toml b/czkawka_core/Cargo.toml
index 9507239..1754225 100644
--- a/czkawka_core/Cargo.toml
+++ b/czkawka_core/Cargo.toml
@@ -5,7 +5,6 @@ authors = ["Rafał Mikrut <mikrutrafal54@gmail.com>"]
 edition = "2018"
 
 [dependencies]
-humansize = "1.1.0"
+humansize = "1"
 blake3 = "0.3.6"
-#rayon = "1.4.0"
-#regex = "1.3.9"
\ No newline at end of file
+#rayon = "1"
\ No newline at end of file
diff --git a/czkawka_core/src/common.rs b/czkawka_core/src/common.rs
index cd48e2a..d888179 100644
--- a/czkawka_core/src/common.rs
+++ b/czkawka_core/src/common.rs
@@ -5,9 +5,85 @@ use std::time::SystemTime;
 pub struct Common();
 impl Common {
     pub fn print_time(start_time: SystemTime, end_time: SystemTime, function_name: String) {
-        if true {
+        if false {
             return;
         }
         println!("Execution of function \"{}\" took {:?}", function_name, end_time.duration_since(start_time).expect("Time cannot go reverse."));
     }
+
+    /// Checks whether `directory` matches the wildcard `expression`.
+    ///
+    /// `*` stands for any, possibly empty, run of characters; everything else must
+    /// match literally. Unless the expression begins (ends) with `*`, its first
+    /// (last) literal part has to sit at the very start (end) of `directory`.
+    /// E.g. `*home*` matches `/home/rafal`, while `*home` does not match `/home/`.
+    ///
+    /// # Arguments
+    /// * `expression` - pattern containing at least one `*`, e.g. `*/.git/*`
+    /// * `directory` - path of the checked file or folder
+    ///
+    /// # Returns
+    /// `true` when `directory` matches. An expression without `*` (a notice is
+    /// printed to stdout in that case) or made only of `*` never matches.
+    pub fn regex_check(expression: &str, directory: &str) -> bool {
+        if !expression.contains('*') {
+            println!("Expression should have *");
+            return false;
+        }
+
+        // Literal parts located between wildcards, e.g. `*a**b` gives ["a", "b"]
+        let splits: Vec<&str> = expression.split('*').filter(|part| !part.is_empty()).collect();
+        if splits.is_empty() {
+            return false;
+        }
+
+        // `git*` shouldn't be true for `/gitsfafasfs`
+        if !expression.starts_with('*') && !directory.starts_with(splits[0]) {
+            return false;
+        }
+        // `*home` shouldn't be true for `/homeowner`
+        if !expression.ends_with('*') && !directory.ends_with(splits[splits.len() - 1]) {
+            return false;
+        }
+
+        // At the end we check if parts between * are correctly positioned: every part
+        // has to be found behind the END of the previous one. The search offset must
+        // therefore grow by the length of the part that was just matched; growing it
+        // by the length of the following part would accept overlapping parts, reject
+        // valid paths and could index past the end of `directory`, causing a panic.
+        let mut search_from: usize = 0;
+        for split in splits {
+            match directory[search_from..].find(split) {
+                // `found` is relative to `search_from`; the sum is the end of this
+                // match, so it is a valid char boundary not greater than the length
+                Some(found) => search_from += found + split.len(),
+                None => return false,
+            }
+        }
+        true
+    }
+}
+#[cfg(test)]
+mod test {
+    use crate::common::Common;
+
+    #[test]
+    fn test_regex() {
+        assert!(Common::regex_check("*home*", "/home/rafal")); // wildcard on both sides of the part
+        assert!(Common::regex_check("*home", "/home")); // literal part ends the path
+        assert!(Common::regex_check("*home/", "/home/"));
+        assert!(Common::regex_check("*home/*", "/home/")); // trailing * may match nothing
+        assert!(Common::regex_check("*.git*", "/home/.git"));
+        assert!(Common::regex_check("*/home/rafal*rafal*rafal*rafal*", "/home/rafal/rafalrafalrafal")); // several parts in order
+        assert!(!Common::regex_check("*home", "/home/")); // trailing '/' is not covered by the expression
+        assert!(!Common::regex_check("*home", "/homefasfasfasfasf/"));
+        assert!(!Common::regex_check("*home", "/homefasfasfasfasf"));
+        assert!(!Common::regex_check("rafal*afal*fal", "rafal")); // parts must not overlap
+        assert!(!Common::regex_check("AAAAAAAA****", "/AAAAAAAAAAAAAAAAA")); // no leading *, so the path must start with the part
+        assert!(!Common::regex_check("*.git/*", "/home/.git")); // ".git/" needs the trailing slash
+        assert!(!Common::regex_check("*home/*koc", "/koc/home/")); // "koc" has to end the path
+        assert!(!Common::regex_check("*home/", "/home"));
+        assert!(!Common::regex_check("*TTT", "/GGG"));
+        assert!(!Common::regex_check("AAA", "AAA")); // expression without * never matches
+    }
 }
diff --git a/czkawka_core/src/duplicate.rs b/czkawka_core/src/duplicate.rs
index 1f99a3d..b3ecc8c 100644
--- a/czkawka_core/src/duplicate.rs
+++ b/czkawka_core/src/duplicate.rs
@@ -1,4 +1,3 @@
-// TODO when using GUI all or most println!() should be used as variables passed by argument
 use humansize::{file_size_opts as options, FileSize};
 use std::collections::{BTreeMap, HashMap};
 use std::fs;
@@ -38,7 +37,7 @@ pub struct DuplicateFinder {
     files_with_identical_size: HashMap<u64, Vec<FileEntry>>,
     files_with_identical_hashes: BTreeMap<u64, Vec<Vec<FileEntry>>>,
     allowed_extensions: Vec<String>, // jpg, jpeg, mp4
-    // excluded_items: Vec<String>, // TODO, support for e.g. */.git/*
+    excluded_items: Vec<String>,     // wildcard expressions, e.g. */.git/*
     excluded_directories: Vec<String>,
     included_directories: Vec<String>,
     min_file_size: u64,
@@ -90,7 +89,7 @@ impl DuplicateFinder {
             infos: Info::new(),
             files_with_identical_size: Default::default(),
             files_with_identical_hashes: Default::default(),
-            // excluded_items: vec![],
+            excluded_items: vec![],
             excluded_directories: vec![],
             included_directories: vec![],
             min_file_size: 1024,
@@ -117,9 +116,32 @@ impl DuplicateFinder {
         self.min_file_size = min_size;
     }
 
-    pub fn set_excluded_items(&mut self, _excluded_items: String) {
-        // TODO Still don't know how to exactly parse this
-        // Things like /.git/ should be by default hidden with help of this *.git*
+    /// Sets wildcard expressions used to exclude files and folders from the search.
+    ///
+    /// `excluded_items` is a comma separated list; `"` characters are dropped and every
+    /// entry is trimmed. Empty entries are skipped silently, entries without `*` are
+    /// skipped with a warning. An empty argument leaves the current list untouched.
+    pub fn set_excluded_items(&mut self, mut excluded_items: String) {
+        if excluded_items.is_empty() {
+            return;
+        }
+        excluded_items = excluded_items.replace("\"", "");
+
+        let mut accepted: Vec<String> = Vec::new();
+        for raw_entry in excluded_items.split(',') {
+            let entry: &str = raw_entry.trim();
+            if entry.is_empty() {
+                continue;
+            }
+            if entry.contains('*') {
+                accepted.push(entry.to_string());
+            } else {
+                // Entries without a wildcard are only reported, never stored
+                self.infos.warnings.push("Excluded Items Warning: Wildcard * is required in expression, ignoring ".to_string() + entry);
+            }
+        }
+
+        self.excluded_items = accepted;
     }
     pub fn set_allowed_extensions(&mut self, mut allowed_extensions: String) {
         if allowed_extensions.is_empty() {
@@ -320,6 +342,7 @@ impl DuplicateFinder {
 
                     let mut is_excluded_dir = false;
                     next_folder = "".to_owned() + &current_folder + &entry_data.file_name().into_string().unwrap() + "/";
+
                     for ed in &self.excluded_directories {
                         if next_folder == *ed {
                             is_excluded_dir = true;
@@ -327,6 +350,16 @@ impl DuplicateFinder {
                         }
                     }
                     if !is_excluded_dir {
+                        let mut found_expression: bool = false;
+                        for expression in &self.excluded_items {
+                            if Common::regex_check(expression, &next_folder) {
+                                found_expression = true;
+                                break;
+                            }
+                        }
+                        if found_expression {
+                            break;
+                        }
                         folders_to_check.push(next_folder);
                     }
                     self.infos.number_of_checked_folders += 1;
@@ -334,6 +367,7 @@ impl DuplicateFinder {
                     let mut have_valid_extension: bool;
                     let file_name_lowercase: String = entry_data.file_name().into_string().unwrap().to_lowercase();
 
+                    // Checking allowed extensions
                     if !self.allowed_extensions.is_empty() {
                         have_valid_extension = false;
                         for i in &self.allowed_extensions {
@@ -346,9 +380,23 @@ impl DuplicateFinder {
                         have_valid_extension = true;
                     }
 
+                    // Checking files
                     if metadata.len() >= self.min_file_size && have_valid_extension {
                         let current_file_name = "".to_owned() + &current_folder + &entry_data.file_name().into_string().unwrap();
 
+                        // Checking expressions
+                        let mut found_expression: bool = false;
+                        for expression in &self.excluded_items {
+                            if Common::regex_check(expression, &current_file_name) {
+                                found_expression = true;
+                                break;
+                            }
+                        }
+                        if found_expression {
+                            break;
+                        }
+
+                        // Creating new file entry
                         let fe: FileEntry = FileEntry {
                             path: current_file_name.clone(),
                             size: metadata.len(),
@@ -384,7 +432,7 @@ impl DuplicateFinder {
         Common::print_time(start_time, SystemTime::now(), "check_files_size".to_string());
         //println!("Duration of finding duplicates {:?}", end_time.duration_since(start_time).expect("a"));
     }
-    // pub fn save_results_to_file(&self) {}
+    // pub fn save_results_to_file(&self) {} // TODO Saving results to files
 
     /// Remove files which have unique size
     fn remove_files_with_unique_size(&mut self) {
@@ -541,9 +589,6 @@ impl DuplicateFinder {
         Common::print_time(start_time, SystemTime::now(), "print_duplicated_entries".to_string());
     }
     /// Remove unused entries when included or excluded overlaps with each other or are duplicated
-    /// ```
-    // let df : DuplicateFinder = saf
-    /// ```
     fn optimize_directories(&mut self) -> bool {
         let start_time: SystemTime = SystemTime::now();