From fe9604a772d28f0d5f0ab123f3b1a4adfbe967cb Mon Sep 17 00:00:00 2001
From: Cristian
Date: Fri, 21 Aug 2020 13:32:31 -0500
Subject: [PATCH] feat: Add tests for remove command

---
 archivebox/extractors/__init__.py |  1 -
 archivebox/index/__init__.py      |  3 +-
 archivebox/index/sql.py           |  1 -
 tests/test_remove.py              | 71 +++++++++++++++++++++++++++++--
 4 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 567e1bf3..d10d3ab1 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -9,7 +9,6 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
-    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 5b3803ea..ac6c85d6 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -1,6 +1,5 @@
 __package__ = 'archivebox.index'
 
-import re
 import os
 import shutil
 import json as pyjson
@@ -373,7 +372,7 @@ LINK_FILTERS = {
     'exact': lambda pattern: Q(url=pattern),
     'substring': lambda pattern: Q(url__icontains=pattern),
     'regex': lambda pattern: Q(url__iregex=pattern),
-    'domain': lambda pattern: Q(domain=pattern),
+    'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
 }
 
 @enforce_types
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index eed92697..13bb7137 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -24,7 +24,6 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
 @enforce_types
 def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
     from django.db import transaction
 
     with transaction.atomic():
diff --git a/tests/test_remove.py b/tests/test_remove.py
index d26c96bb..fced2da3 100644
--- a/tests/test_remove.py
+++ b/tests/test_remove.py
@@ -1,8 +1,71 @@
+import os
+import sqlite3
+
 from .fixtures import *
 
-def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
+def test_remove_single_page(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
-    remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
-    list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
-    assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")
\ No newline at end of file
+    remove_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
+    assert "Found 1 matching URLs to remove" in remove_process.stdout.decode("utf-8")
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
+    conn.commit()
+    conn.close()
+
+    assert count == 0
+
+
+def test_remove_single_page_filesystem(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes', '--delete'], capture_output=True)
+
+    assert list((tmp_path / "archive").iterdir()) == []
+
+def test_remove_regex(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete'], capture_output=True)
+
+    assert list((tmp_path / "archive").iterdir()) == []
+
+def test_remove_exact(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=exact', 'http://127.0.0.1:8080/static/iana.org.html', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 1
+
+def test_remove_substr(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', '--filter-type=substring', 'example.com', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 1
+
+def test_remove_domain(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=domain', '127.0.0.1', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 0
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
+    conn.commit()
+    conn.close()
+
+    assert count == 0
\ No newline at end of file