From 8c4ae73d657055fcdef5da44461e8b126d9f0589 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 23 Dec 2020 14:51:42 -0500 Subject: [PATCH] refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command --- archivebox/core/admin.py | 2 +- archivebox/core/models.py | 23 ++++- archivebox/extractors/__init__.py | 84 +++++++--------- archivebox/extractors/title.py | 23 +++-- archivebox/index/__init__.py | 151 +++++++++++++---------------- archivebox/index/html.py | 5 +- archivebox/index/json.py | 31 ++++-- archivebox/index/sql.py | 46 ++++----- archivebox/main.py | 51 +++++----- archivebox/parsers/__init__.py | 22 +++-- archivebox/parsers/generic_html.py | 7 +- archivebox/parsers/generic_txt.py | 14 +-- archivebox/search/__init__.py | 20 ++-- 13 files changed, 246 insertions(+), 233 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 832bea38..4eda8b59 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html from logging_util import printable_filesize from main import add, remove from config import OUTPUT_DIR -from extractors import archive_links +from extractors import archive_snapshots # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 13d75b66..1f799156 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.core' import uuid +from pathlib import Path from django.db import models, transaction from django.utils.functional import cached_property @@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField from ..util import parse_date from ..index.schema import Link -from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE +from ..config import CONFIG -EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] +EXTRACTORS = ["title", "wget"] STATUS_CHOICES = [ ("succeeded", "succeeded"), ("failed", "failed"), @@ -89,6 +91,7 @@ class Snapshot(models.Model): title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + @classmethod def from_json(cls, info: dict): info = {k: v for k, v in info.items() if k in cls.keys} @@ -133,8 +136,9 @@ class Snapshot(models.Model): return self.as_link().base_url @cached_property - def link_dir(self): - return self.as_link().link_dir + def snapshot_dir(self): + from ..config import CONFIG + return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp) @cached_property def archive_path(self): @@ -158,6 +162,16 @@ class Snapshot(models.Model): return self.history['title'][-1].output.strip() return None + def _asdict(self): + return { + "id": str(self.id), + "url": self.url, + "timestamp": self.timestamp, + "title": self.title, + "added": self.added, + "updated": self.updated, + } + def save_tags(self, tags=()): tags_id = [] for tag in tags: @@ -168,6 +182,7 @@ class Snapshot(models.Model): class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): + from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') diff --git a/archivebox/extractors/__init__.py 
b/archivebox/extractors/__init__.py index a4acef0b..120d116a 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -4,19 +4,20 @@ import os from pathlib import Path from typing import Optional, List, Iterable, Union -from datetime import datetime -from django.db.models import QuerySet -from ..index.schema import Link -from ..index.sql import write_link_to_sql_index +from datetime import datetime +from django.db.models import QuerySet, Model + +from ..index.sql import write_snapshot_to_index from ..index import ( - load_link_details, - write_link_details, + load_snapshot_details, + write_snapshot_details, ) from ..util import enforce_types from ..logging_util import ( log_archiving_started, log_archiving_paused, + log_archiving_finished, log_link_archiving_started, log_link_archiving_finished, @@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]): return list(methods) @enforce_types -def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link: +def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" - - # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. - from core.models import Snapshot, ArchiveResult - try: - snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot - except Snapshot.DoesNotExist: - snapshot = write_link_to_sql_index(link) + from core.models import ArchiveResult ARCHIVE_METHODS = get_default_archive_methods() @@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s if method[0] in methods ] - out_dir = out_dir or Path(link.link_dir) + out_dir = out_dir or Path(snapshot.snapshot_dir) try: is_new = not Path(out_dir).exists() if is_new: os.makedirs(out_dir) + details = {"history": {}} + write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False) + else: + details = list(load_snapshot_details(snapshot)) - link = load_link_details(link, out_dir=out_dir) - write_link_details(link, out_dir=out_dir, skip_sql_index=False) - log_link_archiving_started(link, out_dir, is_new) - link = link.overwrite(updated=datetime.now()) + #log_link_archiving_started(link, out_dir, is_new) stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} for method_name, should_run, method_function in ARCHIVE_METHODS: try: - if method_name not in link.history: - link.history[method_name] = [] + if method_name not in details["history"]: + details["history"][method_name] = [] - if should_run(link, out_dir) or overwrite: + if should_run(snapshot, out_dir) or overwrite: log_archive_method_started(method_name) - result = method_function(link=link, out_dir=out_dir) + result = method_function(snapshot=snapshot, out_dir=out_dir) - link.history[method_name].append(result) + details["history"][method_name].append(result) stats[result.status] += 1 log_archive_method_finished(result) - write_search_index(link=link, texts=result.index_texts) + write_search_index(snapshot=snapshot, texts=result.index_texts) ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) @@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: 
Optional[Iterable[s except Exception as e: raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( method_name, - link.url, + snapshot.url, )) from e # print(' ', stats) @@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s try: latest_title = link.history['title'][-1].output.strip() if latest_title and len(latest_title) >= len(link.title or ''): - link = link.overwrite(title=latest_title) + snapshot.title = latest_title except Exception: pass - write_link_details(link, out_dir=out_dir, skip_sql_index=False) + write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False) - log_link_archiving_finished(link, link.link_dir, is_new, stats) + log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats) except KeyboardInterrupt: try: - write_link_details(link, out_dir=link.link_dir) + write_snapshot_details(snapshot, out_dir=snapshot.snapshot_dir) except: pass raise @@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) raise - return link + return snapshot @enforce_types -def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]: +def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet: - if type(all_links) is QuerySet: - num_links: int = all_links.count() - get_link = lambda x: x.as_link() - all_links = all_links.iterator() - else: - num_links: int = len(all_links) - get_link = lambda x: x + all_snapshots = list(all_snapshots) + num_snapshots: int = len(all_snapshots) - if num_links == 0: + if num_snapshots == 0: return [] - log_archiving_started(num_links) + log_archiving_started(num_snapshots) idx: int = 0 try: - for link in all_links: + for snapshot in all_snapshots: idx += 1 - to_archive = get_link(link) - archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir)) + archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir)) except KeyboardInterrupt: - log_archiving_paused(num_links, idx, link.timestamp) + log_archiving_paused(num_snapshots, idx, snapshot.timestamp) raise SystemExit(0) except BaseException: print() raise - log_archiving_finished(num_links) - return all_links + log_archiving_finished(num_snapshots) + return all_snapshots diff --git a/archivebox/extractors/title.py index 28cb128f..519c5961 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -5,7 +5,9 @@ from html.parser import HTMLParser from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from django.db.models import Model + +from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError from ..util import ( enforce_types, is_static_file, @@ -61,12 +63,12 @@ class TitleParser(HTMLParser): @enforce_types -def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool: # if link already has valid title, skip it - if link.title and not link.title.lower().startswith('http'): +def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool: # if link already has valid title, skip it + if snapshot.title and not snapshot.title.lower().startswith('http'): return False - if is_static_file(link.url): + if
is_static_file(snapshot.url): return False return SAVE_TITLE @@ -77,7 +79,7 @@ def extract_title_with_regex(html): return output @enforce_types -def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: +def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """try to guess the page's title from its content""" from core.models import Snapshot @@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - link.url, + snapshot.url, ] status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - html = download_url(link.url, timeout=timeout) + html = download_url(snapshot.url, timeout=timeout) try: # try using relatively strict html parser first parser = TitleParser() @@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - # if title is better than the one in the db, update db with new title if isinstance(output, str) and output: - if not link.title or len(output) >= len(link.title): - Snapshot.objects.filter(url=link.url, - timestamp=link.timestamp)\ + if not snapshot.title or len(output) >= len(snapshot.title): + Snapshot.objects.filter(url=snapshot.url, + timestamp=snapshot.timestamp)\ .update(title=output) + snapshot.title = output else: raise ArchiveError('Unable to detect page title') except Exception as err: diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 8eab1d38..32af7c1d 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable from collections import OrderedDict from contextlib import contextmanager from urllib.parse import urlparse -from django.db.models import QuerySet, Q +from django.db.models import QuerySet, Q, Model from ..util import ( scheme, @@ -39,15 +39,15 @@ from ..logging_util import ( from .schema import Link, ArchiveResult from .html import ( - write_html_link_details, + write_html_snapshot_details, ) from .json import ( - parse_json_link_details, - write_json_link_details, + parse_json_snapshot_details, + write_json_snapshot_details, ) from .sql import ( write_sql_main_index, - write_sql_link_details, + write_sql_snapshot_details, ) from ..search import search_backend_enabled, query_search_index @@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index ### Link filtering and checking @enforce_types -def merge_links(a: Link, b: Link) -> Link: - """deterministially merge two links, favoring longer field values over shorter, +def merge_snapshots(a: Model, b: Model) -> Model: + """deterministially merge two snapshots, favoring longer field values over shorter, and "cleaner" values over worse ones. 
+ TODO: Check if this makes sense with the new setup """ + return a assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})' # longest url wins (because a fuzzy url will always be shorter) @@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link: key=lambda result: result.start_ts, ))) - return Link( + return Snapshot( url=url, timestamp=timestamp, title=title, tags=tags, - sources=sources, - history=history, + #sources=sources, + #history=history, ) @enforce_types -def validate_links(links: Iterable[Link]) -> List[Link]: +def validate_snapshots(snapshots: List[Model]) -> List[Model]: timer = TimedProgress(TIMEOUT * 4) try: - links = archivable_links(links) # remove chrome://, about:, mailto: etc. - links = sorted_links(links) # deterministically sort the links based on timestamp, url - links = fix_duplicate_links(links) # merge/dedupe duplicate timestamps & urls + snapshots = archivable_snapshots(snapshots) # remove chrome://, about:, mailto: etc. + snapshots = sorted_snapshots(snapshots) # deterministically sort the links based on timestamp, url + snapshots = fix_duplicate_snapshots(snapshots) # merge/dedupe duplicate timestamps & urls finally: timer.end() - return list(links) + return list(snapshots) @enforce_types -def archivable_links(links: Iterable[Link]) -> Iterable[Link]: +def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]: """remove chrome://, about:// or other schemed links that cant be archived""" - for link in links: + for snapshot in snapshots: try: - urlparse(link.url) + urlparse(snapshot.url) except ValueError: continue - if scheme(link.url) not in ('http', 'https', 'ftp'): + if scheme(snapshot.url) not in ('http', 'https', 'ftp'): continue - if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url): + if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url): continue - yield link + yield snapshot @enforce_types -def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]: +def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]: """ ensures that all non-duplicate links have monotonically increasing timestamps + TODO: Review how to do this with the new snapshots refactor """ - # from core.models import Snapshot - + return sorted_snapshots unique_urls: OrderedDict[str, Link] = OrderedDict() - for link in sorted_links: - if link.url in unique_urls: + for snapshot in sorted_snapshots: + if snapshot.url in unique_urls: # merge with any other links that share the same url link = merge_links(unique_urls[link.url], link) unique_urls[link.url] = link @@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]: @enforce_types -def sorted_links(links: Iterable[Link]) -> Iterable[Link]: - sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url) - return sorted(links, key=sort_func, reverse=True) +def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]: + sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url) + return sorted(snapshots, key=sort_func, reverse=True) @enforce_types @@ -222,14 +224,14 @@ def timed_index_update(out_path: Path): @enforce_types -def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: +def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None: """Writes links to sqlite3 file for a given list of links""" - log_indexing_process_started(len(links)) + log_indexing_process_started(len(snapshots)) try: with 
timed_index_update(out_dir / SQL_INDEX_FILENAME): - write_sql_main_index(links, out_dir=out_dir) + write_sql_main_index(snapshots, out_dir=out_dir) os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes except (KeyboardInterrupt, SystemExit): @@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: - """parse and load existing index with any new links from import_path merged in""" + """ + Returns all of the snapshots currently in index + """ + setup_django(out_dir, check_db=True) from core.models import Snapshot try: return Snapshot.objects.all() @@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]: @enforce_types -def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]: +def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]: - from ..parsers import parse_links + from ..parsers import parse_snapshots - new_links: List[Link] = [] + new_links: List[Model] = [] # parse and validate the import file - raw_links, parser_name = parse_links(source_path, root_url=root_url) - new_links = validate_links(raw_links) + raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url) + new_snapshots = validate_snapshots(raw_snapshots) if parser_name: - num_parsed = len(raw_links) + num_parsed = len(raw_snapshots) log_parsing_finished(num_parsed, parser_name) - return new_links + return new_snapshots @enforce_types -def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]: +def filter_new_urls(snapshots: QuerySet, + new_snapshots: List) -> List: """ - Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB. + Returns a list of Snapshots corresponding to the urls that were not present in the index """ - unique_urls: OrderedDict[str, Link] = OrderedDict() + urls = {snapshot.url: snapshot for snapshot in new_snapshots} + filtered_snapshots = snapshots.filter(url__in=urls.keys()) - for link in links: - index_link = snapshots.filter(url=link.url) - if index_link: - link = merge_links(index_link[0].as_link(), link) - - unique_urls[link.url] = link - - return unique_urls.values() - -@enforce_types -def dedupe_links(snapshots: QuerySet, - new_links: List[Link]) -> List[Link]: - """ - The validation of links happened at a different stage. This method will - focus on actual deduplication and timestamp fixing. 
- """ + for found_snapshot in filtered_snapshots: + urls.pop(found_snapshot.url) - # merge existing links in out_dir and new links - dedup_links = fix_duplicate_links_in_index(snapshots, new_links) + log_deduping_finished(len(urls.keys())) - new_links = [ - link for link in new_links - if not snapshots.filter(url=link.url).exists() - ] - - dedup_links_dict = {link.url: link for link in dedup_links} - - # Replace links in new_links with the dedup version - for i in range(len(new_links)): - if new_links[i].url in dedup_links_dict.keys(): - new_links[i] = dedup_links_dict[new_links[i].url] - log_deduping_finished(len(new_links)) - - return new_links + return list(urls.values()) ### Link Details Index @enforce_types -def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None: - out_dir = out_dir or link.link_dir +def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None: + out_dir = out_dir or snapshot.snapshot_dir - write_json_link_details(link, out_dir=out_dir) - write_html_link_details(link, out_dir=out_dir) + write_json_snapshot_details(snapshot, out_dir=out_dir) + #write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too if not skip_sql_index: - write_sql_link_details(link) + write_sql_snapshot_details(snapshot) @enforce_types -def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link: +def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model: """check for an existing link archive in the given directory, and load+merge it into the given link dict """ - out_dir = out_dir or link.link_dir + out_dir = out_dir or snapshot.snapshot_dir - existing_link = parse_json_link_details(out_dir) - if existing_link: - return merge_links(existing_link, link) + existing_snapshot = parse_json_snapshot_details(out_dir) + if existing_snapshot: + return merge_snapshots(existing_snapshot, snapshot) - return link + return snapshot diff --git a/archivebox/index/html.py b/archivebox/index/html.py index a62e2c7e..d1bd5ee2 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping from pathlib import Path from django.utils.html import format_html +from django.db.models import Model from collections import defaultdict from .schema import Link @@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> ### Link Details Index @enforce_types -def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: - out_dir = out_dir or link.link_dir +def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None: + out_dir = out_dir or snapshot.snapshot_dir rendered_html = link_details_template(link) atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html) diff --git a/archivebox/index/json.py b/archivebox/index/json.py index f24b969f..ed4c255d 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -7,6 +7,7 @@ from pathlib import Path from datetime import datetime from typing import List, Optional, Iterator, Any, Union +from django.db.models import Model from .schema import Link from ..system import atomic_write @@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: ### Link Details Index @enforce_types -def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: - """write a json file with some info about the link""" +def 
write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None: + """write a json file with some info about the snapshot""" - out_dir = out_dir or link.link_dir + out_dir = out_dir or snapshot.snapshot_dir path = Path(out_dir) / JSON_INDEX_FILENAME - atomic_write(str(path), link._asdict(extended=True)) + print(snapshot._asdict()) + atomic_write(str(path), snapshot._asdict()) @enforce_types -def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]: +def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]: """load the json link index from a given directory""" existing_index = Path(out_dir) / JSON_INDEX_FILENAME if existing_index.exists(): @@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal pass return None +@enforce_types +def load_snapshot_details(snapshot: Model, out_dir: Path): + """ + Loads the detail from the local json index + """ + existing_index = Path(out_dir) / JSON_INDEX_FILENAME + if existing_index.exists(): + with open(existing_index, 'r', encoding='utf-8') as f: + try: + return pyjson.load(f) + except pyjson.JSONDecodeError: + pass + return None + + @enforce_types -def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]: +def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]: """read through all the archive data folders and return the parsed links""" for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): if (Path(entry.path) / 'index.json').exists(): try: - link = parse_json_link_details(entry.path) + link = parse_json_snapshot_details(entry.path) except KeyError: link = None if link: diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 1e99f67c..d32a1468 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -3,8 +3,9 @@ __package__ = 'archivebox.index' from io import StringIO from pathlib import Path from typing import List, Tuple, Iterator -from django.db.models import QuerySet +from django.db.models import QuerySet, Model from django.db import transaction +from datetime import datetime from .schema import Link from ..util import enforce_types @@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> snapshots.delete() @enforce_types -def write_link_to_sql_index(link: Link): +def write_snapshot_to_index(snapshot: Model): from core.models import Snapshot - info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} - tags = info.pop("tags") - if tags is None: - tags = [] - try: - info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp + timestamp = Snapshot.objects.get(url=snapshot.url).timestamp except Snapshot.DoesNotExist: - while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): - info["timestamp"] = str(float(info["timestamp"]) + 1.0) + timestamp = snapshot.timestamp + if not timestamp: + timestamp = str(datetime.now().timestamp()) + while Snapshot.objects.filter(timestamp=timestamp).exists(): + print("the timestamp is: ", timestamp) + timestamp = str(float(timestamp) + 1.0) - snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info) - snapshot.save_tags(tags) + snapshot.timestamp = timestamp + snapshot.save() return snapshot @@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link): def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: with transaction.atomic(): for link in 
links: - write_link_to_sql_index(link) + write_snapshot_to_index(link) @enforce_types -def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: +def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None: from core.models import Snapshot with transaction.atomic(): try: - snap = Snapshot.objects.get(url=link.url) + snap = Snapshot.objects.get(url=snapshot.url) except Snapshot.DoesNotExist: - snap = write_link_to_sql_index(link) + snap = write_snapshot_to_index(snapshot) - snap.title = link.title + snap.title = snapshot.title - tag_set = ( - set(tag.strip() for tag in (link.tags or '').split(',')) - ) - tag_list = list(tag_set) or [] + # TODO: If there are actual tags, this will break + #tag_set = ( + # set(tag.strip() for tag in (snapshot.tags.all() or '').split(',')) + #) + #tag_list = list(tag_set) or [] snap.save() - snap.save_tags(tag_list) + #snap.save_tags(tag_list) + return snap diff --git a/archivebox/main.py index eb8cd6a0..71147f59 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .index import ( load_main_index, - parse_links_from_source, - dedupe_links, + get_empty_snapshot_queryset, + parse_snapshots_from_source, + filter_new_urls, write_main_index, snapshot_filter, get_indexed_folders, @@ -44,11 +45,11 @@ from .index import ( get_corrupted_folders, get_unrecognized_folders, fix_invalid_folder_locations, - write_link_details, + write_snapshot_details, ) from .index.json import ( parse_json_main_index, - parse_json_links_details, + parse_json_snapshot_details, generate_json_index_from_links, ) from .index.sql import ( @@ -60,7 +61,7 @@ from .index.html import ( generate_index_from_links, ) from .index.csv import links_to_csv -from .extractors import archive_links, archive_link, ignore_methods +from .extractors import archive_snapshots, archive_snapshot, ignore_methods from .config import ( stderr, hint, @@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]], extractors: str="", out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" + from core.models import Snapshot assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' @@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]], # Load list of links from the existing index check_data_folder(out_dir=out_dir) check_dependencies() - new_links: List[Link] = [] - all_links = load_main_index(out_dir=out_dir) + new_snapshots: List[Snapshot] = [] + all_snapshots = load_main_index(out_dir=out_dir) log_importing_started(urls=urls, depth=depth, index_only=index_only) if isinstance(urls, str): @@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]], # save verbatim args to sources write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) - new_links += parse_links_from_source(write_ahead_log, root_url=None) + new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None) # If we're going one level deeper, download each link and look for more links - new_links_depth = [] - if new_links and depth == 1: - log_crawl_started(new_links) - for new_link in new_links: - downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) - new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) + new_snapshots_depth = [] + if new_snapshots and
depth == 1: + log_crawl_started(new_snapshots) + for new_snapshot in new_snapshots: + # TODO: Check if we need to add domain to the Snapshot model + downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir) + new_snapshots_depth += parse_snapshots_from_source(downloaded_file, root_url=new_snapshot.url) - imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) - new_links = dedupe_links(all_links, imported_links) + imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth] + new_snapshots = filter_new_urls(all_snapshots, imported_snapshots) - write_main_index(links=new_links, out_dir=out_dir) + write_main_index(snapshots=new_snapshots, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) if index_only: @@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]], if extractors: archive_kwargs["methods"] = extractors if update_all: - archive_links(all_links, overwrite=overwrite, **archive_kwargs) + archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs) elif overwrite: - archive_links(imported_links, overwrite=True, **archive_kwargs) + archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs) - elif new_links: - archive_links(new_links, overwrite=False, **archive_kwargs) + elif new_snapshots: + archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs) - return all_links + return all_snapshots @enforce_types def remove(filter_str: Optional[str]=None, @@ -711,7 +714,7 @@ def update(resume: Optional[float]=None, if index_only: for link in all_links: - write_link_details(link, out_dir=out_dir, skip_sql_index=True) + write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True) index_links(all_links, out_dir=out_dir) return all_links @@ -733,7 +736,7 @@ def update(resume: Optional[float]=None, if extractors: archive_kwargs["methods"] = extractors - archive_links(to_archive, overwrite=overwrite, **archive_kwargs) + archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs) # Step 4: Re-write links index with updated titles, icons, and resources all_links = load_main_index(out_dir=out_dir) diff --git a/archivebox/parsers/__init__.py index 441c08ac..8b10d794 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional from datetime import datetime from pathlib import Path +from django.db.models import Model + from ..system import atomic_write from ..config import ( ANSI, @@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None): @enforce_types -def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]: +def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]: """parse a list of URLs with their metadata from an RSS feed, bookmarks export, or text file """ @@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li timer = TimedProgress(TIMEOUT * 4) with open(source_file, 'r', encoding='utf-8') as file: - links, parser = run_parser_functions(file, timer, root_url=root_url) + snapshots, parser = run_parser_functions(file, timer, root_url=root_url) timer.end() if parser is None: return [], 'Failed to parse' - return links, parser + return snapshots, parser -def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
-> Tuple[List[Link], Optional[str]]: - most_links: List[Link] = [] +def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]: + most_snapshots: List[Model] = [] best_parser_name = None for parser_name, parser_func in PARSERS: try: - parsed_links = list(parser_func(to_parse, root_url=root_url)) - if not parsed_links: + parsed_snapshots = list(parser_func(to_parse, root_url=root_url)) + if not parsed_snapshots: raise Exception('no links found') # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed') - if len(parsed_links) > len(most_links): - most_links = parsed_links + if len(parsed_snapshots) > len(most_snapshots): + most_snapshots = parsed_snapshots best_parser_name = parser_name except Exception as err: # noqa @@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) # raise pass timer.end() - return most_links, best_parser_name + return most_snapshots, best_parser_name @enforce_types diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py index 74b3d1fc..99aebf5c 100644 --- a/archivebox/parsers/generic_html.py +++ b/archivebox/parsers/generic_html.py @@ -31,6 +31,7 @@ class HrefParser(HTMLParser): @enforce_types def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]: """Parse Generic HTML for href tags and use only the url (support for title coming later)""" + from core.models import Snapshot html_file.seek(0) for line in html_file: @@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, url = urljoin(root_url, url) for archivable_url in re.findall(URL_REGEX, url): - yield Link( + yield Snapshot( url=htmldecode(archivable_url), timestamp=str(datetime.now().timestamp()), title=None, - tags=None, - sources=[html_file.name], + #tags=None, + #sources=[html_file.name], ) diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index e296ec7e..616f226f 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -18,6 +18,8 @@ from ..util import ( @enforce_types def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse raw links from each line in a text file""" + # TODO: Check if we should add sources list to the database + from core.models import Snapshot text_file.seek(0) for line in text_file.readlines(): @@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]: # otherwise look for anything that looks like a URL in the line for url in re.findall(URL_REGEX, line): - yield Link( + yield Snapshot( url=htmldecode(url), timestamp=str(datetime.now().timestamp()), title=None, - tags=None, - sources=[text_file.name], + #tags=None, + #sources=[text_file.name], ) # look inside the URL for any sub-urls, e.g. 
for archive.org links # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ for url in re.findall(URL_REGEX, line[1:]): - yield Link( + yield Snapshot( url=htmldecode(url), timestamp=str(datetime.now().timestamp()), title=None, - tags=None, - sources=[text_file.name], + #tags=None, + #sources=[text_file.name], ) diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 6191ede9..d958f324 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -2,7 +2,7 @@ from typing import List, Union from pathlib import Path from importlib import import_module -from django.db.models import QuerySet +from django.db.models import QuerySet, Model from archivebox.index.schema import Link from archivebox.util import enforce_types @@ -28,24 +28,22 @@ def import_backend(): return backend @enforce_types -def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: +def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: if not indexing_enabled(): return if not skip_text_index and texts: from core.models import Snapshot - snap = Snapshot.objects.filter(url=link.url).first() backend = import_backend() - if snap: - try: - backend.index(snapshot_id=str(snap.id), texts=texts) - except Exception as err: - stderr() - stderr( - f'[X] The search backend threw an exception={err}:', + try: + backend.index(snapshot_id=str(snapshot.id), texts=texts) + except Exception as err: + stderr() + stderr( + f'[X] The search backend threw an exception={err}:', color='red', - ) + ) @enforce_types def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
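
For context, the sketch below shows how the snapshot-based pipeline introduced by this patch is meant to be driven end to end, using only names the patch itself defines (load_main_index, parse_snapshots_from_source, filter_new_urls, write_main_index, archive_snapshots, and Snapshot.snapshot_dir). It is a simplified illustration of the new add() flow in main.py rather than a drop-in replacement for it, and it assumes an already-initialized ArchiveBox collection at OUTPUT_DIR with this refactor applied.

# Rough usage sketch of the refactored snapshot pipeline (assumes this patch is applied).
from pathlib import Path

from archivebox.config import OUTPUT_DIR
from archivebox.extractors import archive_snapshots
from archivebox.index import (
    load_main_index,              # QuerySet of Snapshots already in the SQLite index
    parse_snapshots_from_source,  # parses a saved source file into unsaved Snapshot objects
    filter_new_urls,              # drops Snapshots whose URL is already indexed
    write_main_index,             # persists the new Snapshots (assigns their timestamps)
)

def add_urls_sketch(source_path: str) -> None:
    all_snapshots = load_main_index(out_dir=Path(OUTPUT_DIR))
    imported = parse_snapshots_from_source(source_path, root_url=None)
    new_snapshots = filter_new_urls(all_snapshots, imported)

    # Write the index rows first: Snapshot.snapshot_dir is derived from the
    # timestamp that write_snapshot_to_index() assigns, so the extractors can
    # only know where to write their output after the row exists.
    write_main_index(snapshots=new_snapshots, out_dir=Path(OUTPUT_DIR))
    archive_snapshots(new_snapshots, overwrite=False)

The ordering mirrors the patched add(): write_main_index() runs before archive_snapshots() because each Snapshot's on-disk folder (ARCHIVE_DIR/<timestamp>) only becomes known once its timestamp has been assigned and saved.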