
refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command

Cristian 2020-12-23 14:51:42 -05:00
parent 8e2270e21b
commit 8c4ae73d65
13 changed files with 246 additions and 233 deletions
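
At a high level, this commit swaps the in-memory Link dataclass for the Snapshot Django model as the unit passed between the parsers, the index, and the extractors. A rough sketch of the call chain it is working toward (hedged: assumes an initialized ArchiveBox data directory, Django settings loaded via setup_django, and that the renamed functions below survive later cleanup commits):

from pathlib import Path
from archivebox.config import OUTPUT_DIR, setup_django
from archivebox.index import load_main_index
from archivebox.extractors import archive_snapshots

setup_django(OUTPUT_DIR, check_db=True)                 # Snapshot is a Django model, so the ORM must be ready
snapshots = load_main_index(out_dir=Path(OUTPUT_DIR))   # now returns Snapshot.objects.all() instead of Link objects
archive_snapshots(snapshots, overwrite=False)           # replaces archive_links(all_links, ...)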

View file

@@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize
 from main import add, remove
 from config import OUTPUT_DIR
-from extractors import archive_links
+from extractors import archive_snapshots
 # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

View file

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.core'
 import uuid
+from pathlib import Path
 from django.db import models, transaction
 from django.utils.functional import cached_property
@@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..config import CONFIG
-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTORS = ["title", "wget"]
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),
@@ -89,6 +91,7 @@ class Snapshot(models.Model):
         title = self.title or '-'
         return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
     @classmethod
     def from_json(cls, info: dict):
         info = {k: v for k, v in info.items() if k in cls.keys}
@@ -133,8 +136,9 @@ class Snapshot(models.Model):
         return self.as_link().base_url
     @cached_property
-    def link_dir(self):
-        return self.as_link().link_dir
+    def snapshot_dir(self):
+        from ..config import CONFIG
+        return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
     @cached_property
     def archive_path(self):
@@ -158,6 +162,16 @@ class Snapshot(models.Model):
             return self.history['title'][-1].output.strip()
         return None
+    def _asdict(self):
+        return {
+            "id": str(self.id),
+            "url": self.url,
+            "timestamp": self.timestamp,
+            "title": self.title,
+            "added": self.added,
+            "updated": self.updated,
+        }
     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
@@ -168,6 +182,7 @@ class Snapshot(models.Model):
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
+        from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
         qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
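
The Snapshot model picks up two helpers here: snapshot_dir, which derives the output folder from ARCHIVE_DIR and the timestamp instead of round-tripping through as_link(), and _asdict(), a plain-dict serializer used by the JSON index further down. A minimal usage sketch, assuming Django is configured for an ArchiveBox data dir and the example URL is already in the index:

from core.models import Snapshot

snap = Snapshot.objects.get(url='https://example.com')
print(snap.snapshot_dir)   # <ARCHIVE_DIR>/<timestamp>, the replacement for Link.link_dir
print(snap._asdict())      # {'id': ..., 'url': ..., 'timestamp': ..., 'title': ..., 'added': ..., 'updated': ...}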

View file

@@ -4,19 +4,20 @@ import os
 from pathlib import Path
 from typing import Optional, List, Iterable, Union
-from datetime import datetime
-from django.db.models import QuerySet
-from ..index.schema import Link
-from ..index.sql import write_link_to_sql_index
+from datetime import datetime
+from django.db.models import QuerySet, Model
+from ..index.sql import write_snapshot_to_index
 from ..index import (
-    load_link_details,
-    write_link_details,
+    load_snapshot_details,
+    write_snapshot_details,
 )
 from ..util import enforce_types
 from ..logging_util import (
     log_archiving_started,
     log_archiving_paused,
     log_archiving_finished,
     log_link_archiving_started,
     log_link_archiving_finished,
@@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]):
     return list(methods)
 @enforce_types
-def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
+def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+    from core.models import ArchiveResult
-    # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
-    from core.models import Snapshot, ArchiveResult
-    try:
-        snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
-    except Snapshot.DoesNotExist:
-        snapshot = write_link_to_sql_index(link)
     ARCHIVE_METHODS = get_default_archive_methods()
@@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         if method[0] in methods
     ]
-    out_dir = out_dir or Path(link.link_dir)
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
     try:
         is_new = not Path(out_dir).exists()
         if is_new:
             os.makedirs(out_dir)
+            details = {"history": {}}
+            write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
+        else:
+            details = list(load_snapshot_details(snapshot))
-        link = load_link_details(link, out_dir=out_dir)
-        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
-        log_link_archiving_started(link, out_dir, is_new)
-        link = link.overwrite(updated=datetime.now())
+        #log_link_archiving_started(link, out_dir, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
         for method_name, should_run, method_function in ARCHIVE_METHODS:
             try:
-                if method_name not in link.history:
-                    link.history[method_name] = []
-                if should_run(link, out_dir) or overwrite:
+                if method_name not in details["history"]:
+                    details["history"][method_name] = []
+                if should_run(snapshot, out_dir) or overwrite:
                     log_archive_method_started(method_name)
-                    result = method_function(link=link, out_dir=out_dir)
+                    result = method_function(snapshot=snapshot, out_dir=out_dir)
-                    link.history[method_name].append(result)
+                    details["history"][method_name].append(result)
                     stats[result.status] += 1
                     log_archive_method_finished(result)
-                    write_search_index(link=link, texts=result.index_texts)
+                    write_search_index(snapshot=snapshot, texts=result.index_texts)
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                  output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
@@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             except Exception as e:
                 raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                     method_name,
-                    link.url,
+                    snapshot.url,
                 )) from e
         # print(' ', stats)
@@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         try:
             latest_title = link.history['title'][-1].output.strip()
             if latest_title and len(latest_title) >= len(link.title or ''):
-                link = link.overwrite(title=latest_title)
+                snapshot.title = latest_title
         except Exception:
             pass
-        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
+        write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
-        log_link_archiving_finished(link, link.link_dir, is_new, stats)
+        log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats)
     except KeyboardInterrupt:
         try:
-            write_link_details(link, out_dir=link.link_dir)
+            write_snapshot_details(snapshot, out_dir=link.link_dir)
         except:
             pass
         raise
@@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
         raise
-    return link
+    return snapshot
 @enforce_types
-def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
+def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet:
-    if type(all_links) is QuerySet:
-        num_links: int = all_links.count()
-        get_link = lambda x: x.as_link()
-        all_links = all_links.iterator()
-    else:
-        num_links: int = len(all_links)
-        get_link = lambda x: x
+    all_snapshots = list(all_snapshots)
+    num_snapshots: int = len(all_snapshots)
-    if num_links == 0:
+    if num_snapshots == 0:
         return []
-    log_archiving_started(num_links)
+    log_archiving_started(num_snapshots)
     idx: int = 0
     try:
-        for link in all_links:
+        for snapshot in all_snapshots:
             idx += 1
-            to_archive = get_link(link)
-            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
+            archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir))
     except KeyboardInterrupt:
-        log_archiving_paused(num_links, idx, link.timestamp)
+        log_archiving_paused(num_snapshots, idx, snapshot.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise
-    log_archiving_finished(num_links)
+    log_archiving_finished(num_snapshots)
-    return all_links
+    return all_snapshots
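
archive_snapshot() now receives a Snapshot row directly (no more URL lookup or write_link_to_sql_index fallback), tracks per-method history in a local details dict, and records each run as an ArchiveResult; archive_snapshots() simply materializes whatever iterable it is given and loops. A usage sketch under the same assumptions as above ('title' and 'wget' mirror the temporary EXTRACTORS list in core/models.py):

from core.models import Snapshot
from archivebox.extractors import archive_snapshot, archive_snapshots

archive_snapshot(Snapshot.objects.get(url='https://example.com'), methods=['title'])
archive_snapshots(Snapshot.objects.all(), overwrite=False, methods=['title', 'wget'])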

View file

@@ -5,7 +5,9 @@ from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from django.db.models import Model
+from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
 from ..util import (
     enforce_types,
     is_static_file,
@@ -61,12 +63,12 @@ class TitleParser(HTMLParser):
 @enforce_types
-def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool:
     # if link already has valid title, skip it
-    if link.title and not link.title.lower().startswith('http'):
+    if snapshot.title and not snapshot.title.lower().startswith('http'):
         return False
-    if is_static_file(link.url):
+    if is_static_file(snapshot.url):
         return False
     return SAVE_TITLE
@@ -77,7 +79,7 @@ def extract_title_with_regex(html):
     return output
 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
     from core.models import Snapshot
@@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        link.url,
+        snapshot.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = download_url(snapshot.url, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()
@@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
         # if title is better than the one in the db, update db with new title
         if isinstance(output, str) and output:
-            if not link.title or len(output) >= len(link.title):
-                Snapshot.objects.filter(url=link.url,
-                                        timestamp=link.timestamp)\
+            if not snapshot.title or len(output) >= len(snapshot.title):
+                Snapshot.objects.filter(url=snapshot.url,
+                                        timestamp=snapshot.timestamp)\
                                        .update(title=output)
+                snapshot.title = output
         else:
             raise ArchiveError('Unable to detect page title')
     except Exception as err:
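
The title extractor now reads everything it needs off the Snapshot. The skip logic is unchanged in substance; a standalone replica for clarity (save_title_enabled and is_static_file below stand in for the real SAVE_TITLE config flag and util helper):

def should_save_title(snapshot, save_title_enabled=True, is_static_file=lambda url: url.endswith('.pdf')):
    if snapshot.title and not snapshot.title.lower().startswith('http'):
        return False                 # already has a real title, no need to re-fetch
    if is_static_file(snapshot.url):
        return False                 # static files have no <title> to parse
    return save_title_enabled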

View file

@@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
-from django.db.models import QuerySet, Q
+from django.db.models import QuerySet, Q, Model
 from ..util import (
     scheme,
@@ -39,15 +39,15 @@ from ..logging_util import (
 from .schema import Link, ArchiveResult
 from .html import (
-    write_html_link_details,
+    write_html_snapshot_details,
 )
 from .json import (
-    parse_json_link_details,
-    write_json_link_details,
+    parse_json_snapshot_details,
+    write_json_snapshot_details,
 )
 from .sql import (
     write_sql_main_index,
-    write_sql_link_details,
+    write_sql_snapshot_details,
 )
 from ..search import search_backend_enabled, query_search_index
@@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index
 ### Link filtering and checking
 @enforce_types
-def merge_links(a: Link, b: Link) -> Link:
-    """deterministially merge two links, favoring longer field values over shorter,
+def merge_snapshots(a: Model, b: Model) -> Model:
+    """deterministially merge two snapshots, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
+    TODO: Check if this makes sense with the new setup
     """
+    return a
     assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})'
     # longest url wins (because a fuzzy url will always be shorter)
@@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link:
         key=lambda result: result.start_ts,
     )))
-    return Link(
+    return Snapshot(
         url=url,
         timestamp=timestamp,
         title=title,
         tags=tags,
-        sources=sources,
-        history=history,
+        #sources=sources,
+        #history=history,
     )
 @enforce_types
-def validate_links(links: Iterable[Link]) -> List[Link]:
+def validate_snapshots(snapshots: List[Model]) -> List[Model]:
     timer = TimedProgress(TIMEOUT * 4)
     try:
-        links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
-        links = sorted_links(links)      # deterministically sort the links based on timestamp, url
-        links = fix_duplicate_links(links)  # merge/dedupe duplicate timestamps & urls
+        snapshots = archivable_snapshots(snapshots)  # remove chrome://, about:, mailto: etc.
+        snapshots = sorted_snapshots(snapshots)      # deterministically sort the links based on timestamp, url
+        snapshots = fix_duplicate_snapshots(snapshots)  # merge/dedupe duplicate timestamps & urls
     finally:
         timer.end()
-    return list(links)
+    return list(snapshots)
 @enforce_types
-def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
+def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
-    for link in links:
+    for snapshot in snapshots:
         try:
-            urlparse(link.url)
+            urlparse(snapshot.url)
         except ValueError:
             continue
-        if scheme(link.url) not in ('http', 'https', 'ftp'):
+        if scheme(snapshot.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url):
             continue
-        yield link
+        yield snapshot
 @enforce_types
-def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
+    TODO: Review how to do this with the new snapshots refactor
     """
-    # from core.models import Snapshot
+    return sorted_snapshots
     unique_urls: OrderedDict[str, Link] = OrderedDict()
-    for link in sorted_links:
-        if link.url in unique_urls:
+    for snapshot in sorted_snapshots:
+        if snapshot.url in unique_urls:
             # merge with any other links that share the same url
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link
@@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
 @enforce_types
-def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
-    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
-    return sorted(links, key=sort_func, reverse=True)
+def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
+    sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url)
+    return sorted(snapshots, key=sort_func, reverse=True)
 @enforce_types
@@ -222,14 +224,14 @@ def timed_index_update(out_path: Path):
 @enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
+def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None:
     """Writes links to sqlite3 file for a given list of links"""
-    log_indexing_process_started(len(links))
+    log_indexing_process_started(len(snapshots))
     try:
         with timed_index_update(out_dir / SQL_INDEX_FILENAME):
-            write_sql_main_index(links, out_dir=out_dir)
+            write_sql_main_index(snapshots, out_dir=out_dir)
             os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
     except (KeyboardInterrupt, SystemExit):
@@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
 @enforce_types
 def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
-    """parse and load existing index with any new links from import_path merged in"""
+    """
+    Returns all of the snapshots currently in index
+    """
+    setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     try:
         return Snapshot.objects.all()
@@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
 @enforce_types
-def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
-    from ..parsers import parse_links
-    new_links: List[Link] = []
+def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]:
+    from ..parsers import parse_snapshots
+    new_links: List[Model] = []
     # parse and validate the import file
-    raw_links, parser_name = parse_links(source_path, root_url=root_url)
-    new_links = validate_links(raw_links)
+    raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url)
+    new_snapshots = validate_snapshots(raw_snapshots)
     if parser_name:
-        num_parsed = len(raw_links)
+        num_parsed = len(raw_snapshots)
         log_parsing_finished(num_parsed, parser_name)
-    return new_links
+    return new_snapshots
 @enforce_types
-def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
-    """
-    Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB.
-    """
-    unique_urls: OrderedDict[str, Link] = OrderedDict()
-    for link in links:
-        index_link = snapshots.filter(url=link.url)
-        if index_link:
-            link = merge_links(index_link[0].as_link(), link)
-        unique_urls[link.url] = link
-    return unique_urls.values()
-@enforce_types
-def dedupe_links(snapshots: QuerySet,
-                 new_links: List[Link]) -> List[Link]:
-    """
-    The validation of links happened at a different stage. This method will
-    focus on actual deduplication and timestamp fixing.
-    """
-    # merge existing links in out_dir and new links
-    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
-    new_links = [
-        link for link in new_links
-        if not snapshots.filter(url=link.url).exists()
-    ]
-    dedup_links_dict = {link.url: link for link in dedup_links}
-    # Replace links in new_links with the dedup version
-    for i in range(len(new_links)):
-        if new_links[i].url in dedup_links_dict.keys():
-            new_links[i] = dedup_links_dict[new_links[i].url]
-    log_deduping_finished(len(new_links))
-    return new_links
+def filter_new_urls(snapshots: QuerySet,
+                    new_snapshots: List) -> List:
+    """
+    Returns a list of Snapshots corresponding to the urls that were not present in the index
+    """
+    urls = {snapshot.url: snapshot for snapshot in new_snapshots}
+    filtered_snapshots = snapshots.filter(url__in=urls.keys())
+    for found_snapshot in filtered_snapshots:
+        urls.pop(found_snapshot.url)
+    log_deduping_finished(len(urls.keys()))
+    return list(urls.values())
 ### Link Details Index
 @enforce_types
-def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
-    out_dir = out_dir or link.link_dir
-    write_json_link_details(link, out_dir=out_dir)
-    write_html_link_details(link, out_dir=out_dir)
+def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
+    out_dir = out_dir or snapshot.snapshot_dir
+    write_json_snapshot_details(snapshot, out_dir=out_dir)
+    #write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too
     if not skip_sql_index:
-        write_sql_link_details(link)
+        write_sql_snapshot_details(snapshot)
 @enforce_types
-def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
+def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model:
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    out_dir = out_dir or link.link_dir
-    existing_link = parse_json_link_details(out_dir)
-    if existing_link:
-        return merge_links(existing_link, link)
-    return link
+    out_dir = out_dir or snapshot.snapshot_dir
+    existing_snapshot = parse_json_snapshot_details(out_dir)
+    if existing_snapshot:
+        return merge_snapshots(existing_snapshot, snapshot)
+    return snapshot
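
dedupe_links() and fix_duplicate_links_in_index() collapse into filter_new_urls(), which simply drops any incoming snapshot whose URL already exists in the index (no merging for now, since merge_snapshots() short-circuits with return a). A standalone replica of the logic, with a plain list of URLs standing in for the Snapshot queryset:

def filter_new_urls(existing_urls, new_snapshots):
    urls = {snapshot['url']: snapshot for snapshot in new_snapshots}   # last duplicate in the import wins
    for url in existing_urls:
        urls.pop(url, None)                                            # drop anything already indexed
    return list(urls.values())

print(filter_new_urls(
    ['https://example.com'],
    [{'url': 'https://example.com'}, {'url': 'https://example.org'}],
))
# -> [{'url': 'https://example.org'}]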

View file

@@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping
 from pathlib import Path
 from django.utils.html import format_html
+from django.db.models import Model
 from collections import defaultdict
 from .schema import Link
@@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 ### Link Details Index
 @enforce_types
-def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
-    out_dir = out_dir or link.link_dir
+def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
+    out_dir = out_dir or snapshot.snapshot_dir
     rendered_html = link_details_template(link)
     atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)

View file

@@ -7,6 +7,7 @@ from pathlib import Path
 from datetime import datetime
 from typing import List, Optional, Iterator, Any, Union
+from django.db.models import Model
 from .schema import Link
 from ..system import atomic_write
@@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
 ### Link Details Index
 @enforce_types
-def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
-    """write a json file with some info about the link"""
-    out_dir = out_dir or link.link_dir
+def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
+    """write a json file with some info about the snapshot"""
+    out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    atomic_write(str(path), link._asdict(extended=True))
+    print(snapshot._asdict())
+    atomic_write(str(path), snapshot._asdict())
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
     """load the json link index from a given directory"""
     existing_index = Path(out_dir) / JSON_INDEX_FILENAME
     if existing_index.exists():
@@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
             pass
     return None
+@enforce_types
+def load_snapshot_details(snapshot: Model, out_dir: Path):
+    """
+    Loads the detail from the local json index
+    """
+    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+    if existing_index.exists():
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            try:
+                return pyjson.load(f)
+            except pyjson.JSONDecodeError:
+                pass
+    return None
 @enforce_types
-def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
+def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""
     for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
            if (Path(entry.path) / 'index.json').exists():
                try:
-                   link = parse_json_link_details(entry.path)
+                   link = parse_json_snapshot_details(entry.path)
                except KeyError:
                    link = None
               if link:
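
Per-snapshot index.json files now hold the output of Snapshot._asdict(), and the new load_snapshot_details() in this module just reads that file back with the stdlib JSON parser (pyjson is this codebase's alias for it). A standalone replica of the round trip, using a hypothetical directory and plain json instead of atomic_write:

import json
from pathlib import Path

out_dir = Path('/tmp/archivebox-example-snapshot')     # hypothetical snapshot dir
out_dir.mkdir(parents=True, exist_ok=True)
details = {'id': '1234', 'url': 'https://example.com', 'timestamp': '1608752000.0',
           'title': None, 'added': None, 'updated': None}
(out_dir / 'index.json').write_text(json.dumps(details))
print(json.loads((out_dir / 'index.json').read_text())['url'])   # -> https://example.com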

View file

@@ -3,8 +3,9 @@ __package__ = 'archivebox.index'
 from io import StringIO
 from pathlib import Path
 from typing import List, Tuple, Iterator
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 from django.db import transaction
+from datetime import datetime
 from .schema import Link
 from ..util import enforce_types
@@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
     snapshots.delete()
 @enforce_types
-def write_link_to_sql_index(link: Link):
+def write_snapshot_to_index(snapshot: Model):
     from core.models import Snapshot
-    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-    tags = info.pop("tags")
-    if tags is None:
-        tags = []
     try:
-        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+        timestamp = Snapshot.objects.get(url=snapshot.url).timestamp
     except Snapshot.DoesNotExist:
-        while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
-            info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+        timestamp = snapshot.timestamp
+        if not timestamp:
+            timestamp = str(datetime.now().timestamp())
+    while Snapshot.objects.filter(timestamp=timestamp).exists():
+        print("the timestamp is: ", timestamp)
+        timestamp = str(float(timestamp) + 1.0)
-    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
-    snapshot.save_tags(tags)
+    snapshot.timestamp = timestamp
+    snapshot.save()
     return snapshot
@@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link):
 def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
     with transaction.atomic():
         for link in links:
-            write_link_to_sql_index(link)
+            write_snapshot_to_index(link)
 @enforce_types
-def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
+def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None:
     from core.models import Snapshot
     with transaction.atomic():
         try:
-            snap = Snapshot.objects.get(url=link.url)
+            snap = Snapshot.objects.get(url=snapshot.url)
         except Snapshot.DoesNotExist:
-            snap = write_link_to_sql_index(link)
-        snap.title = link.title
-        tag_set = (
-            set(tag.strip() for tag in (link.tags or '').split(','))
-        )
-        tag_list = list(tag_set) or []
+            snap = write_snapshot_to_sql_index(snapshot)
+        snap.title = snapshot.title
+        # TODO: If there are actual tags, this will break
+        #tag_set = (
+        #    set(tag.strip() for tag in (snapshot.tags.all() or '').split(','))
+        #)
+        #tag_list = list(tag_set) or []
         snap.save()
-        snap.save_tags(tag_list)
+        #snap.save_tags(tag_list)
+        return snap
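
write_snapshot_to_index() keeps the old timestamp-collision rule: reuse the timestamp of an existing row for the same URL, fall back to the current time, then bump by 1.0 until the value is unique (the stray print looks like debug output left in this work-in-progress commit). A standalone replica of that selection logic:

from datetime import datetime

def pick_timestamp(snapshot_timestamp, existing_timestamps, existing_for_url=None):
    # existing_timestamps stands in for Snapshot.objects.filter(timestamp=...).exists()
    timestamp = existing_for_url or snapshot_timestamp or str(datetime.now().timestamp())
    while timestamp in existing_timestamps:
        timestamp = str(float(timestamp) + 1.0)
    return timestamp

print(pick_timestamp('100.0', {'100.0', '101.0'}))   # -> '102.0'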

View file

@@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
-    parse_links_from_source,
-    dedupe_links,
+    get_empty_snapshot_queryset,
+    parse_snapshots_from_source,
+    filter_new_urls,
     write_main_index,
     snapshot_filter,
     get_indexed_folders,
@@ -44,11 +45,11 @@ from .index import (
     get_corrupted_folders,
     get_unrecognized_folders,
     fix_invalid_folder_locations,
-    write_link_details,
+    write_snapshot_details,
 )
 from .index.json import (
     parse_json_main_index,
-    parse_json_links_details,
+    parse_json_snapshot_details,
     generate_json_index_from_links,
 )
 from .index.sql import (
@@ -60,7 +61,7 @@ from .index.html import (
     generate_index_from_links,
 )
 from .index.csv import links_to_csv
-from .extractors import archive_links, archive_link, ignore_methods
+from .extractors import archive_snapshots, archive_snapshot, ignore_methods
 from .config import (
     stderr,
     hint,
@@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]],
         extractors: str="",
         out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
+    from core.models import Snapshot
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
@@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]],
     # Load list of links from the existing index
     check_data_folder(out_dir=out_dir)
     check_dependencies()
-    new_links: List[Link] = []
-    all_links = load_main_index(out_dir=out_dir)
+    new_snapshots: List[Snapshot] = []
+    all_snapshots = load_main_index(out_dir=out_dir)
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]],
         # save verbatim args to sources
         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
-    new_links += parse_links_from_source(write_ahead_log, root_url=None)
+    new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None)
     # If we're going one level deeper, download each link and look for more links
-    new_links_depth = []
-    if new_links and depth == 1:
-        log_crawl_started(new_links)
-        for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+    new_snapshots_depth = []
+    if new_snapshots and depth == 1:
+        log_crawl_started(new_snapshots)
+        for new_snapshot in new_snapshots:
+            # TODO: Check if we need to add domain to the Snapshot model
+            downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
+            new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
-    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    new_links = dedupe_links(all_links, imported_links)
+    imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
+    new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
-    write_main_index(links=new_links, out_dir=out_dir)
+    write_main_index(snapshots=new_snapshots, out_dir=out_dir)
     all_links = load_main_index(out_dir=out_dir)
     if index_only:
@@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]],
     if extractors:
         archive_kwargs["methods"] = extractors
     if update_all:
-        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+        archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
-        archive_links(imported_links, overwrite=True, **archive_kwargs)
+        archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs)
-    elif new_links:
-        archive_links(new_links, overwrite=False, **archive_kwargs)
+    elif new_snapshots:
+        archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs)
-    return all_links
+    return all_snapshots
 @enforce_types
 def remove(filter_str: Optional[str]=None,
@@ -711,7 +714,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         for link in all_links:
-            write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+            write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
         index_links(all_links, out_dir=out_dir)
         return all_links
@@ -733,7 +736,7 @@ def update(resume: Optional[float]=None,
     if extractors:
         archive_kwargs["methods"] = extractors
-    archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
+    archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs)
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)
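
The add() flow is now: parse_snapshots_from_source -> filter_new_urls -> write_main_index -> archive_snapshots, with unsaved Snapshot(url=...) instances as the intermediate value (note the depth-1 branch still calls the old parse_links_from_source name, one of the rough edges the commit message warns about). A usage sketch, assuming the data directory is already initialized:

from archivebox.main import add

add(['https://example.com', 'https://example.org'], depth=0, extractors='title,wget')
# index_only=True would stop after write_main_index() without running any extractor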

View file

@@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional
 from datetime import datetime
 from pathlib import Path
+from django.db.models import Model
 from ..system import atomic_write
 from ..config import (
     ANSI,
@@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
 @enforce_types
-def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
+def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]:
     """parse a list of URLs with their metadata from an
     RSS feed, bookmarks export, or text file
     """
@@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
-        links, parser = run_parser_functions(file, timer, root_url=root_url)
+        snapshots, parser = run_parser_functions(file, timer, root_url=root_url)
         timer.end()
         if parser is None:
             return [], 'Failed to parse'
-        return links, parser
+        return snapshots, parser
-def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
-    most_links: List[Link] = []
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]:
+    most_snapshots: List[Model] = []
     best_parser_name = None
     for parser_name, parser_func in PARSERS:
         try:
-            parsed_links = list(parser_func(to_parse, root_url=root_url))
-            if not parsed_links:
+            parsed_snapshots = list(parser_func(to_parse, root_url=root_url))
+            if not parsed_snapshots:
                 raise Exception('no links found')
             # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
-            if len(parsed_links) > len(most_links):
-                most_links = parsed_links
+            if len(parsed_snapshots) > len(most_snapshots):
+                most_snapshots = parsed_snapshots
                 best_parser_name = parser_name
         except Exception as err: # noqa
@@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
             # raise
             pass
     timer.end()
-    return most_links, best_parser_name
+    return most_snapshots, best_parser_name
 @enforce_types
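
run_parser_functions() keeps its parser-contest behavior, only the names change: every registered parser is tried against the input and the one that returns the most entries wins. A standalone replica with two toy parsers standing in for PARSERS:

def best_parse(text, parsers):
    most, best_name = [], None
    for name, parse in parsers:
        try:
            results = list(parse(text))
            if results and len(results) > len(most):
                most, best_name = results, name
        except Exception:
            pass                      # a failing parser just drops out of the contest
    return most, best_name

parsers = [('lines', lambda t: t.splitlines()), ('words', lambda t: t.split())]
print(best_parse('https://example.com https://example.org', parsers))
# -> (['https://example.com', 'https://example.org'], 'words')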

View file

@@ -31,6 +31,7 @@ class HrefParser(HTMLParser):
 @enforce_types
 def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
     """Parse Generic HTML for href tags and use only the url (support for title coming later)"""
+    from core.models import Snapshot
     html_file.seek(0)
     for line in html_file:
@@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
             url = urljoin(root_url, url)
             for archivable_url in re.findall(URL_REGEX, url):
-                yield Link(
+                yield Snapshot(
                     url=htmldecode(archivable_url),
                     timestamp=str(datetime.now().timestamp()),
                     title=None,
-                    tags=None,
-                    sources=[html_file.name],
+                    #tags=None,
+                    #sources=[html_file.name],
                 )

View file

@@ -18,6 +18,8 @@ from ..util import (
 @enforce_types
 def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""
+    # TODO: Check if we should add sources list to the database
+    from core.models import Snapshot
     text_file.seek(0)
     for line in text_file.readlines():
@@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
         # otherwise look for anything that looks like a URL in the line
         for url in re.findall(URL_REGEX, line):
-            yield Link(
+            yield Snapshot(
                 url=htmldecode(url),
                 timestamp=str(datetime.now().timestamp()),
                 title=None,
-                tags=None,
-                sources=[text_file.name],
+                #tags=None,
+                #sources=[text_file.name],
             )
         # look inside the URL for any sub-urls, e.g. for archive.org links
         # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
         # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
         for url in re.findall(URL_REGEX, line[1:]):
-            yield Link(
+            yield Snapshot(
                 url=htmldecode(url),
                 timestamp=str(datetime.now().timestamp()),
                 title=None,
-                tags=None,
-                sources=[text_file.name],
+                #tags=None,
+                #sources=[text_file.name],
            )

View file

@@ -2,7 +2,7 @@ from typing import List, Union
 from pathlib import Path
 from importlib import import_module
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
@@ -28,24 +28,22 @@ def import_backend():
     return backend
 @enforce_types
-def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
     if not indexing_enabled():
         return
     if not skip_text_index and texts:
         from core.models import Snapshot
-        snap = Snapshot.objects.filter(url=link.url).first()
         backend = import_backend()
-        if snap:
-            try:
-                backend.index(snapshot_id=str(snap.id), texts=texts)
-            except Exception as err:
-                stderr()
-                stderr(
-                    f'[X] The search backend threw an exception={err}:',
+        try:
+            backend.index(snapshot_id=str(snapshot.id), texts=texts)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
                 color='red',
             )
 @enforce_types
 def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
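
Since the caller already holds the Snapshot row, write_search_index() drops the Snapshot.objects.filter(url=...) lookup and indexes against snapshot.id directly. A usage sketch, assuming Django is configured and a search backend is enabled in the config (otherwise the call is a no-op):

from core.models import Snapshot
from archivebox.search import write_search_index

snap = Snapshot.objects.get(url='https://example.com')
write_search_index(snapshot=snap, texts=['extracted page text to index'])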