
refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command

This commit is contained in:
Cristian 2020-12-23 14:51:42 -05:00
parent 8e2270e21b
commit 8c4ae73d65
13 changed files with 246 additions and 233 deletions
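
For orientation, here is a summary of the main renames this commit introduces, collected from the hunks below. The dict is an editorial aid only, not code from the repository, and it lists a representative subset rather than every changed symbol.

# Old Link-based names on the left, new Snapshot-based names on the right.
RENAMES = {
    "archive_link":            "archive_snapshot",
    "archive_links":           "archive_snapshots",
    "write_link_details":      "write_snapshot_details",
    "load_link_details":       "load_snapshot_details",
    "write_link_to_sql_index": "write_snapshot_to_index",
    "write_sql_link_details":  "write_sql_snapshot_details",
    "parse_links_from_source": "parse_snapshots_from_source",
    "dedupe_links":            "filter_new_urls",
    "Link.link_dir":           "Snapshot.snapshot_dir",
}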

View file

@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR
from extractors import archive_links
from extractors import archive_snapshots
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

View file

@ -1,6 +1,7 @@
__package__ = 'archivebox.core'
import uuid
from pathlib import Path
from django.db import models, transaction
from django.utils.functional import cached_property
@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField
from ..util import parse_date
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
from ..config import CONFIG
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
EXTRACTORS = ["title", "wget"]
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
@ -89,6 +91,7 @@ class Snapshot(models.Model):
title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@classmethod
def from_json(cls, info: dict):
info = {k: v for k, v in info.items() if k in cls.keys}
@ -133,8 +136,9 @@ class Snapshot(models.Model):
return self.as_link().base_url
@cached_property
def link_dir(self):
return self.as_link().link_dir
def snapshot_dir(self):
from ..config import CONFIG
return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
@cached_property
def archive_path(self):
@ -158,6 +162,16 @@ class Snapshot(models.Model):
return self.history['title'][-1].output.strip()
return None
def _asdict(self):
return {
"id": str(self.id),
"url": self.url,
"timestamp": self.timestamp,
"title": self.title,
"added": self.added,
"updated": self.updated,
}
def save_tags(self, tags=()):
tags_id = []
for tag in tags:
@ -168,6 +182,7 @@ class Snapshot(models.Model):
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
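
A standalone sketch (not ArchiveBox code) of what the two additions above provide: snapshot_dir derives the archive folder from the snapshot's timestamp via CONFIG['ARCHIVE_DIR'], and _asdict() is the plain-dict form that later gets written to the per-snapshot JSON index. The concrete values are made up for illustration.

from pathlib import Path

ARCHIVE_DIR = "/data/archive"        # stands in for CONFIG['ARCHIVE_DIR']
timestamp = "1608752502.0"           # timestamps double as directory names

snapshot_dir = str(Path(ARCHIVE_DIR) / timestamp)   # -> "/data/archive/1608752502.0"
record = {
    "id": "<uuid>",                  # str(self.id)
    "url": "https://example.com",
    "timestamp": timestamp,
    "title": "Example Domain",
    "added": "2020-12-23T14:51:42",  # datetime fields; JSON serialization happens elsewhere
    "updated": None,
}
print(snapshot_dir, record["url"])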

View file

@ -4,19 +4,20 @@ import os
from pathlib import Path
from typing import Optional, List, Iterable, Union
from datetime import datetime
from django.db.models import QuerySet
from ..index.schema import Link
from ..index.sql import write_link_to_sql_index
from datetime import datetime
from django.db.models import QuerySet, Model
from ..index.sql import write_snapshot_to_index
from ..index import (
load_link_details,
write_link_details,
load_snapshot_details,
write_snapshot_details,
)
from ..util import enforce_types
from ..logging_util import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
log_link_archiving_started,
log_link_archiving_finished,
@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]):
return list(methods)
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
# TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
from core.models import Snapshot, ArchiveResult
try:
snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
except Snapshot.DoesNotExist:
snapshot = write_link_to_sql_index(link)
from core.models import ArchiveResult
ARCHIVE_METHODS = get_default_archive_methods()
@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
if method[0] in methods
]
out_dir = out_dir or Path(link.link_dir)
out_dir = out_dir or Path(snapshot.snapshot_dir)
try:
is_new = not Path(out_dir).exists()
if is_new:
os.makedirs(out_dir)
details = {"history": {}}
write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
else:
details = list(load_snapshot_details(snapshot))
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_started(link, out_dir, is_new)
link = link.overwrite(updated=datetime.now())
#log_link_archiving_started(link, out_dir, is_new)
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
for method_name, should_run, method_function in ARCHIVE_METHODS:
try:
if method_name not in link.history:
link.history[method_name] = []
if method_name not in details["history"]:
details["history"][method_name] = []
if should_run(link, out_dir) or overwrite:
if should_run(snapshot, out_dir) or overwrite:
log_archive_method_started(method_name)
result = method_function(link=link, out_dir=out_dir)
result = method_function(snapshot=snapshot, out_dir=out_dir)
link.history[method_name].append(result)
details["history"][method_name].append(result)
stats[result.status] += 1
log_archive_method_finished(result)
write_search_index(link=link, texts=result.index_texts)
write_search_index(snapshot=snapshot, texts=result.index_texts)
ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
except Exception as e:
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
snapshot.url,
)) from e
# print(' ', stats)
@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
try:
latest_title = link.history['title'][-1].output.strip()
if latest_title and len(latest_title) >= len(link.title or ''):
link = link.overwrite(title=latest_title)
snapshot.title = latest_title
except Exception:
pass
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
log_link_archiving_finished(link, link.link_dir, is_new, stats)
log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats)
except KeyboardInterrupt:
try:
write_link_details(link, out_dir=link.link_dir)
write_snapshot_details(snapshot, out_dir=link.link_dir)
except:
pass
raise
@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link
return snapshot
@enforce_types
def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet:
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)
get_link = lambda x: x
all_snapshots = list(all_snapshots)
num_snapshots: int = len(all_snapshots)
if num_links == 0:
if num_snapshots == 0:
return []
log_archiving_started(num_links)
log_archiving_started(num_snapshots)
idx: int = 0
try:
for link in all_links:
for snapshot in all_snapshots:
idx += 1
to_archive = get_link(link)
archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir))
except KeyboardInterrupt:
log_archiving_paused(num_links, idx, link.timestamp)
log_archiving_paused(num_snapshots, idx, snapshot.timestamp)
raise SystemExit(0)
except BaseException:
print()
raise
log_archiving_finished(num_links)
return all_links
log_archiving_finished(num_snapshots)
return all_snapshots
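
A usage sketch for the renamed entry points above, assuming ArchiveBox's Django setup has already run so the ORM is available; error handling is omitted and the URL is a placeholder.

from pathlib import Path
from core.models import Snapshot
from archivebox.extractors import archive_snapshot, archive_snapshots

# Re-run just the 'title' and 'wget' extractors for one snapshot, writing into
# its snapshot_dir (formerly link.link_dir):
snap = Snapshot.objects.get(url="https://example.com")
archive_snapshot(snap, overwrite=True, methods=["title", "wget"],
                 out_dir=Path(snap.snapshot_dir))

# Or walk everything currently in the index with the default methods:
archive_snapshots(Snapshot.objects.all())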

View file

@ -5,7 +5,9 @@ from html.parser import HTMLParser
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from django.db.models import Model
from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
is_static_file,
@ -61,12 +63,12 @@ class TitleParser(HTMLParser):
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
if snapshot.title and not snapshot.title.lower().startswith('http'):
return False
if is_static_file(link.url):
if is_static_file(snapshot.url):
return False
return SAVE_TITLE
@ -77,7 +79,7 @@ def extract_title_with_regex(html):
return output
@enforce_types
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
from core.models import Snapshot
@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
link.url,
snapshot.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
html = download_url(link.url, timeout=timeout)
html = download_url(snapshot.url, timeout=timeout)
try:
# try using relatively strict html parser first
parser = TitleParser()
@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
# if title is better than the one in the db, update db with new title
if isinstance(output, str) and output:
if not link.title or len(output) >= len(link.title):
Snapshot.objects.filter(url=link.url,
timestamp=link.timestamp)\
if not snapshot.title or len(output) >= len(snapshot.title):
Snapshot.objects.filter(url=snapshot.url,
timestamp=snapshot.timestamp)\
.update(title=output)
snapshot.title = output
else:
raise ArchiveError('Unable to detect page title')
except Exception as err:
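
A standalone sketch of the title bookkeeping shown above in should_save_title and save_title: only fetch a title when the stored one is missing or still looks like a bare URL, and only replace it when the newly extracted title is at least as long.

def needs_title(stored_title):
    return not stored_title or stored_title.lower().startswith('http')

def pick_title(stored_title, extracted_title):
    if extracted_title and (not stored_title or len(extracted_title) >= len(stored_title)):
        return extracted_title
    return stored_title

assert needs_title(None) and needs_title("https://example.com")
assert pick_title("Example", "Example Domain") == "Example Domain"
assert pick_title("Example Domain, full title", "Example") == "Example Domain, full title"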

View file

@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from urllib.parse import urlparse
from django.db.models import QuerySet, Q
from django.db.models import QuerySet, Q, Model
from ..util import (
scheme,
@ -39,15 +39,15 @@ from ..logging_util import (
from .schema import Link, ArchiveResult
from .html import (
write_html_link_details,
write_html_snapshot_details,
)
from .json import (
parse_json_link_details,
write_json_link_details,
parse_json_snapshot_details,
write_json_snapshot_details,
)
from .sql import (
write_sql_main_index,
write_sql_link_details,
write_sql_snapshot_details,
)
from ..search import search_backend_enabled, query_search_index
@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index
### Link filtering and checking
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
"""deterministially merge two links, favoring longer field values over shorter,
def merge_snapshots(a: Model, b: Model) -> Model:
"""deterministially merge two snapshots, favoring longer field values over shorter,
and "cleaner" values over worse ones.
TODO: Check if this makes sense with the new setup
"""
return a
assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})'
# longest url wins (because a fuzzy url will always be shorter)
@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link:
key=lambda result: result.start_ts,
)))
return Link(
return Snapshot(
url=url,
timestamp=timestamp,
title=title,
tags=tags,
sources=sources,
history=history,
#sources=sources,
#history=history,
)
@enforce_types
def validate_links(links: Iterable[Link]) -> List[Link]:
def validate_snapshots(snapshots: List[Model]) -> List[Model]:
timer = TimedProgress(TIMEOUT * 4)
try:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timestamp, url
links = fix_duplicate_links(links) # merge/dedupe duplicate timestamps & urls
snapshots = archivable_snapshots(snapshots) # remove chrome://, about:, mailto: etc.
snapshots = sorted_snapshots(snapshots) # deterministically sort the links based on timestamp, url
snapshots = fix_duplicate_snapshots(snapshots) # merge/dedupe duplicate timestamps & urls
finally:
timer.end()
return list(links)
return list(snapshots)
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
for snapshot in snapshots:
try:
urlparse(link.url)
urlparse(snapshot.url)
except ValueError:
continue
if scheme(link.url) not in ('http', 'https', 'ftp'):
if scheme(snapshot.url) not in ('http', 'https', 'ftp'):
continue
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url):
continue
yield link
yield snapshot
@enforce_types
def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
TODO: Review how to do this with the new snapshots refactor
"""
# from core.models import Snapshot
return sorted_snapshots
unique_urls: OrderedDict[str, Link] = OrderedDict()
for link in sorted_links:
if link.url in unique_urls:
for snapshot in sorted_snapshots:
if snapshot.url in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[link.url], link)
unique_urls[link.url] = link
@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url)
return sorted(snapshots, key=sort_func, reverse=True)
@enforce_types
@ -222,14 +224,14 @@ def timed_index_update(out_path: Path):
@enforce_types
def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None:
"""Writes links to sqlite3 file for a given list of links"""
log_indexing_process_started(len(links))
log_indexing_process_started(len(snapshots))
try:
with timed_index_update(out_dir / SQL_INDEX_FILENAME):
write_sql_main_index(links, out_dir=out_dir)
write_sql_main_index(snapshots, out_dir=out_dir)
os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
except (KeyboardInterrupt, SystemExit):
@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
@enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
"""
Returns all of the snapshots currently in index
"""
setup_django(out_dir, check_db=True)
from core.models import Snapshot
try:
return Snapshot.objects.all()
@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
@enforce_types
def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]:
from ..parsers import parse_links
from ..parsers import parse_snapshots
new_links: List[Link] = []
new_links: List[Model] = []
# parse and validate the import file
raw_links, parser_name = parse_links(source_path, root_url=root_url)
new_links = validate_links(raw_links)
raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url)
new_snapshots = validate_snapshots(raw_snapshots)
if parser_name:
num_parsed = len(raw_links)
num_parsed = len(raw_snapshots)
log_parsing_finished(num_parsed, parser_name)
return new_links
return new_snapshots
@enforce_types
def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
def filter_new_urls(snapshots: QuerySet,
new_snapshots: List) -> List:
"""
Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB.
Returns a list of Snapshots corresponding to the urls that were not present in the index
"""
unique_urls: OrderedDict[str, Link] = OrderedDict()
urls = {snapshot.url: snapshot for snapshot in new_snapshots}
filtered_snapshots = snapshots.filter(url__in=urls.keys())
for link in links:
index_link = snapshots.filter(url=link.url)
if index_link:
link = merge_links(index_link[0].as_link(), link)
unique_urls[link.url] = link
return unique_urls.values()
@enforce_types
def dedupe_links(snapshots: QuerySet,
new_links: List[Link]) -> List[Link]:
"""
The validation of links happened at a different stage. This method will
focus on actual deduplication and timestamp fixing.
"""
for found_snapshot in filtered_snapshots:
urls.pop(found_snapshot.url)
# merge existing links in out_dir and new links
dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
log_deduping_finished(len(urls.keys()))
new_links = [
link for link in new_links
if not snapshots.filter(url=link.url).exists()
]
dedup_links_dict = {link.url: link for link in dedup_links}
# Replace links in new_links with the dedup version
for i in range(len(new_links)):
if new_links[i].url in dedup_links_dict.keys():
new_links[i] = dedup_links_dict[new_links[i].url]
log_deduping_finished(len(new_links))
return new_links
return list(urls.values())
### Link Details Index
@enforce_types
def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
out_dir = out_dir or link.link_dir
def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
out_dir = out_dir or snapshot.snapshot_dir
write_json_link_details(link, out_dir=out_dir)
write_html_link_details(link, out_dir=out_dir)
write_json_snapshot_details(snapshot, out_dir=out_dir)
#write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too
if not skip_sql_index:
write_sql_link_details(link)
write_sql_snapshot_details(snapshot)
@enforce_types
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model:
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
out_dir = out_dir or link.link_dir
out_dir = out_dir or snapshot.snapshot_dir
existing_link = parse_json_link_details(out_dir)
if existing_link:
return merge_links(existing_link, link)
existing_snapshot = parse_json_snapshot_details(out_dir)
if existing_snapshot:
return merge_snapshots(existing_snapshot, snapshot)
return link
return snapshot
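
A standalone re-implementation sketch of the filter_new_urls logic above, using plain dicts instead of a Django QuerySet, to make the dedupe rule explicit: keep only incoming snapshots whose URL is not already present in the index, with the last duplicate of a URL winning.

def filter_new_urls_sketch(indexed_urls, new_snapshots):
    by_url = {s["url"]: s for s in new_snapshots}   # later duplicates overwrite earlier ones
    for url in indexed_urls:
        by_url.pop(url, None)                       # drop anything already in the index
    return list(by_url.values())

existing = {"https://example.com"}
incoming = [{"url": "https://example.com"}, {"url": "https://example.org"}]
assert filter_new_urls_sketch(existing, incoming) == [{"url": "https://example.org"}]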

View file

@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping
from pathlib import Path
from django.utils.html import format_html
from django.db.models import Model
from collections import defaultdict
from .schema import Link
@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
### Link Details Index
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or snapshot.snapshot_dir
rendered_html = link_details_template(link)
atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)

View file

@ -7,6 +7,7 @@ from pathlib import Path
from datetime import datetime
from typing import List, Optional, Iterator, Any, Union
from django.db.models import Model
from .schema import Link
from ..system import atomic_write
@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
### Link Details Index
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the snapshot"""
out_dir = out_dir or link.link_dir
out_dir = out_dir or snapshot.snapshot_dir
path = Path(out_dir) / JSON_INDEX_FILENAME
atomic_write(str(path), link._asdict(extended=True))
print(snapshot._asdict())
atomic_write(str(path), snapshot._asdict())
@enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
"""load the json link index from a given directory"""
existing_index = Path(out_dir) / JSON_INDEX_FILENAME
if existing_index.exists():
@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
pass
return None
@enforce_types
def load_snapshot_details(snapshot: Model, out_dir: Path):
"""
Loads the detail from the local json index
"""
existing_index = Path(out_dir) / JSON_INDEX_FILENAME
if existing_index.exists():
with open(existing_index, 'r', encoding='utf-8') as f:
try:
return pyjson.load(f)
except pyjson.JSONDecodeError:
pass
return None
@enforce_types
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True):
if (Path(entry.path) / 'index.json').exists():
try:
link = parse_json_link_details(entry.path)
link = parse_json_snapshot_details(entry.path)
except KeyError:
link = None
if link:

View file

@ -3,8 +3,9 @@ __package__ = 'archivebox.index'
from io import StringIO
from pathlib import Path
from typing import List, Tuple, Iterator
from django.db.models import QuerySet
from django.db.models import QuerySet, Model
from django.db import transaction
from datetime import datetime
from .schema import Link
from ..util import enforce_types
@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
snapshots.delete()
@enforce_types
def write_link_to_sql_index(link: Link):
def write_snapshot_to_index(snapshot: Model):
from core.models import Snapshot
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
tags = []
try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
timestamp = Snapshot.objects.get(url=snapshot.url).timestamp
except Snapshot.DoesNotExist:
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
timestamp = snapshot.timestamp
if not timestamp:
timestamp = str(datetime.now().timestamp())
while Snapshot.objects.filter(timestamp=timestamp).exists():
print("the timestamp is: ", timestamp)
timestamp = str(float(timestamp) + 1.0)
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
snapshot.timestamp = timestamp
snapshot.save()
return snapshot
@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link):
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
with transaction.atomic():
for link in links:
write_link_to_sql_index(link)
write_snapshot_to_index(link)
@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None:
from core.models import Snapshot
with transaction.atomic():
try:
snap = Snapshot.objects.get(url=link.url)
snap = Snapshot.objects.get(url=snapshot.url)
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
snap = write_snapshot_to_sql_index(snapshot)
snap.title = snapshot.title
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
# TODO: If there are actual tags, this will break
#tag_set = (
# set(tag.strip() for tag in (snapshot.tags.all() or '').split(','))
#)
#tag_list = list(tag_set) or []
snap.save()
snap.save_tags(tag_list)
#snap.save_tags(tag_list)
return snap
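
A standalone sketch of the timestamp handling in write_snapshot_to_index above: reuse the timestamp of an existing row for a known URL, otherwise start from the snapshot's own timestamp (or now()), and bump by one second until it is unique, since the timestamp doubles as the archive directory name.

from datetime import datetime

def assign_timestamp(url, existing_by_url, taken_timestamps, proposed=None):
    if url in existing_by_url:
        return existing_by_url[url]                 # keep the indexed row's timestamp
    timestamp = proposed or str(datetime.now().timestamp())
    while timestamp in taken_timestamps:            # stands in for the .exists() query
        timestamp = str(float(timestamp) + 1.0)
    return timestamp

taken = {"1608750000.0"}
assert assign_timestamp("https://example.org", {}, taken, proposed="1608750000.0") == "1608750001.0"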

View file

@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
load_main_index,
parse_links_from_source,
dedupe_links,
get_empty_snapshot_queryset,
parse_snapshots_from_source,
filter_new_urls,
write_main_index,
snapshot_filter,
get_indexed_folders,
@ -44,11 +45,11 @@ from .index import (
get_corrupted_folders,
get_unrecognized_folders,
fix_invalid_folder_locations,
write_link_details,
write_snapshot_details,
)
from .index.json import (
parse_json_main_index,
parse_json_links_details,
parse_json_snapshot_details,
generate_json_index_from_links,
)
from .index.sql import (
@ -60,7 +61,7 @@ from .index.html import (
generate_index_from_links,
)
from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods
from .extractors import archive_snapshots, archive_snapshot, ignore_methods
from .config import (
stderr,
hint,
@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]],
extractors: str="",
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
from core.models import Snapshot
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]],
# Load list of links from the existing index
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = []
all_links = load_main_index(out_dir=out_dir)
new_snapshots: List[Snapshot] = []
all_snapshots = load_main_index(out_dir=out_dir)
log_importing_started(urls=urls, depth=depth, index_only=index_only)
if isinstance(urls, str):
@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]],
# save verbatim args to sources
write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
new_links += parse_links_from_source(write_ahead_log, root_url=None)
new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None)
# If we're going one level deeper, download each link and look for more links
new_links_depth = []
if new_links and depth == 1:
log_crawl_started(new_links)
for new_link in new_links:
downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
new_snapshots_depth = []
if new_snapshots and depth == 1:
log_crawl_started(new_snapshots)
for new_snapshot in new_snapshots:
# TODO: Check if we need to add domain to the Snapshot model
downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
new_links = dedupe_links(all_links, imported_links)
imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
write_main_index(links=new_links, out_dir=out_dir)
write_main_index(snapshots=new_snapshots, out_dir=out_dir)
all_links = load_main_index(out_dir=out_dir)
if index_only:
@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]],
if extractors:
archive_kwargs["methods"] = extractors
if update_all:
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs)
elif overwrite:
archive_links(imported_links, overwrite=True, **archive_kwargs)
elif new_links:
archive_links(new_links, overwrite=False, **archive_kwargs)
archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs)
elif new_snapshots:
archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs)
return all_links
return all_snapshots
@enforce_types
def remove(filter_str: Optional[str]=None,
@ -711,7 +714,7 @@ def update(resume: Optional[float]=None,
if index_only:
for link in all_links:
write_link_details(link, out_dir=out_dir, skip_sql_index=True)
write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
index_links(all_links, out_dir=out_dir)
return all_links
@ -733,7 +736,7 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources
all_links = load_main_index(out_dir=out_dir)
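
A usage sketch for the refactored add(), assuming an initialized ArchiveBox data directory (the path below is a placeholder). Note that it now returns the Snapshot QuerySet from load_main_index() rather than a list of Link objects.

from pathlib import Path
from archivebox.main import add

all_snapshots = add(
    urls=["https://example.com", "https://example.org"],
    depth=0,
    out_dir=Path("/path/to/archivebox/data"),
)
print(all_snapshots.count(), "snapshots in the index")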

View file

@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional
from datetime import datetime
from pathlib import Path
from django.db.models import Model
from ..system import atomic_write
from ..config import (
ANSI,
@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]:
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
links, parser = run_parser_functions(file, timer, root_url=root_url)
snapshots, parser = run_parser_functions(file, timer, root_url=root_url)
timer.end()
if parser is None:
return [], 'Failed to parse'
return links, parser
return snapshots, parser
def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
most_links: List[Link] = []
def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]:
most_snapshots: List[Model] = []
best_parser_name = None
for parser_name, parser_func in PARSERS:
try:
parsed_links = list(parser_func(to_parse, root_url=root_url))
if not parsed_links:
parsed_snapshots = list(parser_func(to_parse, root_url=root_url))
if not parsed_snapshots:
raise Exception('no links found')
# print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
if len(parsed_links) > len(most_links):
most_links = parsed_links
if len(parsed_snapshots) > len(most_snapshots):
most_snapshots = parsed_snapshots
best_parser_name = parser_name
except Exception as err: # noqa
@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
# raise
pass
timer.end()
return most_links, best_parser_name
return most_snapshots, best_parser_name
@enforce_types
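
A standalone sketch of the selection rule in run_parser_functions above: try every registered parser on the same input and keep the one that yields the most entries.

def best_parse(parsers, text):
    best_name, best_items = None, []
    for name, parse in parsers:
        try:
            items = list(parse(text))
        except Exception:
            continue                                  # a failing parser is simply skipped
        if len(items) > len(best_items):
            best_name, best_items = name, items
    return best_items, best_name

items, name = best_parse(
    [("lines", lambda t: t.splitlines()), ("words", lambda t: t.split())],
    "https://example.com second-token",
)
assert name == "words" and len(items) == 2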

View file

@ -31,6 +31,7 @@ class HrefParser(HTMLParser):
@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
"""Parse Generic HTML for href tags and use only the url (support for title coming later)"""
from core.models import Snapshot
html_file.seek(0)
for line in html_file:
@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
url = urljoin(root_url, url)
for archivable_url in re.findall(URL_REGEX, url):
yield Link(
yield Snapshot(
url=htmldecode(archivable_url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[html_file.name],
#tags=None,
#sources=[html_file.name],
)

View file

@ -18,6 +18,8 @@ from ..util import (
@enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse raw links from each line in a text file"""
# TODO: Check if we should add sources list to the database
from core.models import Snapshot
text_file.seek(0)
for line in text_file.readlines():
@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
# otherwise look for anything that looks like a URL in the line
for url in re.findall(URL_REGEX, line):
yield Link(
yield Snapshot(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
#tags=None,
#sources=[text_file.name],
)
# look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
for url in re.findall(URL_REGEX, line[1:]):
yield Link(
yield Snapshot(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
#tags=None,
#sources=[text_file.name],
)

View file

@ -2,7 +2,7 @@ from typing import List, Union
from pathlib import Path
from importlib import import_module
from django.db.models import QuerySet
from django.db.models import QuerySet, Model
from archivebox.index.schema import Link
from archivebox.util import enforce_types
@ -28,24 +28,22 @@ def import_backend():
return backend
@enforce_types
def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
if not indexing_enabled():
return
if not skip_text_index and texts:
from core.models import Snapshot
snap = Snapshot.objects.filter(url=link.url).first()
backend = import_backend()
if snap:
try:
backend.index(snapshot_id=str(snap.id), texts=texts)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
try:
backend.index(snapshot_id=str(snapshot.id), texts=texts)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)
)
@enforce_types
def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: