/index.html
def get(self, request, path):
- # missing trailing slash -> redirect to index
- if '/' not in path:
- return redirect(f'{path}/index.html')
-
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
@@ -55,46 +52,163 @@ class SnapshotView(View):
except (IndexError, ValueError):
slug, archivefile = path.split('/', 1)[0], 'index.html'
- all_pages = list(Snapshot.objects.all())
-
# slug is a timestamp
- by_ts = {page.timestamp: page for page in all_pages}
- try:
- # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
- response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
- response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
- return response
- except KeyError:
- pass
+ if slug.replace('.','').isdigit():
- # slug is a hash
- by_hash = {page.url_hash: page for page in all_pages}
- try:
- timestamp = by_hash[slug].timestamp
- return redirect(f'/archive/{timestamp}/{archivefile}')
- except KeyError:
- pass
+ # missing trailing slash -> redirect to index
+ if '/' not in path:
+ return redirect(f'{path}/index.html')
+ try:
+ try:
+ snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
+ response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+ response["Link"] = f'<{snapshot.url}>; rel="canonical"'
+ return response
+ except Snapshot.DoesNotExist:
+ if Snapshot.objects.filter(timestamp__startswith=slug).exists():
+ raise Snapshot.MultipleObjectsReturned
+ else:
+ raise
+ except Snapshot.DoesNotExist:
+ # Snapshot does not exist
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/><br/>'
+ 'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
+ 'You can <a href="/add/">add a new Snapshot</a>, or return to the <a href="/">Main Index</a>'
+ '</center>'
+ ),
+ slug,
+ path,
+ ),
+ content_type="text/html",
+ status=404,
+ )
+ except Snapshot.MultipleObjectsReturned:
+ snapshot_hrefs = mark_safe('<br/>').join(
+ format_html(
+ '{} <a href="/archive/{}/index.html"><code>{}</code></a> {} {}',
+ snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+ snap.timestamp,
+ snap.timestamp,
+ snap.url,
+ snap.title or '',
+ )
+ for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
+ )
+ return HttpResponse(
+ format_html(
+ (
+ 'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/>'
+ ),
+ slug,
+ ) + snapshot_hrefs + format_html(
+ (
+ '<br/><br/>'
+ 'Choose a Snapshot to proceed or go back to the <a href="/">Main Index</a>'
+ )
+ ),
+ content_type="text/html",
+ status=404,
+ )
+ except Http404:
+ # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/><br/>'
+ f'Snapshot <code>[{snapshot.timestamp}]</code><br/> exists in DB, but resource <code>{snapshot.timestamp}/'
+ '{}'
+ '</code><br/> does not exist in snapshot dir yet.<br/><br/>'
+ 'Maybe this resource type is not available for this Snapshot,<br/> or the archiving process has not completed yet?<br/>'
+ f'<pre># run this cmd to finish archiving this Snapshot\narchivebox update -t timestamp {snapshot.timestamp}</pre>'
+ '<br/>'
+ '<br/> Next steps:<br/>'
+ f'- list all the <a href="/archive/{snapshot.timestamp}/">Snapshot files .*</a><br/>'
+ f'- view the <a href="/archive/{snapshot.timestamp}/index.html">Snapshot ./index.html</a><br/>'
+ f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/">Snapshot admin</a> to edit<br/>'
+ '- go to the Snapshot actions to re-archive<br/>'
+ '- or return to <a href="/">the main index...</a>'
+ '</center>'
+ ),
+ archivefile,
+ ),
+ content_type="text/html",
+ status=404,
+ )
# slug is a URL
- by_url = {page.base_url: page for page in all_pages}
try:
- # TODO: add multiple snapshot support by showing index of all snapshots
- # for given url instead of redirecting to timestamp index
- timestamp = by_url[base_url(path)].timestamp
- return redirect(f'/archive/{timestamp}/index.html')
- except KeyError:
- pass
-
- return HttpResponse(
- 'No archived link matches the given timestamp or hash.',
- content_type="text/plain",
- status=404,
- )
+ try:
+ # try exact match on full url first
+ snapshot = Snapshot.objects.get(
+ Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
+ )
+ except Snapshot.DoesNotExist:
+ # fall back to match on exact base_url
+ try:
+ snapshot = Snapshot.objects.get(
+ Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
+ )
+ except Snapshot.DoesNotExist:
+ # fall back to matching base_url as prefix
+ snapshot = Snapshot.objects.get(
+ Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+ )
+ return redirect(f'/archive/{snapshot.timestamp}/index.html')
+ except Snapshot.DoesNotExist:
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/><br/>'
+ 'No Snapshots match the given url: <code>{}</code><br/><br/>'
+ 'Return to the <a href="/">Main Index</a>, or:<br/>'
+ '+ <a href="/add/?url={}">Add a new Snapshot for <code>{}</code></a><br/>'
+ '</center>'
+ ),
+ base_url(path),
+ path if '://' in path else f'https://{path}',
+ path,
+ ),
+ content_type="text/html",
+ status=404,
+ )
+ except Snapshot.MultipleObjectsReturned:
+ snapshot_hrefs = mark_safe('<br/>').join(
+ format_html(
+ '{} <a href="/archive/{}/index.html"><code>{}</code></a> {} {}',
+ snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+ snap.timestamp,
+ snap.timestamp,
+ snap.url,
+ snap.title or '',
+ )
+ for snap in Snapshot.objects.filter(
+ Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+ ).only('url', 'timestamp', 'title', 'added').order_by('-added')
+ )
+ return HttpResponse(
+ format_html(
+ (
+ 'Multiple Snapshots match the given URL <code>{}</code><br/>'
+ ),
+ base_url(path),
+ ) + snapshot_hrefs + format_html(
+ (
+ '<br/><br/>'
+ 'Choose a Snapshot to proceed or go back to the <a href="/">Main Index</a>'
+ )
+ ),
+ content_type="text/html",
+ status=404,
+ )
+
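The resolution order above is easiest to read as a small decision: a numeric-looking slug is looked up as a timestamp (or UUID prefix), anything else is treated as a URL, first by exact match, then by exact base_url, then by base_url prefix. A minimal standalone sketch of that first branch, with an illustrative helper name that is not part of this change:

```python
def classify_archive_slug(path: str) -> str:
    # mirrors the branch above: digits/dots -> timestamp or UUID prefix, else URL
    slug = path.split('/', 1)[0]
    if slug.replace('.', '').isdigit():
        return 'timestamp-or-uuid'
    return 'url'

assert classify_archive_slug('1611234567.0/index.html') == 'timestamp-or-uuid'
assert classify_archive_slug('example.com/page/index.html') == 'url'
```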
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
- paginate_by = 100
+ paginate_by = SNAPSHOTS_PER_PAGE
ordering = ['title']
def get_context_data(self, **kwargs):
@@ -105,12 +219,14 @@ class PublicIndexView(ListView):
}
def get_queryset(self, **kwargs):
- qs = super().get_queryset(**kwargs)
+ qs = super().get_queryset(**kwargs)
query = self.request.GET.get('q')
if query:
qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
+
for snapshot in qs:
- snapshot.icons = snapshot_icons(snapshot)
+ # lazily load snapshot icons, otherwise icons for the entire index would be rendered at once
+ snapshot.icons = lambda: snapshot_icons(snapshot)
return qs
def get(self, *args, **kwargs):
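The lazy icons assignment above defers rendering until a row is actually displayed. A minimal sketch of the same idea outside the view (names are illustrative); binding the loop variable as a default argument avoids Python's late-binding closures, where every callable would otherwise see the last snapshot of the loop:

```python
def attach_lazy_icons(snapshots, render_icons):
    # render_icons is any callable like snapshot_icons(snapshot) -> str
    for snapshot in snapshots:
        # bind the current snapshot now; a bare `lambda: render_icons(snapshot)`
        # would look up `snapshot` only when called, after the loop has finished
        snapshot.icons = lambda snap=snapshot: render_icons(snap)
    return snapshots
```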
@@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView):
if self.request.method == 'GET':
url = self.request.GET.get('url', None)
if url:
- return {'url': url}
- else:
- return super().get_initial()
+ return {'url': url if '://' in url else f'https://{url}'}
+
+ return super().get_initial()
def test_func(self):
return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
@@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView):
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': FOOTER_INFO,
+ 'stdout': '',
}
def form_valid(self, form):
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
+ tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": url,
+ "tag": tag,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py
index f933afae..94993b92 100644
--- a/archivebox/core/wsgi.py
+++ b/archivebox/core/wsgi.py
@@ -7,10 +7,10 @@ For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
-import os
+
+from archivebox.config import setup_django
+setup_django(in_memory_db=False, check_db=True)
from django.core.wsgi import get_wsgi_application
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
-
application = get_wsgi_application()
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 15968097..09b56c66 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -44,16 +44,16 @@ def get_default_archive_methods():
return [
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
- ('wget', should_save_wget, save_wget),
+ ('headers', should_save_headers, save_headers),
('singlefile', should_save_singlefile, save_singlefile),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
- ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+ ('wget', should_save_wget, save_wget),
+ ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
('mercury', should_save_mercury, save_mercury),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
- ('headers', should_save_headers, save_headers),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
]
@@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
+
+ # bump the updated time on the main Snapshot here, this is critical
+ # to be able to cache summaries of the ArchiveResults for a given
+ # snapshot without having to load all the results from the DB each time.
+ # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
+ # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
+ snapshot.save()
else:
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
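The comment above describes the caching contract that snapshot_icons() relies on in archivebox/index/html.py: the cache key embeds the Snapshot id plus its updated timestamp, so bumping updated via snapshot.save() abandons any stale cached summary without an explicit cache.delete(). A minimal sketch of the pattern, with illustrative names:

```python
from django.core.cache import cache

def cached_summary(snapshot, compute_summary):
    # the key changes whenever the snapshot is saved, so stale entries are never read again
    cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-summary'
    return cache.get_or_set(cache_key, lambda: compute_summary(snapshot))
```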
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index 1f382190..a0883113 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'archive.org.txt').exists():
- # if open(path, 'r').read().strip() != 'None':
+ # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py
index d9e32c0a..e7d20362 100644
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "mercury"
- output = str(output_folder)
+ output = "mercury"
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
+ output_folder.mkdir(exist_ok=True)
+
# Get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
except json.JSONDecodeError:
raise ShellError(cmd, result)
+ if article_text.get('failed'):
+ raise ArchiveError('Mercury was not able to get article text from the URL')
+
+ atomic_write(str(output_folder / "content.txt"), article_text["content"])
+
# Get HTML version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
except json.JSONDecodeError:
raise ShellError(cmd, result)
- output_folder.mkdir(exist_ok=True)
+ if article_text.get('failed'):
+ raise ArchiveError('Mercury was not able to get article HTML from the URL')
+
atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
- atomic_write(str(output_folder / "content.txt"), article_text["content"])
atomic_write(str(output_folder / "article.json"), article_json)
# Check for common failure cases
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index 6e48cd9a..d7c1e303 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str:
document = None
for source in sources:
try:
- with open(abs_path / source, "r") as f:
+ with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "readability"
- output = str(output_folder)
+ output = "readability"
# Readability Docs: https://github.com/mozilla/readability
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
+ if not document or len(document) < 10:
+ raise ArchiveError('Readability could not find HTML to parse for article text')
+
cmd = [
DEPENDENCIES['READABILITY_BINARY']['path'],
- temp_doc.name
+ temp_doc.name,
]
result = run(cmd, cwd=out_dir, timeout=timeout)
- result_json = json.loads(result.stdout)
+ try:
+ result_json = json.loads(result.stdout)
+ except json.JSONDecodeError:
+ raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
+
output_folder.mkdir(exist_ok=True)
readability_content = result_json.pop("textContent")
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
except (Exception, OSError) as err:
status = 'failed'
output = err
+ cmd = [cmd[0], './{singlefile,dom}.html']
finally:
timer.end()
@@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
cmd_version=READABILITY_VERSION,
output=output,
status=status,
- index_texts= [readability_content] if readability_content else [],
+ index_texts=[readability_content] if readability_content else [],
**timer.stats,
)
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 04ab0a8d..d3d1bedc 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -356,6 +356,7 @@ LINK_FILTERS = {
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
'tag': lambda pattern: Q(tags__name=pattern),
+ 'timestamp': lambda pattern: Q(timestamp=pattern),
}
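Each LINK_FILTERS entry maps a filter type to a lambda returning a Q object, so the new timestamp filter plugs into the existing snapshot filtering path. A hedged usage sketch (assuming an initialized collection and the Snapshot model):

```python
from core.models import Snapshot

# select snapshots whose timestamp exactly matches the given pattern
matching = Snapshot.objects.filter(LINK_FILTERS['timestamp']('1611234567.0'))
```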
@enforce_types
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index ebfe7d78..c4f66f55 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -1,11 +1,12 @@
__package__ = 'archivebox.index'
-from datetime import datetime
-from typing import List, Optional, Iterator, Mapping
from pathlib import Path
+from datetime import datetime
+from collections import defaultdict
+from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe
-from collections import defaultdict
+from django.core.cache import cache
from .schema import Link
from ..system import atomic_write
@@ -20,7 +21,6 @@ from ..util import (
from ..config import (
OUTPUT_DIR,
VERSION,
- GIT_SHA,
FOOTER_INFO,
HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG,
@@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
return render_django_template(template, {
'version': VERSION,
- 'git_sha': GIT_SHA,
+ 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'num_links': str(len(links)),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
@@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str:
- from core.models import EXTRACTORS
+ cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
+
+ def calc_snapshot_icons():
+ from core.models import EXTRACTORS
+ # start = datetime.now()
- # start = datetime.now()
+ archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+ link = snapshot.as_link()
+ path = link.archive_path
+ canon = link.canonical_outputs()
+ output = ""
+ output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
+ icons = {
+ "singlefile": "❶",
+ "wget": "🆆",
+ "dom": "🅷",
+ "pdf": "📄",
+ "screenshot": "💻",
+ "media": "📼",
+ "git": "🅶",
+ "archive_org": "🏛",
+ "readability": "🆁",
+ "mercury": "🅼",
+ "warc": "📦"
+ }
+ exclude = ["favicon", "title", "headers", "archive_org"]
+ # Missing specific entry for WARC
- archive_results = snapshot.archiveresult_set.filter(status="succeeded")
- link = snapshot.as_link()
- path = link.archive_path
- canon = link.canonical_outputs()
- output = ""
- output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
- icons = {
- "singlefile": "❶",
- "wget": "🆆",
- "dom": "🅷",
- "pdf": "📄",
- "screenshot": "💻",
- "media": "📼",
- "git": "🅶",
- "archive_org": "🏛",
- "readability": "🆁",
- "mercury": "🅼",
- "warc": "📦"
- }
- exclude = ["favicon", "title", "headers", "archive_org"]
- # Missing specific entry for WARC
+ extractor_outputs = defaultdict(lambda: None)
+ for extractor, _ in EXTRACTORS:
+ for result in archive_results:
+ if result.extractor == extractor and result:
+ extractor_outputs[extractor] = result
- extractor_outputs = defaultdict(lambda: None)
- for extractor, _ in EXTRACTORS:
- for result in archive_results:
- if result.extractor == extractor and result:
- extractor_outputs[extractor] = result
+ for extractor, _ in EXTRACTORS:
+ if extractor not in exclude:
+ existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+ # if existing:
+ # existing = (Path(path) / existing)
+ # if existing.is_file():
+ # existing = True
+ # elif existing.is_dir():
+ # existing = any(existing.glob('*.*'))
+ output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
+ extractor, icons.get(extractor, "?"))
+ if extractor == "wget":
+ # warc isn't technically its own extractor, so we have to add it after wget
+
+ # get from db (faster but less truthful)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower but more accurate)
+ # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+ output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
- for extractor, _ in EXTRACTORS:
- if extractor not in exclude:
- existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
- # if existing:
- # existing = (Path(path) / existing)
- # if existing.is_file():
- # existing = True
- # elif existing.is_dir():
- # existing = any(existing.glob('*.*'))
- output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
- extractor, icons.get(extractor, "?"))
- if extractor == "wget":
- # warc isn't technically it's own extractor, so we have to add it after wget
-
- # get from db (faster but less thurthful)
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # get from filesystem (slower but more accurate)
- # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
- output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+ if extractor == "archive_org":
+ # The check for archive_org is different, so it has to be handled separately
- if extractor == "archive_org":
- # The check for archive_org is different, so it has to be handled separately
+ # get from db (faster)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower)
+ # target_path = Path(path) / "archive.org.txt"
+ # exists = target_path.exists()
+ output += '{} '.format(canon["archive_org_path"], str(exists),
+ "archive_org", icons.get("archive_org", "?"))
- # get from db (faster)
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # get from filesystem (slower)
- # target_path = Path(path) / "archive.org.txt"
- # exists = target_path.exists()
- output += '<a href="{}" class="exists-{}" title="{}">{} </a>'.format(canon["archive_org_path"], str(exists),
- "archive_org", icons.get("archive_org", "?"))
+ result = format_html('{}', mark_safe(output))
+ # end = datetime.now()
+ # print(((end - start).total_seconds()*1000) // 1, 'ms')
+ return result
- result = format_html('{}', mark_safe(output))
- # end = datetime.now()
- # print(((end - start).total_seconds()*1000) // 1, 'ms')
- return result
+ return cache.get_or_set(cache_key, calc_snapshot_icons)
+ # return calc_snapshot_icons()
+
+
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index f24b969f..441e6854 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -15,7 +15,6 @@ from ..config import (
VERSION,
OUTPUT_DIR,
FOOTER_INFO,
- GIT_SHA,
DEPENDENCIES,
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
@@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = {
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
- 'git_sha': GIT_SHA,
+ 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 1ca4e801..00831e19 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields
+from django.utils.functional import cached_property
from ..system import get_dir_size
@@ -133,7 +134,6 @@ class Link:
updated: Optional[datetime] = None
schema: str = 'Link'
-
def __str__(self) -> str:
return f'[{self.timestamp}] {self.url} "{self.title}"'
@@ -190,6 +190,7 @@ class Link:
}
if extended:
info.update({
+ 'snapshot_id': self.snapshot_id,
'link_dir': self.link_dir,
'archive_path': self.archive_path,
@@ -201,6 +202,9 @@ class Link:
'basename': self.basename,
'extension': self.extension,
'is_static': self.is_static,
+
+ 'tags_str': self.tags, # only used to render static index in index/html.py, remove if no longer needed there
+ 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there
'bookmarked_date': self.bookmarked_date,
'updated_date': self.updated_date,
@@ -255,6 +259,11 @@ class Link:
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
+ @cached_property
+ def snapshot_id(self):
+ from core.models import Snapshot
+ return str(Snapshot.objects.only('id').get(url=self.url).id)
+
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 1e99f67c..2fcabd61 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -7,7 +7,7 @@ from django.db.models import QuerySet
from django.db import transaction
from .schema import Link
-from ..util import enforce_types
+from ..util import enforce_types, parse_date
from ..config import OUTPUT_DIR
@@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
)
@enforce_types
-def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
- with transaction.atomic():
- snapshots.delete()
+def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+ if atomic:
+ with transaction.atomic():
+ return snapshots.delete()
+ return snapshots.delete()
@enforce_types
def write_link_to_sql_index(link: Link):
- from core.models import Snapshot
+ from core.models import Snapshot, ArchiveResult
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
@@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link):
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
- snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
+ snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
+
+ for extractor, entries in link.history.items():
+ for entry in entries:
+ if isinstance(entry, dict):
+ result, _ = ArchiveResult.objects.get_or_create(
+ snapshot_id=snapshot.id,
+ extractor=extractor,
+ start_ts=parse_date(entry['start_ts']),
+ defaults={
+ 'end_ts': parse_date(entry['end_ts']),
+ 'cmd': entry['cmd'],
+ 'output': entry['output'],
+ 'cmd_version': entry.get('cmd_version') or 'unknown',
+ 'pwd': entry['pwd'],
+ 'status': entry['status'],
+ }
+ )
+ else:
+ result, _ = ArchiveResult.objects.update_or_create(
+ snapshot_id=snapshot.id,
+ extractor=extractor,
+ start_ts=parse_date(entry.start_ts),
+ defaults={
+ 'end_ts': parse_date(entry.end_ts),
+ 'cmd': entry.cmd,
+ 'output': entry.output,
+ 'cmd_version': entry.cmd_version or 'unknown',
+ 'pwd': entry.pwd,
+ 'status': entry.status,
+ }
+ )
+
return snapshot
@enforce_types
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
- with transaction.atomic():
- for link in links:
- write_link_to_sql_index(link)
+ for link in links:
+ # with transaction.atomic():
+ # write_link_to_sql_index(link)
+ write_link_to_sql_index(link)
@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
from core.models import Snapshot
- with transaction.atomic():
- try:
- snap = Snapshot.objects.get(url=link.url)
- except Snapshot.DoesNotExist:
- snap = write_link_to_sql_index(link)
- snap.title = link.title
+ # with transaction.atomic():
+ # try:
+ # snap = Snapshot.objects.get(url=link.url)
+ # except Snapshot.DoesNotExist:
+ # snap = write_link_to_sql_index(link)
+ # snap.title = link.title
+ try:
+ snap = Snapshot.objects.get(url=link.url)
+ except Snapshot.DoesNotExist:
+ snap = write_link_to_sql_index(link)
+ snap.title = link.title
- tag_set = (
- set(tag.strip() for tag in (link.tags or '').split(','))
- )
- tag_list = list(tag_set) or []
+ tag_set = (
+ set(tag.strip() for tag in (link.tags or '').split(','))
+ )
+ tag_list = list(tag_set) or []
- snap.save()
- snap.save_tags(tag_list)
+ snap.save()
+ snap.save_tags(tag_list)
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index f2b86735..492ae55e 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import os
import sys
+import stat
import time
import argparse
from math import log
@@ -11,18 +12,21 @@ from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
-from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
+from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
if TYPE_CHECKING:
from .index.schema import Link, ArchiveResult
+from .system import get_dir_size
from .util import enforce_types
from .config import (
ConfigDict,
OUTPUT_DIR,
PYTHON_ENCODING,
+ VERSION,
ANSI,
IS_TTY,
+ IN_DOCKER,
TERM_WIDTH,
SHOW_PROGRESS,
SOURCES_DIR_NAME,
@@ -50,6 +54,37 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
+def debug_dict_summary(obj: Dict[Any, Any]) -> None:
+ stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
+
+
+def get_fd_info(fd) -> Dict[str, Any]:
+ NAME = fd.name[1:-1]
+ FILENO = fd.fileno()
+ MODE = os.fstat(FILENO).st_mode
+ IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
+ IS_PIPE = stat.S_ISFIFO(MODE)
+ IS_FILE = stat.S_ISREG(MODE)
+ IS_TERMINAL = not (IS_PIPE or IS_FILE)
+ IS_LINE_BUFFERED = fd.line_buffering
+ IS_READABLE = fd.readable()
+ return {
+ 'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
+ 'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
+ 'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
+ 'IS_READABLE': IS_READABLE,
+ }
+
+
+# # Log debug information about stdin, stdout, and stderr
+# sys.stdout.write('[>&1] this is python stdout\n')
+# sys.stderr.write('[>&2] this is python stderr\n')
+
+# debug_dict_summary(get_fd_info(sys.stdin))
+# debug_dict_summary(get_fd_info(sys.stdout))
+# debug_dict_summary(get_fd_info(sys.stderr))
+
+
class SmartFormatter(argparse.HelpFormatter):
"""Patched formatter that prints newlines in argparse help strings"""
@@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter):
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
- if stdin and not stdin.isatty():
- stdin_raw_text = stdin.read().strip()
+ if not stdin:
+ return None
+
+ if IN_DOCKER:
+ # when TTY is disabled in docker we can't tell if stdin is being piped in or not
+ # if we try to read stdin when it's not piped we will hang indefinitely waiting for it
+ return None
+
+ if not stdin.isatty():
+ # stderr('READING STDIN TO REJECT...')
+ stdin_raw_text = stdin.read()
if stdin_raw_text:
+ # stderr('GOT STDIN!', len(stdin_str))
stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
stderr()
raise SystemExit(1)
+ return None
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
"""accept any standard input and return it as a string or None"""
+
if not stdin:
return None
- elif stdin and not stdin.isatty():
- stdin_str = stdin.read().strip()
- return stdin_str or None
+
+ if not stdin.isatty():
+ # stderr('READING STDIN TO ACCEPT...')
+ stdin_str = stdin.read()
+
+ if stdin_str:
+ # stderr('GOT STDIN...', len(stdin_str))
+ return stdin_str
+
return None
@@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None:
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
- from .config import VERSION, ANSI
cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -233,11 +285,11 @@ def log_indexing_process_finished():
def log_indexing_started(out_path: str):
if IS_TTY:
- sys.stdout.write(f' > {out_path}')
+ sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
def log_indexing_finished(out_path: str):
- print(f'\r √ {out_path}')
+ print(f'\r √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
### Archiving Stage
@@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
total=num_links,
))
print()
- print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
- print(' archivebox server # then visit http://127.0.0.1:8000')
print(' Continue archiving where you left off by running:')
print(' archivebox update --resume={}'.format(timestamp))
@@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
else:
_LAST_RUN_STATS.succeeded += 1
+ size = get_dir_size(link_dir)
+ print(' {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI))
+
def log_archive_method_started(method: str):
print(' > {}'.format(method))
diff --git a/archivebox/main.py b/archivebox/main.py
index c1751528..5c697c55 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -67,6 +67,7 @@ from .config import (
ConfigDict,
ANSI,
IS_TTY,
+ DEBUG,
IN_DOCKER,
USER,
ARCHIVEBOX_BINARY,
@@ -76,6 +77,7 @@ from .config import (
ARCHIVE_DIR,
LOGS_DIR,
CONFIG_FILE,
+ CONFIG_FILENAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@@ -84,6 +86,7 @@ from .config import (
SQL_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
+ SEARCH_BACKEND_ENGINE,
check_dependencies,
check_data_folder,
write_config_file,
@@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = {
'node_modules',
'package-lock.json',
'static',
+ 'sonic',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
SQL_INDEX_FILENAME,
+ f'{SQL_INDEX_FILENAME}-wal',
+ f'{SQL_INDEX_FILENAME}-shm',
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
+ CONFIG_FILENAME,
+ f'{CONFIG_FILENAME}.bak',
}
@enforce_types
@@ -214,9 +222,23 @@ def version(quiet: bool=False,
if quiet:
print(VERSION)
else:
+ # ArchiveBox v0.5.6
+ # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
print('ArchiveBox v{}'.format(VERSION))
p = platform.uname()
- print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
+ print(
+ sys.implementation.name.title(),
+ p.system,
+ platform.platform(),
+ p.machine,
+ )
+ print(
+ f'IN_DOCKER={IN_DOCKER}',
+ f'DEBUG={DEBUG}',
+ f'IS_TTY={IS_TTY}',
+ f'TZ={os.environ.get("TZ", "UTC")}',
+ f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
+ )
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -261,7 +283,7 @@ def run(subcommand: str,
@enforce_types
-def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
from core.models import Snapshot
@@ -276,13 +298,12 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
if is_empty and not existing_index:
- print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
- print(f' {out_dir}')
- print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
+ print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
elif existing_index:
- print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
- print(f' {out_dir}')
- print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ # TODO: properly detect and print the existing version in current index as well
+ print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
+ print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
else:
if force:
stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
@@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
else:
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
+ print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
Path(SOURCES_DIR).mkdir(exist_ok=True)
- print(f' √ {SOURCES_DIR}')
-
Path(ARCHIVE_DIR).mkdir(exist_ok=True)
- print(f' √ {ARCHIVE_DIR}')
-
Path(LOGS_DIR).mkdir(exist_ok=True)
- print(f' √ {LOGS_DIR}')
-
+ print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
write_config_file({}, out_dir=out_dir)
- print(f' √ {CONFIG_FILE}')
+
if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
- print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+ print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
else:
- print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+ print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
- print(f' √ {DATABASE_FILE}')
- print()
for migration_line in apply_migrations(out_dir):
print(f' {migration_line}')
-
assert DATABASE_FILE.exists()
+ print()
+ print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
# from django.contrib.auth.models import User
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
# call_command("createsuperuser", interactive=True)
print()
- print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+ print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))
all_links = Snapshot.objects.none()
pending_links: Dict[str, Link] = {}
@@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
all_links = load_main_index(out_dir=out_dir, warn=False)
print(' √ Loaded {} links from existing main index.'.format(all_links.count()))
- # Links in data folders that dont match their timestamp
- fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
- if fixed:
- print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
- if cant_fix:
- print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+ if quick:
+ print(' > Skipping full snapshot directory check (quick mode)')
+ else:
+ try:
+ # Links in data folders that dont match their timestamp
+ fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+ if fixed:
+ print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+ if cant_fix:
+ print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
- # Links in JSON index but not in main index
- orphaned_json_links = {
- link.url: link
- for link in parse_json_main_index(out_dir)
- if not all_links.filter(url=link.url).exists()
- }
- if orphaned_json_links:
- pending_links.update(orphaned_json_links)
- print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+ # Links in JSON index but not in main index
+ orphaned_json_links = {
+ link.url: link
+ for link in parse_json_main_index(out_dir)
+ if not all_links.filter(url=link.url).exists()
+ }
+ if orphaned_json_links:
+ pending_links.update(orphaned_json_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
- # Links in data dir indexes but not in main index
- orphaned_data_dir_links = {
- link.url: link
- for link in parse_json_links_details(out_dir)
- if not all_links.filter(url=link.url).exists()
- }
- if orphaned_data_dir_links:
- pending_links.update(orphaned_data_dir_links)
- print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+ # Links in data dir indexes but not in main index
+ orphaned_data_dir_links = {
+ link.url: link
+ for link in parse_json_links_details(out_dir)
+ if not all_links.filter(url=link.url).exists()
+ }
+ if orphaned_data_dir_links:
+ pending_links.update(orphaned_data_dir_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
- # Links in invalid/duplicate data dirs
- invalid_folders = {
- folder: link
- for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
- }
- if invalid_folders:
- print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
- print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
- print()
- print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
- print(' archivebox status')
- print(' archivebox list --status=invalid')
+ # Links in invalid/duplicate data dirs
+ invalid_folders = {
+ folder: link
+ for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+ }
+ if invalid_folders:
+ print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+ print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
+ print()
+ print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+ print(' archivebox status')
+ print(' archivebox list --status=invalid')
+ except (KeyboardInterrupt, SystemExit):
+ stderr()
+ stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
+ stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
+ stderr()
+ stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
+ stderr(' archivebox init --quick')
+ raise SystemExit(1)
+
+ write_main_index(list(pending_links.values()), out_dir=out_dir)
- write_main_index(list(pending_links.values()), out_dir=out_dir)
-
- print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
- print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
- print()
- print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
- print(' archivebox server # then visit http://127.0.0.1:8000')
- print()
- print(' To add new links, you can run:')
- print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
- print()
- print(' For more usage and examples, run:')
- print(' archivebox help')
+ print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
+
+ if Snapshot.objects.count() < 25: # hide the hints for experienced users
+ print()
+ print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
+ print(' archivebox server # then visit http://127.0.0.1:8000')
+ print()
+ print(' To add new links, you can run:')
+ print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
+ print()
+ print(' For more usage and examples, run:')
+ print(' archivebox help')
json_index = Path(out_dir) / JSON_INDEX_FILENAME
html_index = Path(out_dir) / HTML_INDEX_FILENAME
@@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
@enforce_types
def add(urls: Union[str, List[str]],
+ tag: str='',
depth: int=0,
update_all: bool=not ONLY_NEW,
index_only: bool=False,
@@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]],
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
+ from core.models import Tag
+
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
extractors = extractors.split(",") if extractors else []
@@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]],
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
+
new_links = dedupe_links(all_links, imported_links)
write_main_index(links=new_links, out_dir=out_dir)
all_links = load_main_index(out_dir=out_dir)
if index_only:
- return all_links
+ # mock archive all the links using the fake index_only extractor method in order to update their state
+ if overwrite:
+ archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
+ else:
+ archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
+ else:
+ # fully run the archive extractor methods for each link
+ archive_kwargs = {
+ "out_dir": out_dir,
+ }
+ if extractors:
+ archive_kwargs["methods"] = extractors
+
+ if update_all:
+ archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+ elif overwrite:
+ archive_links(imported_links, overwrite=True, **archive_kwargs)
+ elif new_links:
+ archive_links(new_links, overwrite=False, **archive_kwargs)
+
+
+ # add any tags to imported links
+ tags = [
+ Tag.objects.get_or_create(name=name.strip())[0]
+ for name in tag.split(',')
+ if name.strip()
+ ]
+ if tags:
+ for link in imported_links:
+ snapshot = link.as_snapshot()
+ snapshot.tags.add(*tags)
+ snapshot.tags_str(nocache=True)
+ snapshot.save()
+ # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
- # Run the archive methods for each link
- archive_kwargs = {
- "out_dir": out_dir,
- }
- if extractors:
- archive_kwargs["methods"] = extractors
- if update_all:
- archive_links(all_links, overwrite=overwrite, **archive_kwargs)
- elif overwrite:
- archive_links(imported_links, overwrite=True, **archive_kwargs)
- elif new_links:
- archive_links(new_links, overwrite=False, **archive_kwargs)
return all_links
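A hedged usage sketch of the extended add() signature: the new tag parameter is a comma-separated string, and each name is get_or_create'd as a Tag and attached to the Snapshot of every imported link (the CLI flag wiring is not shown in this hunk):

```python
from archivebox.main import add

add(
    urls='https://example.com/some/article',
    tag='news,longform',   # new in this change: comma-separated tag names
    depth=0,
    update_all=False,
)
```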
@@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None,
all_snapshots = load_main_index(out_dir=out_dir)
if after is not None:
- all_snapshots = all_snapshots.filter(timestamp__lt=after)
+ all_snapshots = all_snapshots.filter(timestamp__gte=after)
if before is not None:
- all_snapshots = all_snapshots.filter(timestamp__gt=before)
+ all_snapshots = all_snapshots.filter(timestamp__lt=before)
if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+
+ if not all_snapshots:
+ stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
+
return all_snapshots
@enforce_types
@@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None,
reload: bool=False,
debug: bool=False,
init: bool=False,
+ quick_init: bool=False,
createsuperuser: bool=False,
out_dir: Path=OUTPUT_DIR) -> None:
"""Run the ArchiveBox HTTP server"""
@@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None,
if init:
run_subcommand('init', stdin=None, pwd=out_dir)
+ print()
+ elif quick_init:
+ run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
+ print()
if createsuperuser:
run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+ print()
# setup config for django runserver
from . import config
@@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None,
from django.core.management import call_command
from django.contrib.auth.models import User
- admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()
-
print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
- if admin_user:
- hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
- else:
+ print(' > Logging errors to ./logs/errors.log')
+ if not User.objects.filter(is_superuser=True).exists():
print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
print()
print(' To create an admin user, run:')
@@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None,
config.SHOW_PROGRESS = False
config.DEBUG = config.DEBUG or debug
-
call_command("runserver", *runserver_args)
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 441c08ac..4af2c5ac 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
"""
parse a list of URLS without touching the filesystem
"""
- check_url_parsing_invariants()
timer = TimedProgress(TIMEOUT * 4)
#urls = list(map(lambda x: x + "\n", urls))
@@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
RSS feed, bookmarks export, or text file
"""
- check_url_parsing_invariants()
-
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
links, parser = run_parser_functions(file, timer, root_url=root_url)
@@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
return source_path
-def check_url_parsing_invariants() -> None:
- """Check that plain text regex URL parsing works as expected"""
-
- # this is last-line-of-defense to make sure the URL_REGEX isn't
- # misbehaving, as the consequences could be disastrous and lead to many
- # incorrect/badly parsed links being added to the archive
-
- test_urls = '''
- https://example1.com/what/is/happening.html?what=1#how-about-this=1
- https://example2.com/what/is/happening/?what=1#how-about-this=1
- HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
- https://example4.com/what/is/happening.html
- https://example5.com/
- https://example6.com
-
- http://example7.com
- [https://example8.com/what/is/this.php?what=1]
- [and http://example9.com?what=1&other=3#and-thing=2]
- https://example10.com#and-thing=2 "
- abcdef
- sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
- example13.bada
- and example14.badb
- htt://example15.badc
- '''
- # print('\n'.join(re.findall(URL_REGEX, test_urls)))
- assert len(re.findall(URL_REGEX, test_urls)) == 12
-
+# Check that plain text regex URL parsing works as expected
+# this is last-line-of-defense to make sure the URL_REGEX isn't
+# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
+# the consequences of bad URL parsing could be disastrous and lead to many
+# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
+_test_url_strs = {
+ 'example.com': 0,
+ '/example.com': 0,
+ '//example.com': 0,
+ ':/example.com': 0,
+ '://example.com': 0,
+ 'htt://example8.com': 0,
+ '/htt://example.com': 0,
+ 'https://example': 1,
+ 'https://localhost/2345': 1,
+ 'https://localhost:1234/123': 1,
+ '://': 0,
+ 'https://': 0,
+ 'http://': 0,
+ 'ftp://': 0,
+ 'ftp://example.com': 0,
+ 'https://example.com': 1,
+ 'https://example.com/': 1,
+ 'https://a.example.com': 1,
+ 'https://a.example.com/': 1,
+ 'https://a.example.com/what/is/happening.html': 1,
+ 'https://a.example.com/what/ís/happening.html': 1,
+ 'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+ 'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+ 'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+ 'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+ 'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+ 'http://example7.com': 1,
+ '[https://example8.com/what/is/this.php?what=1]': 1,
+ '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+ 'https://example10.com#and-thing=2 "': 1,
+ 'abcdef': 1,
+ 'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+ 'http://examplehttp://15.badc': 2,
+ 'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+ '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+ assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
+ f'{url_str} does not contain {num_urls} urls')
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index e6d15455..82d1880e 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
if extra_path:
fpath = f'{fpath}/{extra_path}'
- with open(fpath, 'r') as file:
+ with open(fpath, 'r', encoding='utf-8') as file:
data = file.read()
if data:
return [data]
diff --git a/archivebox/system.py b/archivebox/system.py
index 2191c70a..3adf2e73 100644
--- a/archivebox/system.py
+++ b/archivebox/system.py
@@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
mode = 'wb+' if isinstance(contents, bytes) else 'w'
+ encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
try:
- with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
+ with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
if isinstance(contents, dict):
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
elif isinstance(contents, (bytes, str)):
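For reference, a minimal standalone sketch of the same "write to a temp file, then atomically rename" idea with utf-8 enforced for text writes; this is only an illustration of why the encoding argument matters, not the lib_atomic_write helper used above:

```python
import os
import tempfile

def atomic_write_text(path: str, contents: str) -> None:
    dirname = os.path.dirname(os.path.abspath(path))
    # write to a temp file on the same filesystem so the final rename is atomic
    fd, tmp_path = tempfile.mkstemp(dir=dirname)
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:   # enforce utf-8 on all text writes
            f.write(contents)
        os.replace(tmp_path, path)   # atomic rename, replaces any existing file
    except BaseException:
        os.unlink(tmp_path)
        raise
```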
diff --git a/archivebox/templates/admin/actions_as_select.html b/archivebox/templates/admin/actions_as_select.html
index 86a77190..e69de29b 100644
--- a/archivebox/templates/admin/actions_as_select.html
+++ b/archivebox/templates/admin/actions_as_select.html
@@ -1 +0,0 @@
-actions_as_select
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html
index d8ad8d00..a3d21ba9 100644
--- a/archivebox/templates/admin/base.html
+++ b/archivebox/templates/admin/base.html
@@ -20,7 +20,7 @@
-