
make everything take link_dir as an optional arg, since it's derivable from the link url

Nick Sweeting 2019-03-27 18:24:30 -04:00
parent 9fc1e3c3e1
commit a214bd7c02
5 changed files with 65 additions and 59 deletions
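
The idea behind the refactor is that every Link already knows where its own archive folder lives, so link_dir only needs to be passed when a caller wants to override it. A minimal sketch of the convention applied throughout this commit, using an illustrative stand-in rather than the real ArchiveBox Link schema:

import os
from dataclasses import dataclass
from typing import Optional

OUTPUT_DIR = '/tmp/archivebox'   # assumption: stand-in for the configured output dir

@dataclass
class Link:
    url: str
    timestamp: str

    @property
    def link_dir(self) -> str:
        # the folder is derived from the link itself, so callers rarely need to pass it
        return os.path.join(OUTPUT_DIR, 'archive', self.timestamp)

def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
    # fall back to the derived directory when no explicit override is given
    link_dir = link_dir or link.link_dir
    os.makedirs(link_dir, exist_ok=True)
    return link

link = Link(url='https://example.com', timestamp='1553725470')
archive_link(link)                                  # uses link.link_dir
archive_link(link, link_dir='/tmp/somewhere-else')  # explicit override still works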

View file

@@ -180,7 +180,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_links_index(out_dir=OUTPUT_DIR, links=list(all_links))
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
# Step 3: Run the archive methods for each link
links = new_links if ONLY_NEW else all_links
@@ -189,7 +189,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
link: Optional[Link] = None
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
archive_link(link)
archive_link(link, link_dir=link.link_dir)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp if link else '0')
@@ -203,7 +203,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(out_dir=OUTPUT_DIR, links=list(all_links), finished=True)
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
if __name__ == '__main__':

View file

@@ -1,6 +1,6 @@
import os
from typing import Dict, List, Tuple
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime
@@ -69,7 +69,7 @@ class ArchiveError(Exception):
@enforce_types
def archive_link(link: Link, page=None) -> Link:
def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
ARCHIVE_METHODS = (
@@ -84,13 +84,14 @@ def archive_link(link: Link, page=None) -> Link:
('archive_org', should_fetch_archive_dot_org, archive_dot_org),
)
link_dir = link_dir or link.link_dir
try:
is_new = not os.path.exists(link.link_dir)
is_new = not os.path.exists(link_dir)
if is_new:
os.makedirs(link.link_dir)
os.makedirs(link_dir)
link = load_json_link_index(link.link_dir, link)
log_link_archiving_started(link.link_dir, link, is_new)
link = load_json_link_index(link, link_dir)
log_link_archiving_started(link, link_dir, is_new)
link = link.overwrite(updated=datetime.now())
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -99,10 +100,10 @@ def archive_link(link: Link, page=None) -> Link:
if method_name not in link.history:
link.history[method_name] = []
if should_run(link.link_dir, link):
if should_run(link, link_dir):
log_archive_method_started(method_name)
result = method_function(link.link_dir, link)
result = method_function(link, link_dir)
link.history[method_name].append(result)
@@ -126,7 +127,7 @@ def archive_link(link: Link, page=None) -> Link:
patch_links_index(link)
log_link_archiving_finished(link.link_dir, link, is_new, stats)
log_link_archiving_finished(link, link.link_dir, is_new, stats)
except KeyboardInterrupt:
raise
@@ -141,7 +142,7 @@ def archive_link(link: Link, page=None) -> Link:
### Archive Method Functions
@enforce_types
def should_fetch_title(link_dir: str, link: Link) -> bool:
def should_fetch_title(link: Link, link_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
return False
@@ -152,7 +153,7 @@ def should_fetch_title(link_dir: str, link: Link) -> bool:
return FETCH_TITLE
@enforce_types
def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_title(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
output = None
@@ -186,14 +187,14 @@ def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResul
@enforce_types
def should_fetch_favicon(link_dir: str, link: Link) -> bool:
def should_fetch_favicon(link: Link, link_dir: Optional[str]=None) -> bool:
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
return False
return FETCH_FAVICON
@enforce_types
def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_favicon(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
output = 'favicon.ico'
@@ -226,7 +227,7 @@ def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveRes
)
@enforce_types
def should_fetch_wget(link_dir: str, link: Link) -> bool:
def should_fetch_wget(link: Link, link_dir: Optional[str]=None) -> bool:
output_path = wget_output_path(link)
if output_path and os.path.exists(os.path.join(link_dir, output_path)):
return False
@@ -235,7 +236,7 @@ def should_fetch_wget(link_dir: str, link: Link) -> bool:
@enforce_types
def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_wget(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using wget"""
if FETCH_WARC:
@@ -315,7 +316,7 @@ def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
)
@enforce_types
def should_fetch_pdf(link_dir: str, link: Link) -> bool:
def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
if is_static_file(link.url):
return False
@@ -326,7 +327,7 @@ def should_fetch_pdf(link_dir: str, link: Link) -> bool:
@enforce_types
def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_pdf(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
output = 'output.pdf'
@@ -361,7 +362,7 @@ def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
)
@enforce_types
def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
def should_fetch_screenshot(link: Link, link_dir: Optional[str]=None) -> bool:
if is_static_file(link.url):
return False
@@ -371,7 +372,7 @@ def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
return FETCH_SCREENSHOT
@enforce_types
def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_screenshot(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
output = 'screenshot.png'
@@ -406,7 +407,7 @@ def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> Archive
)
@enforce_types
def should_fetch_dom(link_dir: str, link: Link) -> bool:
def should_fetch_dom(link: Link, link_dir: Optional[str]=None) -> bool:
if is_static_file(link.url):
return False
@@ -416,7 +417,7 @@ def should_fetch_dom(link_dir: str, link: Link) -> bool:
return FETCH_DOM
@enforce_types
def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_dom(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
output = 'output.html'
@@ -453,7 +454,7 @@ def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
)
@enforce_types
def should_fetch_git(link_dir: str, link: Link) -> bool:
def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
if is_static_file(link.url):
return False
@@ -471,7 +472,7 @@ def should_fetch_git(link_dir: str, link: Link) -> bool:
@enforce_types
def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using git"""
output = 'git'
@@ -514,7 +515,7 @@ def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
@enforce_types
def should_fetch_media(link_dir: str, link: Link) -> bool:
def should_fetch_media(link: Link, link_dir: Optional[str]=None) -> bool:
if is_static_file(link.url):
return False
@@ -524,7 +525,7 @@ def should_fetch_media(link_dir: str, link: Link) -> bool:
return FETCH_MEDIA
@enforce_types
def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media'
@@ -588,7 +589,7 @@ def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> Archiv
@enforce_types
def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bool:
if is_static_file(link.url):
return False
@@ -599,7 +600,7 @@ def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
return SUBMIT_ARCHIVE_DOT_ORG
@enforce_types
def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
def archive_dot_org(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""submit site to archive.org for archiving via their service, save returned archive url"""
output = 'archive.org.txt'
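
All of the should_fetch_*/fetch_* pairs above share the flipped (link, link_dir=None) signature, which is what keeps the dispatch loop inside archive_link uniform. A simplified, self-contained sketch of that loop shape; the toy title method and the SimpleNamespace link below are illustrative, not the real ArchiveBox implementations:

from types import SimpleNamespace
from typing import Callable, Optional, Tuple

def should_fetch_title(link, link_dir: Optional[str]=None) -> bool:
    # skip links that already have a usable title
    return not getattr(link, 'title', None)

def fetch_title(link, link_dir: Optional[str]=None, timeout: int=60) -> dict:
    # placeholder result; the real method shells out and returns an ArchiveResult
    return {'status': 'succeeded', 'output': link.url}

ARCHIVE_METHODS: Tuple[Tuple[str, Callable, Callable], ...] = (
    ('title', should_fetch_title, fetch_title),
)

def archive_link(link, link_dir: Optional[str]=None):
    # derive the folder from the link unless the caller overrides it
    link_dir = link_dir or link.link_dir
    for method_name, should_run, method_function in ARCHIVE_METHODS:
        link.history.setdefault(method_name, [])
        if should_run(link, link_dir):
            result = method_function(link, link_dir)
            link.history[method_name].append(result)
    return link

link = SimpleNamespace(url='https://example.com', title=None,
                       link_dir='/tmp/demo', history={})
archive_link(link)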

View file

@@ -39,23 +39,25 @@ from .logs import (
TITLE_LOADING_MSG = 'Not yet archived...'
### Homepage index for all the links
@enforce_types
def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""create index.html file for a given list of links"""
log_indexing_process_started()
log_indexing_started(out_dir, 'index.json')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
write_json_links_index(out_dir, links)
write_json_links_index(links, out_dir=out_dir)
timer.end()
log_indexing_finished(out_dir, 'index.json')
log_indexing_started(out_dir, 'index.html')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
write_html_links_index(out_dir, links, finished=finished)
write_html_links_index(links, out_dir=out_dir, finished=finished)
timer.end()
log_indexing_finished(out_dir, 'index.html')
@@ -87,7 +89,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -
@enforce_types
def write_json_links_index(out_dir: str, links: List[Link]) -> None:
def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
"""write the json link index to a given path"""
assert isinstance(links, List), 'Links must be a list, not a generator.'
@@ -199,7 +201,6 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
successful = link.num_outputs
# Patch JSON index
changed = False
json_file_links = parse_json_links_index(out_dir)
patched_links = []
for saved_link in json_file_links:
@@ -212,7 +213,7 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
else:
patched_links.append(saved_link)
write_json_links_index(out_dir, patched_links)
write_json_links_index(patched_links, out_dir=out_dir)
# Patch HTML index
html_path = os.path.join(out_dir, 'index.html')
@@ -231,27 +232,27 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
### Individual link index
@enforce_types
def write_link_index(out_dir: str, link: Link) -> None:
write_json_link_index(out_dir, link)
write_html_link_index(out_dir, link)
def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
write_json_link_index(link, link_dir)
write_html_link_index(link, link_dir)
@enforce_types
def write_json_link_index(out_dir: str, link: Link) -> None:
def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
path = os.path.join(out_dir, 'index.json')
with open(path, 'w', encoding='utf-8') as f:
json.dump(link._asdict(), f, indent=4, cls=ExtendedEncoder)
link_dir = link_dir or link.link_dir
path = os.path.join(link_dir, 'index.json')
chmod_file(path)
@enforce_types
def parse_json_link_index(out_dir: str) -> Optional[Link]:
def parse_json_link_index(link_dir: str) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, 'index.json')
existing_index = os.path.join(link_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
@@ -260,18 +261,21 @@ def parse_json_link_index(out_dir: str) -> Optional[Link]:
@enforce_types
def load_json_link_index(out_dir: str, link: Link) -> Link:
def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
existing_link = parse_json_link_index(out_dir)
link_dir = link_dir or link.link_dir
existing_link = parse_json_link_index(link_dir)
if existing_link:
return merge_links(existing_link, link)
return link
@enforce_types
def write_html_link_index(out_dir: str, link: Link) -> None:
def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
link_html = f.read()
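
The per-link index helpers use the same fallback idiom: the directory parameter is optional and defaults to link.link_dir, so most call sites can simply drop it. A stripped-down sketch of that idiom (plain dicts stand in for the real Link serialization and ExtendedEncoder):

import json
import os
from types import SimpleNamespace
from typing import Optional

def write_json_link_index(link, link_dir: Optional[str]=None) -> None:
    # simplified sketch: write a json file with some info about the link
    link_dir = link_dir or link.link_dir
    os.makedirs(link_dir, exist_ok=True)
    path = os.path.join(link_dir, 'index.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({'url': link.url, 'timestamp': link.timestamp}, f, indent=4)

link = SimpleNamespace(url='https://example.com', timestamp='1553725470',
                       link_dir='/tmp/demo/archive/1553725470')
write_json_link_index(link)            # writes to link.link_dir
write_json_link_index(link, '/tmp/x')  # explicit directory override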

View file

@@ -6,7 +6,7 @@ from dataclasses import dataclass
from typing import Optional
from .schema import Link, ArchiveResult
from .config import ANSI, REPO_DIR, OUTPUT_DIR
from .config import ANSI, OUTPUT_DIR
@dataclass
@@ -17,14 +17,14 @@ class RuntimeStats:
succeeded: int = 0
failed: int = 0
parse_start_ts: datetime = None
parse_end_ts: datetime = None
parse_start_ts: Optional[datetime] = None
parse_end_ts: Optional[datetime] = None
index_start_ts: datetime = None
index_end_ts: datetime = None
index_start_ts: Optional[datetime] = None
index_end_ts: Optional[datetime] = None
archiving_start_ts: datetime = None
archiving_end_ts: datetime = None
archiving_start_ts: Optional[datetime] = None
archiving_end_ts: Optional[datetime] = None
# globals are bad, mmkay
_LAST_RUN_STATS = RuntimeStats()
@@ -131,7 +131,7 @@ def log_archiving_finished(num_links: int):
print(' {}/index.html'.format(OUTPUT_DIR))
def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
def log_link_archiving_started(link: Link, link_dir: str, is_new: bool):
# [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
# > output/archive/1478739709
@@ -149,7 +149,7 @@ def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
pretty_path(link_dir),
))
def log_link_archiving_finished(link_dir: str, link: Link, is_new: bool, stats: dict):
def log_link_archiving_finished(link: Link, link_dir: str, is_new: bool, stats: dict):
total = sum(stats.values())
if stats['failed'] > 0 :
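
The RuntimeStats edit above is effectively a typing fix rather than a behavior change: a dataclass field annotated as datetime but defaulted to None is an implicit Optional, which mypy and similar checkers flag as an incompatible assignment. A minimal sketch of the corrected form:

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

@dataclass
class RuntimeStats:
    failed: int = 0
    # before: parse_start_ts: datetime = None   <- checkers reject None as a datetime
    parse_start_ts: Optional[datetime] = None   # after: the None default is explicit
    parse_end_ts: Optional[datetime] = None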

View file

@@ -1,11 +1,12 @@
import os
import re
import sys
import json
import time
import shutil
from json import JSONEncoder
from typing import List, Optional, Any
from typing import List, Optional, Any, Union
from inspect import signature, _empty
from functools import wraps
from hashlib import sha256