
save all imports to sources dir

Nick Sweeting 2019-03-27 20:48:41 -04:00
parent cc3d1e9cc9
commit 4c8e45b8d7
3 changed files with 36 additions and 33 deletions

Changed file 1 of 3:

@@ -43,8 +43,8 @@ from .config import (
 )
 from .util import (
     enforce_types,
-    save_remote_source,
-    save_stdin_source,
+    handle_stdin_import,
+    handle_file_import,
 )
 from .logs import (
     log_archiving_started,
@@ -160,12 +160,12 @@ def main(args=None) -> None:
             print_help()
             raise SystemExit(1)
-        import_path = save_stdin_source(stdin_raw_text)
+        import_path = handle_stdin_import(stdin_raw_text)

-    ### Handle ingesting urls from a remote file/feed
+    ### Handle ingesting url from a remote file/feed
     # (e.g. if an RSS feed URL is used as the import path)
-    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        import_path = save_remote_source(import_path)
+    if import_path:
+        import_path = handle_file_import(import_path)

     ### Run the main archive update process
     update_archive_data(import_path=import_path, resume=resume)
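
With this change, main() routes every kind of input through the sources dir before archiving: piped stdin is persisted by handle_stdin_import, and any remaining import path, remote URL or local file alike, is normalized by handle_file_import. A minimal sketch of the resulting flow; the wrapper function below is hypothetical, only the two handler calls are verbatim from the diff:

# Sketch of the post-commit ingestion flow (assumed shape of the
# surrounding main() code, which is not shown in the hunk).
def ingest(stdin_raw_text: str = '', import_path: str = '') -> str:
    if stdin_raw_text:
        # piped text is written to output/sources/stdin-<timestamp>.txt
        import_path = handle_stdin_import(stdin_raw_text)
    if import_path:
        # URLs get downloaded, local files get copied; both end up in sources/
        import_path = handle_file_import(import_path)
    return import_path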

Changed file 2 of 3:

@@ -90,7 +90,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
         if is_new:
             os.makedirs(link_dir)

-        link = load_json_link_index(link, link_dir)
+        link = load_json_link_index(link, link_dir=link_dir)
         log_link_archiving_started(link, link_dir, is_new)
         link = link.overwrite(updated=datetime.now())
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -103,7 +103,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
             if should_run(link, link_dir):
                 log_archive_method_started(method_name)

-                result = method_function(link, link_dir)
+                result = method_function(link=link, link_dir=link_dir)

                 link.history[method_name].append(result)
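
Passing link= and link_dir= by keyword hardens the generic dispatch call: every archive method is invoked through this one line, so positional arguments would silently break if any method's signature reordered its parameters. A sketch of the dispatch loop this hunk sits in, assuming the (name, predicate, function) tuple shape implied by the loop variables; the real ARCHIVE_METHODS table is outside the hunk and the entry names below are illustrative:

# Hypothetical dispatch table; entries are illustrative, not from the diff.
ARCHIVE_METHODS = [
    ('title', should_fetch_title, fetch_title),
    ('wget', should_fetch_wget, fetch_wget),
]

for method_name, should_run, method_function in ARCHIVE_METHODS:
    if should_run(link, link_dir):
        # keyword args bind each value to its parameter by name,
        # regardless of the order a given method declares them in
        result = method_function(link=link, link_dir=link_dir)
        link.history[method_name].append(result)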

Changed file 3 of 3:

@@ -187,7 +187,7 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers

 @enforce_types
-def save_stdin_source(raw_text: str) -> str:
+def handle_stdin_import(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -195,14 +195,12 @@ def save_stdin_source(raw_text: str) -> str:
     source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(raw_text)
+    atomic_write(raw_text, source_path)

     return source_path


 @enforce_types
-def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
+def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""

     if not os.path.exists(SOURCES_DIR):
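
Both helpers now delegate the final write to atomic_write(contents, path) instead of an inline open()/write(), so a crash mid-write can't leave a truncated source file behind. The diff doesn't show atomic_write's body; a minimal sketch of such a helper, assuming the contents-then-path argument order of the call above, could look like:

import os
import tempfile

def atomic_write(contents: str, path: str) -> None:
    """Sketch: write to a temp file, then atomically rename over the target."""
    # stage the write in a temp file in the destination directory,
    # so the final rename never crosses a filesystem boundary
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(contents)
        # os.replace() is an atomic swap on POSIX: readers see either the
        # old complete file or the new complete file, never a partial one
        os.replace(tmp_path, path)
    except BaseException:
        os.remove(tmp_path)
        raise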
@@ -210,30 +210,35 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))

-    print('{}[*] [{}] Downloading {}{}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        url,
-        ANSI['reset'],
-    ))
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        downloaded_xml = download_url(url, timeout=timeout)
-        timer.end()
-    except Exception as e:
-        timer.end()
-        print('{}[!] Failed to download {}{}\n'.format(
-            ANSI['red'],
-            url,
-            ANSI['reset'],
-        ))
-        print('    ', e)
-        raise SystemExit(1)
+    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        print('{}[*] [{}] Downloading {}{}'.format(
+            ANSI['green'],
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            path,
+            ANSI['reset'],
+        ))
+        timer = TimedProgress(timeout, prefix='      ')
+        try:
+            raw_source_text = download_url(path, timeout=timeout)
+            timer.end()
+        except Exception as e:
+            timer.end()
+            print('{}[!] Failed to download {}{}\n'.format(
+                ANSI['red'],
+                path,
+                ANSI['reset'],
+            ))
+            print('    ', e)
+            raise SystemExit(1)
+    else:
+        with open(path, 'r') as f:
+            raw_source_text = f.read()

-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(downloaded_xml)
+    atomic_write(raw_source_text, source_path)

     print('    > {}'.format(pretty_path(source_path)))
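
After this rework, the remote/local distinction is an internal detail of handle_file_import, which is why main() above could drop its own URL-scheme check. Callers just pass whatever the user supplied; the paths and timestamps below are illustrative:

# both calls land a normalized copy in output/sources/ and return its path
handle_file_import('https://example.com/feed.rss')  # downloaded via download_url
# -> output/sources/example.com-1553727000.txt
handle_file_import('bookmarks.html')                # read from the local file
# -> output/sources/bookmarks.html-1553727000.txt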