cleanup ARCHIVE_DIR paths

Nick Sweeting 2018-06-10 21:26:11 -04:00
parent 46ea65d4f2
commit c90f4bfd5b
3 changed files with 8 additions and 5 deletions

File 1 of 3

@@ -27,6 +27,7 @@ from config import (
     CHROME_USER_DATA_DIR,
     TIMEOUT,
     ANSI,
+    ARCHIVE_DIR,
 )
 from util import (
     check_dependencies,
@@ -50,7 +51,7 @@ def archive_links(archive_path, links, source=None, resume=None):
     try:
         for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(archive_path, 'archive', link['timestamp'])
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
             archive_link(link_dir, link)
     except (KeyboardInterrupt, SystemExit, Exception) as e:
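The hunk above replaces the ad-hoc os.path.join(archive_path, 'archive', ...) join with the shared ARCHIVE_DIR constant. A minimal runnable sketch of the post-commit loop, with a hypothetical archive_link stub and made-up links standing in for the real parser output:

import os

# Hypothetical stand-ins; the real values come from config.py (see file 2 of 3).
OUTPUT_DIR = os.path.join(os.getcwd(), 'output')
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')

def archive_link(link_dir, link):
    # Stub: the real function fetches the page and writes its outputs here.
    os.makedirs(link_dir, exist_ok=True)
    print('archiving {} -> {}'.format(link['url'], link_dir))

links = [
    {'timestamp': '1528684800', 'url': 'https://example.com'},
    {'timestamp': '1528684801', 'url': 'https://example.org'},
]

for idx, link in enumerate(links):
    # Post-commit: every link folder is derived from the one ARCHIVE_DIR constant.
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    archive_link(link_dir, link)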

File 2 of 3

@@ -36,6 +36,7 @@ FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted
 REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
 OUTPUT_DIR = os.path.join(REPO_DIR, 'output')
+ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')
 SOURCES_DIR = os.path.join(OUTPUT_DIR, 'sources')
 PYTHON_PATH = os.path.join(REPO_DIR, 'archiver')
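The point of the new constant is that the 'archive' path segment is now joined in exactly one place. A small sanity check (the install path is hypothetical) showing that the old repeated join and the new constant produce identical paths:

import os

OUTPUT_DIR = '/srv/bookmark-archiver/output'        # hypothetical install path
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')   # as defined in the hunk above

folder = '1528684800'
old_style = os.path.join(OUTPUT_DIR, 'archive', folder)  # pre-commit, repeated at each call site
new_style = os.path.join(ARCHIVE_DIR, folder)            # post-commit, single constant
assert old_style == new_style == '/srv/bookmark-archiver/output/archive/1528684800'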

File 3 of 3

@@ -16,6 +16,7 @@ from config import (
     REPO_DIR,
     SOURCES_DIR,
     OUTPUT_DIR,
+    ARCHIVE_DIR,
     TIMEOUT,
     TERM_WIDTH,
     SHOW_PROGRESS,
@@ -262,7 +263,7 @@ def find_link(folder, links):
     timestamp = folder.split('.')[0]
     for link in links:
         if link['timestamp'].startswith(timestamp):
-            if link['domain'] in os.listdir(os.path.join(OUTPUT_DIR, 'archive', folder)):
+            if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
                 return link  # careful now, this isn't safe for most ppl
         if link['domain'] in parse_url(folder):
             return link
@@ -271,7 +272,7 @@ def find_link(folder, links):
 def parse_url(folder):
     """for a given archive folder, figure out what url it's for"""
-    link_json = os.path.join(OUTPUT_DIR, 'archive', folder, 'index.json')
+    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
     if os.path.exists(link_json):
         with open(link_json, 'r') as f:
             try:
@@ -282,7 +283,7 @@ def parse_url(folder):
             except ValueError:
                 print('File contains invalid JSON: {}!'.format(link_json))
-    archive_org_txt = os.path.join(OUTPUT_DIR, 'archive', folder, 'archive.org.txt')
+    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
     if os.path.exists(archive_org_txt):
         with open(archive_org_txt, 'r') as f:
             original_link = f.read().strip().split('/http', 1)[-1]
@@ -417,7 +418,7 @@ def wget_output_path(link, look_in=None):
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
     wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
-    look_in = os.path.join(OUTPUT_DIR, 'archive', link['timestamp'], *wget_folder)
+    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
     if look_in and os.path.exists(look_in):
         html_files = [
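parse_url resolves an archive folder back to its URL by trying the per-folder index.json first and the archive.org.txt marker second. A self-contained sketch of that lookup order; the 'url' key, the 'http' re-prefixing, and the demo data are assumptions, since the hunks above only show the path handling:

import json
import os
import tempfile

ARCHIVE_DIR = tempfile.mkdtemp()  # hypothetical stand-in for output/archive

def parse_url(folder):
    """for a given archive folder, figure out what url it's for"""
    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
    if os.path.exists(link_json):
        with open(link_json, 'r') as f:
            try:
                return json.load(f)['url']  # assumed key; the hunk elides the return
            except ValueError:
                print('File contains invalid JSON: {}!'.format(link_json))

    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
    if os.path.exists(archive_org_txt):
        with open(archive_org_txt, 'r') as f:
            # archive.org URLs look like .../web/<ts>/https://<original>, so the
            # original link is everything after the first '/http' (assumed re-prefix)
            return 'http' + f.read().strip().split('/http', 1)[-1]

    return ''

# demo: write an index.json and resolve the folder back to its URL
folder = '1528684800'
os.makedirs(os.path.join(ARCHIVE_DIR, folder))
with open(os.path.join(ARCHIVE_DIR, folder, 'index.json'), 'w') as f:
    json.dump({'url': 'https://example.com'}, f)
print(parse_url(folder))  # -> https://example.com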