# ArchiveBox/archivebox/plugins/gallerydl/extractors.py
import shlex
from pathlib import Path

# ArchiveBox helpers (import locations assumed; adjust to the actual module layout)
from archivebox.system import run, copy_and_overwrite
from archivebox.util import TimedProgress


# browsertrix extractor
def save_browsertrix(link, out_dir, timeout, config):
    """Crawl link.url with browsertrix-crawler and save the resulting WACZ into out_dir/browsertrix/."""
    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)
    crawl_id = link.timestamp
    # browsertrix-crawler CLI invocation (executed inside the browsertrix container)
    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]
    # shell commands piped into the browsertrix container: clear out any previous
    # crawl state, then start the crawl under this snapshot's CRAWL_ID
    remote_cmd = """
        rm /tmp/dump.rdb;
        rm -rf /crawls/collections;
        mkdir /crawls/collections;
        env CRAWL_ID={crawl_id} {crawl_cmd}
    """.format(
        crawl_id=crawl_id,
        crawl_cmd=' '.join(shlex.quote(str(arg)) for arg in browsertrix_crawler_cmd),
    )

    # the commands are sent to the browsertrix container over a raw netcat socket on port 2222
    local_cmd = ['nc', 'browsertrix', '2222']
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # pipe the remote commands into the container and wait for the crawl to finish
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)
        cmd_output = result.stdout.decode()

        # copy the finished WACZ archive out of the shared crawls volume into the snapshot dir
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'
        copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
    except Exception as err:
        status = 'failed'
        cmd_output = str(err)
    finally:
        timer.end()


# Equivalent manual invocation inside the browsertrix container, for reference:
#   rm /tmp/dump.rdb;
#   rm -rf /crawls/collections;
#   mkdir /crawls/collections;
#   env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
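

# --- Hypothetical usage sketch (illustration only, not ArchiveBox's real call site) ---
# A minimal example of how save_browsertrix() could be exercised by hand, assuming a
# Link-like object exposing .url and .timestamp attributes. The SimpleNamespace stand-in,
# the literal timestamp, and the output directory below are assumptions for the example;
# it also presumes a browsertrix container reachable at 'browsertrix:2222'.
if __name__ == '__main__':
    from types import SimpleNamespace

    example_link = SimpleNamespace(url='https://example.com', timestamp='tec2342')
    example_out_dir = Path.cwd() / 'archive' / example_link.timestamp
    example_out_dir.mkdir(parents=True, exist_ok=True)

    save_browsertrix(example_link, out_dir=example_out_dir, timeout=120, config={})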