1
0
Fork 0
mirror of synced 2024-06-26 10:00:19 +12:00

Properly handle Archive.org requests denied by robots.txt

This commit is contained in:
Nick Sweeting 2017-07-05 16:57:19 -05:00
parent 9e4b97340d
commit b894e0ff92

View file

@ -118,12 +118,21 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
try:
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # archive.org.txt
end()
# Parse archive.org response headers
headers = result.stdout.splitlines()
content_location = [h for h in headers if b'Content-Location: ' in h]
errors = [h for h in headers if b'X-Archive-Wayback-Runtime-Error: ' in h]
if content_location:
archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
saved_url = 'https://web.archive.org{}'.format(archive_path)
success = True
elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
raise ValueError('Archive.org denied by {}/robots.txt'.format(link['domain']))
elif errors:
raise Exception(', '.join(e.decode() for e in errors))
else:
raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
except Exception as e: