1
0
Fork 0
mirror of synced 2024-06-29 03:20:58 +12:00

properly handle Archive.org denied by robots.txt

This commit is contained in:
Nick Sweeting 2017-07-05 16:57:19 -05:00
parent 9e4b97340d
commit b894e0ff92

View file

@@ -118,12 +118,21 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
try: try:
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # archive.org.txt result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # archive.org.txt
end() end()
# Parse archive.org response headers
headers = result.stdout.splitlines() headers = result.stdout.splitlines()
content_location = [h for h in headers if b'Content-Location: ' in h] content_location = [h for h in headers if b'Content-Location: ' in h]
errors = [h for h in headers if b'X-Archive-Wayback-Runtime-Error: ' in h]
if content_location: if content_location:
archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8') archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
saved_url = 'https://web.archive.org{}'.format(archive_path) saved_url = 'https://web.archive.org{}'.format(archive_path)
success = True success = True
elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
raise ValueError('Archive.org denied by {}/robots.txt'.format(link['domain']))
elif errors:
raise Exception(', '.join(e.decode() for e in errors))
else: else:
raise Exception('Failed to find "Content-Location" URL header in Archive.org response.') raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
except Exception as e: except Exception as e: