diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 410e1ea6..603134e5 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -48,6 +48,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=T '--silent', '--location', '--head', + '--compressed', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 0e46ef2c..272272ea 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -37,6 +37,7 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) '--silent', '--max-time', str(timeout), '--location', + '--compressed', '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index e2d7e12e..642c45b7 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -43,18 +43,19 @@ def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool: def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: """try to guess the page's title from its content""" + setup_django(out_dir=out_dir) + from core.models import Snapshot + output: ArchiveOutput = None cmd = [ CURL_BINARY, '--silent', '--max-time', str(timeout), '--location', + '--compressed', *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), link.url, - '|', - 'grep', - '