From eef8ca29f0c359263af9f988001ed127ae25432c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 30 Mar 2019 20:31:28 -0400 Subject: [PATCH 01/71] hide compression detection failure during config setup --- archivebox/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index 0d49a5d2..23a92ebf 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -74,7 +74,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC -WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode) +WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode) URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE) From 924de7f68c315cd55fcfb77257b01f28365f855c Mon Sep 17 00:00:00 2001 From: luoliyan Date: Tue, 2 Apr 2019 13:13:07 +0930 Subject: [PATCH 02/71] Update purge script to match codebase cleanup --- archivebox/purge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/purge.py b/archivebox/purge.py index 26b18817..e2e4e97c 100755 --- a/archivebox/purge.py +++ b/archivebox/purge.py @@ -6,9 +6,9 @@ from os.path import exists, join from shutil import rmtree from typing import List -from archive import parse_json_link_index from config import ARCHIVE_DIR, OUTPUT_DIR -from index import write_html_links_index, write_json_links_index +from index import (parse_json_links_index, write_html_links_index, + write_json_links_index) def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None: @@ -16,7 +16,7 @@ def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None: 
exit('index.json is missing; nothing to do') compiled = [re.compile(r) for r in regexes] - links = parse_json_link_index(OUTPUT_DIR)['links'] + links = parse_json_links_index(OUTPUT_DIR) filtered = [] remaining = [] From 0d2bf610b2ed82c87c78c3655a1f6512551f2ddb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 3 Apr 2019 03:27:15 -0400 Subject: [PATCH 03/71] typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 15358d5d..5c698868 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ I don't think everything should be preserved in an automated fashion, making all #### User Interface & Intended Purpose -ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI inferface for users to ingest built feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. +ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI inferface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now it only ingests lists of links at a time via browser history, bookmarks, RSS, etc. 
From 585a28e7c919c980a3d0e53cdac835ed3748b630 Mon Sep 17 00:00:00 2001 From: Anton Rieder <1301152+aried3r@users.noreply.github.com> Date: Wed, 3 Apr 2019 12:49:32 +0200 Subject: [PATCH 04/71] Small typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c698868..0fc21154 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ echo 'https://example.com' | ./archive # pass URLs to archive v ./archive https://getpocket.com/users/example/feed/all # or import an RSS/JSON/XML/TXT feed ``` -One you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archive.sweeting.me](https://archive.sweeting.me) +Once you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archive.sweeting.me](https://archive.sweeting.me) For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs. 
*(`pip install archivebox` will be available in the near future, follow our [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for progress)* From 403025a73b1d96ebcd2dba8c681c63529a5a4980 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 10 Apr 2019 17:09:54 -0400 Subject: [PATCH 05/71] Update bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b350fb28..66a2d21b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,24 +7,24 @@ assignees: '' --- -(please fill out the following information, feel free to delete sections if they're not applicable) +(please fill out the following information, feel free to delete sections if they're not applicable or if long issue templates annoy you) -## Describe the bug +#### Describe the bug A description of what the bug is, what you expected to happen, and any relevant context about issue. -## Steps to reproduce +#### Steps to reproduce 1. Ran ArchiveBox with the following config '...' 2. Saw this output during archiving '....' 3. UI didn't show the thing I was expecting '....' -## Screenshots or log output +#### Screenshots or log output If applicable, post any relevant screenshots or copy/pasted terminal output from ArchiveBox. If you're reporting a parsing / importing error, **you must paste a copy of your redacted import file here**. -## Software versions +#### Software versions - OS: ([e.g. macOS 10.14] the operating system you're running ArchiveBox on) - ArchiveBox version: (`git rev-parse HEAD | head -c7` [e.g. 
d798117] commit ID of the version you're running) From 4f599c0b0b07c842b1a2d0ec31f229d8fa0d6294 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 10 Apr 2019 22:46:20 -0400 Subject: [PATCH 06/71] escape all non-windows-friendly filenames --- archivebox/archive_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 56009cd1..b2f04f33 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -226,7 +226,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=unix', + '--restrict-file-names=windows', '--timeout={}'.format(timeout), *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), From e9f9c1ec5da2433ef95b23d7526d69458e01ad3c Mon Sep 17 00:00:00 2001 From: Bruno Tavares Date: Thu, 11 Apr 2019 22:43:52 -0300 Subject: [PATCH 07/71] Copy project into image instead of cloning Docker `RUN` statements cache based on the text of the command executed, not the content of what it does to the image. Since the command was cloning the project, and the text didn't change, building the image would not update the code if the image was already cached. This led to a stale Docker image distributed on Docker Hub. This could also cause some confusion, as modified code would not show up on the image during the build process. This commit changes the build process to copy the content of the project into the image. Whenever a file changes it will trigger a new updated image. 
--- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index d5683cad..c53e5c7a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,8 +45,8 @@ RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \ && chown -R pptruser:pptruser /node_modules # Install the ArchiveBox repository and pip requirements -RUN git clone https://github.com/pirate/ArchiveBox /home/pptruser/app \ - && mkdir -p /data \ +COPY . /home/pptruser/app +RUN mkdir -p /data \ && chown -R pptruser:pptruser /data \ && ln -s /data /home/pptruser/app/archivebox/output \ && ln -s /home/pptruser/app/bin/* /bin/ \ From 6401158f7f30f04a15bd070d9a94416a1c621e77 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 12 Apr 2019 13:59:22 -0400 Subject: [PATCH 08/71] comment out IRC links until we find a better chat solution --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0fc21154..435e8b82 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ "Your own personal internet archive" (网站存档 / 爬虫) - + @@ -178,7 +178,7 @@ Because ArchiveBox is designed to ingest a firehose of browser history and bookm ## Learn more -▶ **Join out our [community chat](http://webchat.freenode.net?channels=ArchiveBox&uio=d4) hosted on IRC freenode.net:`#ArchiveBox`!** + Whether you want learn which organizations are the big players in the web archiving space, want to find a specific open source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! @@ -268,7 +268,7 @@ Contributor Spotlight:


- +

From adfcb1517a086d77441ff6b4d9d766a5c8d94d84 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Apr 2019 21:03:51 -0400 Subject: [PATCH 09/71] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 435e8b82..6b36c859 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,9 @@
+ +*Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info...* + **ArchiveBox takes a list of website URLs you want to archive, and creates a local, static, browsable HTML clone of the content from those websites (it saves HTML, JS, media files, PDFs, images and more).** From 24e8eb95ddf1af7040e539503a56c1dc55774bcc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Apr 2019 21:04:18 -0400 Subject: [PATCH 10/71] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b36c859..63fa7f32 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@
-*Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info...* +*💥 Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info... 💥* From 59da48206ad7f64ea9b5a7e869d47a87e5534c3a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Apr 2019 21:21:40 -0400 Subject: [PATCH 11/71] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63fa7f32..1622c393 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@
-*💥 Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info... 💥* +*💥 Attention: Big API changes are coming soon! Check out [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) and help us test it! 💥* From 332a32f4f9b6f548d9a61495ec9008667ca1f5f6 Mon Sep 17 00:00:00 2001 From: Drewry Pope Date: Sat, 20 Apr 2019 02:59:44 -0500 Subject: [PATCH 12/71] Resolve 3 typos in util.py --- archivebox/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index cec23035..3c08c9bb 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -66,7 +66,7 @@ HTML_TITLE_REGEX = re.compile( re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, ) STATICFILE_EXTENSIONS = { - # 99.999% of the time, URLs ending in these extentions are static files + # 99.999% of the time, URLs ending in these extensions are static files # that can be downloaded as-is, not html pages that need to be rendered 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', @@ -82,7 +82,7 @@ STATICFILE_EXTENSIONS = { # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - # Thse are always treated as pages, not as static files, never add them: + # These are always treated as pages, not as static files, never add them: # html, htm, shtml, xhtml, xml, aspx, php, cgi } @@ -293,7 +293,7 @@ def str_between(string, start, end=None): ### Link Helpers def merge_links(a, b): - """deterministially merge two links, favoring longer field values over shorter, + """deterministically merge two links, favoring longer field values over shorter, and "cleaner" values over worse ones. 
""" longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key]) From 34270b2b1239b948d9598b7bb6ea8b31131066b8 Mon Sep 17 00:00:00 2001 From: Pig Monkey Date: Tue, 30 Apr 2019 17:25:41 -0700 Subject: [PATCH 13/71] only use stdin if it has a value Closes #228 --- archivebox/archive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/archive.py b/archivebox/archive.py index 5c0d195d..3e553e6e 100755 --- a/archivebox/archive.py +++ b/archivebox/archive.py @@ -86,8 +86,8 @@ def main(*args): ) print_help() raise SystemExit(1) - - import_path = save_stdin_source(stdin_raw_text) + if stdin_raw_text: + import_path = save_stdin_source(stdin_raw_text) ### Handle ingesting urls from a remote file/feed # (e.g. if an RSS feed URL is used as the import path) From 500534f4be87e94f05d9cf6063babd4faa5145cc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 15:17:16 -0400 Subject: [PATCH 14/71] fix missing comma in staticfile extensions list --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 3c08c9bb..6f63b53f 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -70,7 +70,7 @@ STATICFILE_EXTENSIONS = { # that can be downloaded as-is, not html pages that need to be rendered 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8' + 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'atom', 'rss', 'css', 'js', 'json', 'dmg', 'iso', 'img', From 3b0236b087defc2e73e8f0301c016ce6efbd0b01 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 14 May 2019 23:54:17 +0100 Subject: [PATCH 15/71] Add 
prefers-color-scheme: dark support --- archivebox/templates/index.html | 46 +++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/archivebox/templates/index.html b/archivebox/templates/index.html index 264deb4d..dd2e16cd 100644 --- a/archivebox/templates/index.html +++ b/archivebox/templates/index.html @@ -3,6 +3,34 @@ Archived Sites