From 8e07fe3f2efc1d4a740967ec6ac209d194492b8e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 13:31:57 -0400 Subject: [PATCH 01/21] expose sub-dependency --- package.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 25ad24b1..6d44822e 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,9 @@ "archivebox": "./bin/archive" }, "bin": { - "archivebox": "./bin/archive" + "archivebox-node": "./bin/archive", + "single-file": "single-file", + "readability-extractor": "single-file" }, "dependencies": { "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", From 8b427c9d793c1fe6db154fceb87cf7e6eb5d7649 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:00:00 -0400 Subject: [PATCH 02/21] get VERSION from package.json instead of VERSION to avoid duplication --- MANIFEST.in | 3 ++- archivebox.egg-info/SOURCES.txt | 3 ++- archivebox/VERSION | 1 - archivebox/config/__init__.py | 8 ++++---- bin/release.sh | 26 +++++++++++++------------- package-lock.json | 4 ++-- package.json | 8 ++++---- setup.py | 6 ++++-- 8 files changed, 31 insertions(+), 28 deletions(-) delete mode 100644 archivebox/VERSION diff --git a/MANIFEST.in b/MANIFEST.in index a73ef711..e94f3b11 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include LICENSE include README.md -include archivebox/VERSION +include package.json +include package-lock.json recursive-include archivebox/themes * diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt index ee6a2fc5..9541d8fc 100644 --- a/archivebox.egg-info/SOURCES.txt +++ b/archivebox.egg-info/SOURCES.txt @@ -1,8 +1,9 @@ LICENSE MANIFEST.in README.md +package-lock.json +package.json setup.py -archivebox/VERSION archivebox/__init__.py archivebox/__main__.py archivebox/logging_util.py diff --git a/archivebox/VERSION b/archivebox/VERSION deleted file mode 100644 index 7040b811..00000000 --- a/archivebox/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.4.17 diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 066be01f..2eb60e09 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -4,10 +4,11 @@ import os import io import re import sys -import django +import json import getpass import shutil import platform +import django from hashlib import md5 from pathlib import Path @@ -185,7 +186,6 @@ STATICFILE_EXTENSIONS = { # html, htm, shtml, xhtml, xml, aspx, php, cgi } -VERSION_FILENAME = 'VERSION' PYTHON_DIR_NAME = 'archivebox' TEMPLATES_DIR_NAME = 'themes' @@ -231,10 +231,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else os.path.join(c['OUTPUT_DIR'], CONFIG_FILENAME)}, 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))}, 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)}, - 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'], re.IGNORECASE | re.UNICODE | re.MULTILINE)}, + 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, - 'VERSION': {'default': lambda c: open(os.path.join(c['PYTHON_DIR'], VERSION_FILENAME), 'r').read().strip()}, + 'VERSION': {'default': lambda c: json.loads((Path(c['REPO_DIR']) / 'package.json').read_text().strip())['version']}, 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'}, 'PYTHON_BINARY': {'default': lambda c: sys.executable}, diff --git a/bin/release.sh b/bin/release.sh index 7f5a7db4..bd7f19a7 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -10,29 +10,28 @@ set -o nounset set -o pipefail IFS=$'\n' -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -VERSION_FILE="$DIR/archivebox/VERSION" +REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" function bump_semver { echo "$1" | awk -F. '{$NF = $NF + 1;} 1' | sed 's/ /./g' } -source "$DIR/.venv/bin/activate" -cd "$DIR" +source "$REPO_DIR/.venv/bin/activate" +cd "$REPO_DIR" -OLD_VERSION="$(cat "$VERSION_FILE")" +OLD_VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" NEW_VERSION="$(bump_semver "$OLD_VERSION")" echo "[*] Fetching latest docs version" -cd "$DIR/docs" +cd "$REPO_DIR/docs" git pull -cd "$DIR" +cd "$REPO_DIR" echo "[+] Building docs" sphinx-apidoc -o docs archivebox -cd "$DIR/docs" +cd "$REPO_DIR/docs" make html -cd "$DIR" +cd "$REPO_DIR" if [ -z "$(git status --porcelain)" ] && [[ "$(git branch --show-current)" == "master" ]]; then git pull @@ -43,19 +42,20 @@ else fi echo "[*] Bumping VERSION from $OLD_VERSION to $NEW_VERSION" -echo "$NEW_VERSION" > "$VERSION_FILE" -git add "$DIR/docs" +contents="$(jq ".version = \"$NEW_VERSION\"" "$REPO_DIR/package.json")" && \ +echo "${contents}" > package.json +git add "$REPO_DIR/docs" git add "$VERSION_FILE" echo "[*] Cleaning up build dirs" -cd "$DIR" +cd "$REPO_DIR" rm -Rf build dist echo "[+] Building sdist and bdist_wheel" python3 setup.py sdist bdist_wheel echo "[^] Pushing source to github" -git add "$DIR/archivebox.egg-info" +git add "$REPO_DIR/archivebox.egg-info" git commit -m "$NEW_VERSION release" git tag -a "v$NEW_VERSION" -m "v$NEW_VERSION" git push origin master diff --git a/package-lock.json b/package-lock.json index 221be8d9..dfc101a1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -902,7 +902,7 @@ "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" }, "readability-extractor": { - "version": "git+https://github.com/pirate/readability-extractor.git#afa6a5bb8473f629ee3f1e0dcbf093b73d4eff40", + "version": "git+https://github.com/pirate/readability-extractor.git#0098f142b0a015c8c90766d3b74d9eb6fb7b7e6a", "from": "git+https://github.com/pirate/readability-extractor.git", "requires": { "@mozilla/readability": "^0.3.0", @@ -1054,7 +1054,7 @@ "integrity": "sha1-SysbJ+uAip+NzEgaWOXlb1mfP2E=" }, "single-file": { - "version": "git+https://github.com/gildas-lormeau/SingleFile.git#27c1ba673979f593b3c2c6cd353634bf869743f9", + "version": "git+https://github.com/gildas-lormeau/SingleFile.git#e2e15381a6cbb9c3a6ca0ea8ff7307174e98ad12", "from": "git+https://github.com/gildas-lormeau/SingleFile.git", "requires": { "file-url": "^3.0.0", diff --git a/package.json b/package.json index 6d44822e..9b031470 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.14", + "version": "0.4.17", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", @@ -8,9 +8,9 @@ "archivebox": "./bin/archive" }, "bin": { - "archivebox-node": "./bin/archive", - "single-file": "single-file", - "readability-extractor": "single-file" + "archivebox-node": "./bin/archive", + "single-file": "./node_modules/.bin/single-file", + "readability-extractor": "./node_modules/.bin/single-file" }, "dependencies": { "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", diff --git a/setup.py b/setup.py index e4794f00..2871df75 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,6 @@ +import json import setuptools + from pathlib import Path PKG_NAME = "archivebox" @@ -6,13 +8,13 @@ REPO_URL = "https://github.com/pirate/ArchiveBox" BASE_DIR = Path(__file__).parent.resolve() SOURCE_DIR = BASE_DIR / PKG_NAME README = (BASE_DIR / "README.md").read_text() -VERSION = (SOURCE_DIR / "VERSION").read_text().strip() +VERSION = json.loads((BASE_DIR / "package.json").read_text().strip())['version'] # To see when setup.py gets called (uncomment for debugging) # import sys # print(SOURCE_DIR, f" (v{VERSION})") # print('>', sys.executable, *sys.argv) -# raise SystemExit(0) + setuptools.setup( name=PKG_NAME, From 71788cfd26c48f7d0cc02c8b18e4fcac8f919892 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:00:12 -0400 Subject: [PATCH 03/21] single-file supports version cli flag now --- archivebox/config/__init__.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 2eb60e09..04da2632 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -508,16 +508,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]: return None try: - if binary.split('/')[-1] in ('single-file',): - # these dependencies dont support the --version flag, but are valid still - if run([abspath, "--help"], stdout=PIPE).returncode == 0: - return '0.0.0' - else: - return None - else: - version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode() - # take first 3 columns of first line of version info - return ' '.join(version_str.split('\n')[0].strip().split()[:3]) + version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode() + # take first 3 columns of first line of version info + return ' '.join(version_str.split('\n')[0].strip().split()[:3]) except OSError: pass # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red') From 4673f837494876f45fbabc011e966ba720324d44 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:00:32 -0400 Subject: [PATCH 04/21] bump docs --- docs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs b/docs index 101aec0b..4a7052eb 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit 101aec0bc1e98c1f7b1a42281a686a098ef9cdde +Subproject commit 4a7052eb5000f179ece678c0e98eea3cb581c079 From 80863ad05077ea2a92e01a7c871daf6fecf0255e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:02:36 -0400 Subject: [PATCH 05/21] fix release script semver bump --- bin/release.sh | 3 ++- package.json | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/bin/release.sh b/bin/release.sh index bd7f19a7..5d9d87d4 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -45,7 +45,8 @@ echo "[*] Bumping VERSION from $OLD_VERSION to $NEW_VERSION" contents="$(jq ".version = \"$NEW_VERSION\"" "$REPO_DIR/package.json")" && \ echo "${contents}" > package.json git add "$REPO_DIR/docs" -git add "$VERSION_FILE" +git add "$REPO_DIR/package.json" +git add "$REPO_DIR/package-lock.json" echo "[*] Cleaning up build dirs" cd "$REPO_DIR" diff --git a/package.json b/package.json index 9b031470..cf933e83 100644 --- a/package.json +++ b/package.json @@ -1,19 +1,19 @@ { - "name": "archivebox", - "version": "0.4.17", - "description": "ArchiveBox: The self-hosted internet archive", - "author": "Nick Sweeting ", - "license": "MIT", - "scripts": { - "archivebox": "./bin/archive" - }, - "bin": { - "archivebox-node": "./bin/archive", - "single-file": "./node_modules/.bin/single-file", - "readability-extractor": "./node_modules/.bin/single-file" - }, - "dependencies": { - "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", - "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git" - } + "name": "archivebox", + "version": "0.4.17", + "description": "ArchiveBox: The self-hosted internet archive", + "author": "Nick Sweeting ", + "license": "MIT", + "scripts": { + "archivebox": "./bin/archive" + }, + "bin": { + "archivebox-node": "./bin/archive", + "single-file": "./node_modules/.bin/single-file", + "readability-extractor": "./node_modules/.bin/single-file" + }, + "dependencies": { + "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", + "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git" + } } From 9d7541ba4723c6fd230d046f58980e23967660bb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:02:53 -0400 Subject: [PATCH 06/21] 0.4.18 release --- archivebox.egg-info/PKG-INFO | 2 +- package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index ba659878..8415a466 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.17 +Version: 0.4.18 Summary: The self-hosted internet archive. Home-page: https://github.com/pirate/ArchiveBox Author: Nick Sweeting diff --git a/package.json b/package.json index cf933e83..034e5bcc 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.17", + "version": "0.4.18", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", From 73408fb035b028892a9ccfe038f25ba216be4892 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:58:05 -0400 Subject: [PATCH 07/21] fix version parsing and attempt to npm install during pip post_install --- MANIFEST.in | 8 ++++---- archivebox/cli/__init__.py | 8 ++++---- package-lock.json | 2 +- setup.py | 41 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index e94f3b11..2a60c904 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ -include LICENSE -include README.md -include package.json -include package-lock.json +graft LICENSE +graft README.md +graft package.json +graft package-lock.json recursive-include archivebox/themes * diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 70a6866e..aa26715b 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -104,11 +104,11 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, ) command = parser.parse_args(args or ()) - if command.help or command.subcommand is None: - command.subcommand = 'help' - elif command.version: + if command.version: command.subcommand = 'version' - + elif command.help or command.subcommand is None: + command.subcommand = 'help' + if command.subcommand not in ('help', 'version', 'status'): from ..logging_util import log_cli_command diff --git a/package-lock.json b/package-lock.json index dfc101a1..88baaf08 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.14", + "version": "0.4.18", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/setup.py b/setup.py index 2871df75..10965bfa 100755 --- a/setup.py +++ b/setup.py @@ -1,7 +1,13 @@ +import sys import json import setuptools from pathlib import Path +from subprocess import check_call +from setuptools.command.install import install +from setuptools.command.develop import develop +from setuptools.command.egg_info import egg_info + PKG_NAME = "archivebox" REPO_URL = "https://github.com/pirate/ArchiveBox" @@ -16,6 +22,36 @@ VERSION = json.loads((BASE_DIR / "package.json").read_text().strip())['version'] # print('>', sys.executable, *sys.argv) +def setup_js(): + if sys.platform.lower() not in ('darwin', 'linux'): + sys.stderr.write('[!] Warning: ArchiveBox is not supported on this platform.\n') + + sys.stderr.write(f'[+] Installing ArchiveBox npm package (BASE_DIR={BASE_DIR})...\n') + try: + check_call(f'which npm && npm --version && npm install --global "{BASE_DIR}"', shell=True) + sys.stderr.write('[√] Automatically installed npm dependencies.\n') + except Exception as err: + sys.stderr.write(f'[!] Failed to auto-install npm dependencies: {err}\n') + sys.stderr.write(' Install NPM/npm using your system package manager, then run:\n') + sys.stderr.write(' npm install -g "git+https://github.com/pirate/ArchiveBox.git\n') + + +class CustomInstallCommand(install): + def run(self): + super().run() + setup_js() + +class CustomDevelopCommand(develop): + def run(self): + super().run() + setup_js() + +class CustomEggInfoCommand(egg_info): + def run(self): + super().run() + setup_js() + + setuptools.setup( name=PKG_NAME, version=VERSION, @@ -81,6 +117,11 @@ setuptools.setup( ], }, include_package_data=True, + cmdclass={ + 'install': CustomInstallCommand, + 'develop': CustomDevelopCommand, + 'egg_info': CustomEggInfoCommand, + }, classifiers=[ "License :: OSI Approved :: MIT License", "Natural Language :: English", From c0d9bdf52fb4cefaa046ef9a5ed56c63e0eb570a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 15:58:40 -0400 Subject: [PATCH 08/21] 0.4.19 release --- archivebox.egg-info/PKG-INFO | 2 +- package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 8415a466..0357040c 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.18 +Version: 0.4.19 Summary: The self-hosted internet archive. Home-page: https://github.com/pirate/ArchiveBox Author: Nick Sweeting diff --git a/package.json b/package.json index 034e5bcc..4aaa4867 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.18", + "version": "0.4.19", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", From 5ff852bd07af736624309696572206c975dde545 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 17:00:51 -0400 Subject: [PATCH 09/21] add npmignore --- .npmignore | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .npmignore diff --git a/.npmignore b/.npmignore new file mode 100644 index 00000000..45bf83f5 --- /dev/null +++ b/.npmignore @@ -0,0 +1,15 @@ +tests/ +archivebox/ +build/ +dist/ +docs/ +etc/ +.dockerignore +.flake8 +CNAME +docker-compose.yaml +docker-compose.yml +Dockerfile +MANIFEST.in +Pipfile +setup.py From 87b79fe5e3ed840818d563b91de63381e052b9af Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 17:12:49 -0400 Subject: [PATCH 10/21] cleanup sdist and bdist build process --- MANIFEST.in | 9 ++-- archivebox.egg-info/SOURCES.txt | 23 +++------ archivebox.egg-info/top_level.txt | 1 - archivebox/LICENSE | 1 + archivebox/README.md | 1 + archivebox/config/__init__.py | 2 +- archivebox/package.json | 1 + bin/release.sh | 4 +- package-lock.json | 2 +- setup.py | 85 ++++++++++++++++--------------- 10 files changed, 61 insertions(+), 68 deletions(-) create mode 120000 archivebox/LICENSE create mode 120000 archivebox/README.md create mode 120000 archivebox/package.json diff --git a/MANIFEST.in b/MANIFEST.in index 2a60c904..c9ae1535 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ -graft LICENSE -graft README.md -graft package.json -graft package-lock.json -recursive-include archivebox/themes * +graft archivebox +global-exclude .DS_Store +global-exclude __pycache__ +global-exclude *.pyc diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt index 9541d8fc..d186b2fb 100644 --- a/archivebox.egg-info/SOURCES.txt +++ b/archivebox.egg-info/SOURCES.txt @@ -1,14 +1,16 @@ -LICENSE MANIFEST.in README.md -package-lock.json -package.json setup.py +archivebox/.flake8 +archivebox/LICENSE +archivebox/README.md archivebox/__init__.py archivebox/__main__.py archivebox/logging_util.py archivebox/main.py archivebox/manage.py +archivebox/mypy.ini +archivebox/package.json archivebox/system.py archivebox/util.py archivebox.egg-info/PKG-INFO @@ -46,6 +48,7 @@ archivebox/core/urls.py archivebox/core/views.py archivebox/core/welcome_message.py archivebox/core/wsgi.py +archivebox/core/management/commands/archivebox.py archivebox/core/migrations/0001_initial.py archivebox/core/migrations/0002_auto_20200625_1521.py archivebox/core/migrations/0003_auto_20200630_1034.py @@ -111,16 +114,4 @@ archivebox/themes/legacy/static/jquery.min.js archivebox/themes/legacy/static/sort_asc.png archivebox/themes/legacy/static/sort_both.png archivebox/themes/legacy/static/sort_desc.png -archivebox/themes/legacy/static/spinner.gif -tests/__init__.py -tests/conftest.py -tests/fixtures.py -tests/test_args.py -tests/test_extractors.py -tests/test_init.py -tests/test_oneshot.py -tests/test_remove.py -tests/test_title.py -tests/test_util.py -tests/mock_server/__init__.py -tests/mock_server/server.py \ No newline at end of file +archivebox/themes/legacy/static/spinner.gif \ No newline at end of file diff --git a/archivebox.egg-info/top_level.txt b/archivebox.egg-info/top_level.txt index 9c6a6349..74056b65 100644 --- a/archivebox.egg-info/top_level.txt +++ b/archivebox.egg-info/top_level.txt @@ -1,2 +1 @@ archivebox -tests diff --git a/archivebox/LICENSE b/archivebox/LICENSE new file mode 120000 index 00000000..ea5b6064 --- /dev/null +++ b/archivebox/LICENSE @@ -0,0 +1 @@ +../LICENSE \ No newline at end of file diff --git a/archivebox/README.md b/archivebox/README.md new file mode 120000 index 00000000..32d46ee8 --- /dev/null +++ b/archivebox/README.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 04da2632..b1130c34 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -234,7 +234,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]}, - 'VERSION': {'default': lambda c: json.loads((Path(c['REPO_DIR']) / 'package.json').read_text().strip())['version']}, + 'VERSION': {'default': lambda c: json.loads((Path(c['PYTHON_DIR']) / 'package.json').read_text().strip())['version']}, 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'}, 'PYTHON_BINARY': {'default': lambda c: sys.executable}, diff --git a/archivebox/package.json b/archivebox/package.json new file mode 120000 index 00000000..4e26811d --- /dev/null +++ b/archivebox/package.json @@ -0,0 +1 @@ +../package.json \ No newline at end of file diff --git a/bin/release.sh b/bin/release.sh index 5d9d87d4..16cde4d3 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -50,10 +50,10 @@ git add "$REPO_DIR/package-lock.json" echo "[*] Cleaning up build dirs" cd "$REPO_DIR" -rm -Rf build dist +rm -Rf build dist archivebox.egg-info echo "[+] Building sdist and bdist_wheel" -python3 setup.py sdist bdist_wheel +python3 setup.py sdist bdist_egg bdist_wheel echo "[^] Pushing source to github" git add "$REPO_DIR/archivebox.egg-info" diff --git a/package-lock.json b/package-lock.json index 88baaf08..f1483913 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.18", + "version": "0.4.19", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/setup.py b/setup.py index 10965bfa..db83e9bf 100755 --- a/setup.py +++ b/setup.py @@ -1,56 +1,57 @@ -import sys +# import sys import json import setuptools from pathlib import Path -from subprocess import check_call -from setuptools.command.install import install -from setuptools.command.develop import develop -from setuptools.command.egg_info import egg_info +# from subprocess import check_call +# from setuptools.command.install import install +# from setuptools.command.develop import develop +# from setuptools.command.egg_info import egg_info PKG_NAME = "archivebox" REPO_URL = "https://github.com/pirate/ArchiveBox" -BASE_DIR = Path(__file__).parent.resolve() -SOURCE_DIR = BASE_DIR / PKG_NAME -README = (BASE_DIR / "README.md").read_text() -VERSION = json.loads((BASE_DIR / "package.json").read_text().strip())['version'] +REPO_DIR = Path(__file__).parent.resolve() +PYTHON_DIR = REPO_DIR / PKG_NAME +README = (PYTHON_DIR / "README.md").read_text() +VERSION = json.loads((PYTHON_DIR / "package.json").read_text().strip())['version'] + +# To see when setup.py gets called (uncomment for debugging): -# To see when setup.py gets called (uncomment for debugging) # import sys -# print(SOURCE_DIR, f" (v{VERSION})") +# print(PYTHON_DIR, f" (v{VERSION})") # print('>', sys.executable, *sys.argv) +# Sketchy way to install npm dependencies as a pip post-install script -def setup_js(): - if sys.platform.lower() not in ('darwin', 'linux'): - sys.stderr.write('[!] Warning: ArchiveBox is not supported on this platform.\n') +# def setup_js(): +# if sys.platform.lower() not in ('darwin', 'linux'): +# sys.stderr.write('[!] Warning: ArchiveBox is not officially supported on this platform.\n') - sys.stderr.write(f'[+] Installing ArchiveBox npm package (BASE_DIR={BASE_DIR})...\n') - try: - check_call(f'which npm && npm --version && npm install --global "{BASE_DIR}"', shell=True) - sys.stderr.write('[√] Automatically installed npm dependencies.\n') - except Exception as err: - sys.stderr.write(f'[!] Failed to auto-install npm dependencies: {err}\n') - sys.stderr.write(' Install NPM/npm using your system package manager, then run:\n') - sys.stderr.write(' npm install -g "git+https://github.com/pirate/ArchiveBox.git\n') +# sys.stderr.write(f'[+] Installing ArchiveBox npm package (PYTHON_DIR={PYTHON_DIR})...\n') +# try: +# check_call(f'npm install -g "{REPO_DIR}"', shell=True) +# sys.stderr.write('[√] Automatically installed npm dependencies.\n') +# except Exception as err: +# sys.stderr.write(f'[!] Failed to auto-install npm dependencies: {err}\n') +# sys.stderr.write(' Install NPM/npm using your system package manager, then run:\n') +# sys.stderr.write(' npm install -g "git+https://github.com/pirate/ArchiveBox.git\n') -class CustomInstallCommand(install): - def run(self): - super().run() - setup_js() +# class CustomInstallCommand(install): +# def run(self): +# super().run() +# setup_js() -class CustomDevelopCommand(develop): - def run(self): - super().run() - setup_js() - -class CustomEggInfoCommand(egg_info): - def run(self): - super().run() - setup_js() +# class CustomDevelopCommand(develop): +# def run(self): +# super().run() +# setup_js() +# class CustomEggInfoCommand(egg_info): +# def run(self): +# super().run() +# setup_js() setuptools.setup( name=PKG_NAME, @@ -110,18 +111,18 @@ setuptools.setup( # 'redis': ['redis', 'django-redis'], # 'pywb': ['pywb', 'redis'], }, - packages=setuptools.find_packages(), + packages=['archivebox'], + include_package_data=True, # see MANIFEST.in entry_points={ "console_scripts": [ f"{PKG_NAME} = {PKG_NAME}.cli:main", ], }, - include_package_data=True, - cmdclass={ - 'install': CustomInstallCommand, - 'develop': CustomDevelopCommand, - 'egg_info': CustomEggInfoCommand, - }, + # cmdclass={ + # 'install': CustomInstallCommand, + # 'develop': CustomDevelopCommand, + # 'egg_info': CustomEggInfoCommand, + # }, classifiers=[ "License :: OSI Approved :: MIT License", "Natural Language :: English", From 1c0df87f786f7a527a60fe5bda752712baaf8b4d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 17:12:58 -0400 Subject: [PATCH 11/21] add build script --- bin/build.sh | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 bin/build.sh diff --git a/bin/build.sh b/bin/build.sh new file mode 100755 index 00000000..5df6721a --- /dev/null +++ b/bin/build.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +### Bash Environment Setup +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# set -o xtrace +set -o errexit +set -o errtrace +set -o nounset +set -o pipefail +IFS=$'\n' + +REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" + +source "$REPO_DIR/.venv/bin/activate" +cd "$REPO_DIR" + +# echo "[*] Fetching latest docs version" +# cd "$REPO_DIR/docs" +# git pull +# cd "$REPO_DIR" + +# echo "[+] Building docs" +# sphinx-apidoc -o docs archivebox +# cd "$REPO_DIR/docs" +# make html +# cd "$REPO_DIR" + +echo "[*] Cleaning up build dirs" +cd "$REPO_DIR" +rm -Rf build dist archivebox.egg-info + +echo "[+] Building sdist, bdist_egg, and bdist_wheel" +python3 setup.py sdist bdist_egg bdist_wheel + +echo "[+] Building docker image in the background..." +docker build . -t archivebox \ + -t archivebox:latest > /tmp/archivebox_docker_build.log 2>&1 & +ps "$!" + +echo "[√] Done. Install the built package by running:" +echo " python3 setup.py install" +echo " # or" +echo " pip3 install ." From f6484d8ddc8e4934f3d8fd47806cc186621cd6c7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:14:56 -0400 Subject: [PATCH 12/21] add node modules to path --- archivebox/config/__init__.py | 25 ++++++++++++------------- package.json | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index b1130c34..59bb521f 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -525,6 +525,10 @@ def bin_path(binary: Optional[str]) -> Optional[str]: if binary is None: return None + node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary + if node_modules_bin.exists(): + return str(node_modules_bin.resolve()) + return shutil.which(os.path.expanduser(binary)) or binary def bin_hash(binary: Optional[str]) -> Optional[str]: @@ -775,6 +779,10 @@ globals().update(CONFIG) # Timezone set as UTC os.environ["TZ"] = 'UTC' +# add ./node_modules/.bin to $PATH so we can use node scripts in extractors +NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]) / 'node_modules' / '.bin').resolve()) +sys.path.append(NODE_BIN_PATH) + ############################## Importable Checkers ############################# @@ -816,16 +824,6 @@ def check_system_config(config: ConfigDict=CONFIG) -> None: stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0])) raise SystemExit(2) -def print_dependency_additional_info(dependency: str) -> None: - if dependency == "SINGLEFILE_BINARY": - hint(('npm install -g git+https://github.com/gildas-lormeau/SingleFile.git"', - 'or set SAVE_SINGLEFILE=False to silence this warning', - '')) - if dependency == "READABILITY_BINARY": - hint(('npm install -g git+https://github.com/pirate/readability-extractor.git"', - 'or set SAVE_READABILITY=False to silence this warning', - '')) - def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: invalid_dependencies = [ @@ -842,9 +840,10 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: info['version'] or 'unable to detect version', ) ) - print_dependency_additional_info(dependency) - stderr(' {lightred}Hint:{reset} To get more info on dependencies run:'.format(**ANSI)) - stderr(' archivebox --version') + # if dependency in ("SINGLEFILE_BINARY", "READABILITY_BINARY"): + # hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"', + # f'or set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning', + # '')) stderr('') if config['TIMEOUT'] < 5: diff --git a/package.json b/package.json index 4aaa4867..b4c96da1 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,7 @@ "bin": { "archivebox-node": "./bin/archive", "single-file": "./node_modules/.bin/single-file", - "readability-extractor": "./node_modules/.bin/single-file" + "readability-extractor": "./node_modules/.bin/readability-extractor" }, "dependencies": { "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", From 364388424cf7065716c303787d183bf11a717ddd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:15:45 -0400 Subject: [PATCH 13/21] update config stubs --- archivebox/config/stubs.py | 63 +++++++++----------------------------- 1 file changed, 15 insertions(+), 48 deletions(-) diff --git a/archivebox/config/stubs.py b/archivebox/config/stubs.py index 0e984624..542691ae 100644 --- a/archivebox/config/stubs.py +++ b/archivebox/config/stubs.py @@ -31,15 +31,16 @@ class ConfigDict(BaseConfig, total=False): SHOW_PROGRESS: bool IN_DOCKER: bool - OUTPUT_DIR: str - CONFIG_FILE: str + OUTPUT_DIR: Optional[str] + CONFIG_FILE: Optional[str] ONLY_NEW: bool TIMEOUT: int MEDIA_TIMEOUT: int OUTPUT_PERMISSIONS: str - URL_BLACKLIST: Optional[str] + RESTRICT_FILE_NAMES: str + URL_BLACKLIST: str - SECRET_KEY: str + SECRET_KEY: Optional[str] BIND_ADDR: str ALLOWED_HOSTS: str DEBUG: bool @@ -52,10 +53,11 @@ class ConfigDict(BaseConfig, total=False): SAVE_FAVICON: bool SAVE_WGET: bool SAVE_WGET_REQUISITES: bool + SAVE_SINGLEFILE: bool + SAVE_READABILITY: bool SAVE_PDF: bool SAVE_SCREENSHOT: bool SAVE_DOM: bool - SAVE_SINGLEFILE: bool SAVE_WARC: bool SAVE_GIT: bool SAVE_MEDIA: bool @@ -75,53 +77,18 @@ class ConfigDict(BaseConfig, total=False): USE_CURL: bool USE_WGET: bool + USE_SINGLEFILE: bool + USE_READABILITY: bool USE_GIT: bool USE_CHROME: bool USE_YOUTUBEDL: bool - USE_SINGLEFILE: bool - - CURL_BINARY: Optional[str] - GIT_BINARY: Optional[str] - WGET_BINARY: Optional[str] - YOUTUBEDL_BINARY: Optional[str] + CURL_BINARY: str + GIT_BINARY: str + WGET_BINARY: str + SINGLEFILE_BINARY: str + READABILITY_BINARY: str + YOUTUBEDL_BINARY: str CHROME_BINARY: Optional[str] - SINGLEFILE_BINARY: Optional[str] - - TERM_WIDTH: Callable[[], int] - USER: str - ANSI: Dict[str, str] - REPO_DIR: str - PYTHON_DIR: str - TEMPLATES_DIR: str - ARCHIVE_DIR: str - SOURCES_DIR: str - LOGS_DIR: str - - URL_BLACKLIST_PTN: Optional[Pattern] - WGET_AUTO_COMPRESSION: bool - - ARCHIVEBOX_BINARY: str - VERSION: str - GIT_SHA: str - - PYTHON_BINARY: str - PYTHON_ENCODING: str - PYTHON_VERSION: str - - DJANGO_BINARY: str - DJANGO_VERSION: str - - CURL_VERSION: str - WGET_VERSION: str - YOUTUBEDL_VERSION: str - GIT_VERSION: str - CHROME_VERSION: str - - DEPENDENCIES: Dict[str, SimpleConfigValueDict] - CODE_LOCATIONS: Dict[str, SimpleConfigValueDict] - CONFIG_LOCATIONS: Dict[str, SimpleConfigValueDict] - DATA_LOCATIONS: Dict[str, SimpleConfigValueDict] - CHROME_OPTIONS: Dict[str, SimpleConfigValue] ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue] From bd1ebf6fe54529b65af28639f1e00f09dadd5ece Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:18:11 -0400 Subject: [PATCH 14/21] reduce npm package file list --- .npmignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.npmignore b/.npmignore index 45bf83f5..53fae0a8 100644 --- a/.npmignore +++ b/.npmignore @@ -1,12 +1,16 @@ tests/ archivebox/ +archivebox.egg-info/ build/ dist/ docs/ etc/ +.github +.gitmodules .dockerignore .flake8 CNAME +_config.yml docker-compose.yaml docker-compose.yml Dockerfile From 5cb13ff1dd8e16da0f251020499998e1416cbf8e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:20:26 -0400 Subject: [PATCH 15/21] ignore node cruft in output folder during init --- archivebox/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/archivebox/main.py b/archivebox/main.py index b65c6e64..3958405c 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -114,6 +114,8 @@ ALLOWED_IN_OUTPUT_DIR = { 'venv', 'virtualenv', '.virtualenv', + 'node_modules', + 'package-lock.json', ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, From 31a6318582fb375a2d2b698a1312e38b7222fa49 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:32:11 -0400 Subject: [PATCH 16/21] better colors, hints, and progress bars in docker --- README.md | 6 +++--- archivebox/config/__init__.py | 8 ++++---- archivebox/logging_util.py | 19 +++++++++++-------- docker-compose.yml | 2 +- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index c263932e..406cfa38 100644 --- a/README.md +++ b/README.md @@ -62,10 +62,10 @@ To get started, you can [install them manually](https://github.com/pirate/Archiv ```bash # Docker mkdir data && cd data -docker run -v $PWD:/data nikisweeting/archivebox init -docker run -v $PWD:/data nikisweeting/archivebox add 'https://example.com' +docker run -v $PWD:/data -it nikisweeting/archivebox init +docker run -v $PWD:/data -it nikisweeting/archivebox add 'https://example.com' docker run -v $PWD:/data -it nikisweeting/archivebox manage createsuperuser -docker run -v $PWD:/data -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000 +docker run -v $PWD:/data -it -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 ``` diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 59bb521f..f4edcf85 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -840,10 +840,10 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: info['version'] or 'unable to detect version', ) ) - # if dependency in ("SINGLEFILE_BINARY", "READABILITY_BINARY"): - # hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"', - # f'or set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning', - # '')) + if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY'): + hint(('npm install --prefix . "git+https://github.com/piratee/ArchiveBox.git"', + f'or set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning', + ''), prefix=' ') stderr('') if config['TIMEOUT'] < 5: diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 71a50c56..b79557a6 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -99,15 +99,18 @@ class TimedProgress: if self.SHOW_PROGRESS: # terminate if we havent already terminated - self.p.terminate() - self.p.join() - self.p.close() - - # clear whole terminal line try: - sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) - except (IOError, BrokenPipeError): - # ignore when the parent proc has stopped listening to our stdout + self.p.terminate() + self.p.join() + self.p.close() + + # clear whole terminal line + try: + sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) + except (IOError, BrokenPipeError): + # ignore when the parent proc has stopped listening to our stdout + pass + except ValueError: pass diff --git a/docker-compose.yml b/docker-compose.yml index c8733e1b..a209e959 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -31,7 +31,7 @@ services: # host machine, add tasks and see more info with archivebox schedule --help # scheduler: # image: nikisweeting/archivebox:latest - # command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNA Date: Tue, 18 Aug 2020 18:38:14 -0400 Subject: [PATCH 17/21] better colors and hints in version --- archivebox.egg-info/PKG-INFO | 6 +++--- archivebox/config/__init__.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 0357040c..3b2f2114 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -77,10 +77,10 @@ Description:
```bash # Docker mkdir data && cd data - docker run -v $PWD:/data nikisweeting/archivebox init - docker run -v $PWD:/data nikisweeting/archivebox add 'https://example.com' + docker run -v $PWD:/data -it nikisweeting/archivebox init + docker run -v $PWD:/data -it nikisweeting/archivebox add 'https://example.com' docker run -v $PWD:/data -it nikisweeting/archivebox manage createsuperuser - docker run -v $PWD:/data -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000 + docker run -v $PWD:/data -it -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 ``` diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index f4edcf85..079c073f 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -841,8 +841,8 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: ) ) if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY'): - hint(('npm install --prefix . "git+https://github.com/piratee/ArchiveBox.git"', - f'or set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning', + hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"', + f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning', ''), prefix=' ') stderr('') From 7144e0bdceec53d34f192d62697831faadcfa8b5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:40:19 -0400 Subject: [PATCH 18/21] search for node dependencies in output dir first --- archivebox/config/__init__.py | 2 +- archivebox/extractors/readability.py | 4 ++-- archivebox/extractors/singlefile.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 079c073f..fd424c2b 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -780,7 +780,7 @@ globals().update(CONFIG) os.environ["TZ"] = 'UTC' # add ./node_modules/.bin to $PATH so we can use node scripts in extractors -NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]) / 'node_modules' / '.bin').resolve()) +NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin')) sys.path.append(NODE_BIN_PATH) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index f181160d..219402b5 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -17,7 +17,7 @@ from ..util import ( from ..config import ( TIMEOUT, SAVE_READABILITY, - READABILITY_BINARY, + DEPENDENCIES, READABILITY_VERSION, ) from ..logging_util import TimedProgress @@ -73,7 +73,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO temp_doc.close() cmd = [ - READABILITY_BINARY, + DEPENDENCIES['READABILITY_BINARY']['path'], temp_doc.name ] diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 87e7d5fd..702e44a0 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -15,7 +15,7 @@ from ..util import ( from ..config import ( TIMEOUT, SAVE_SINGLEFILE, - SINGLEFILE_BINARY, + DEPENDENCIES, SINGLEFILE_VERSION, CHROME_BINARY, ) @@ -43,7 +43,7 @@ def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli cmd = [ - SINGLEFILE_BINARY, + DEPENDENCIES['SINGLEFILE_BINARY']['path'], '--browser-executable-path={}'.format(CHROME_BINARY), '--browser-args="{}"'.format(json.dumps(browser_args[1:])), link.url, From d803481bd820c734e3102a008e1be6367a84ed3c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:46:21 -0400 Subject: [PATCH 19/21] better folder list column output --- archivebox/index/schema.py | 2 +- archivebox/logging_util.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index c3b6ce8c..d6ab601f 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -127,7 +127,7 @@ class Link: def __str__(self) -> str: - return f'[{self.timestamp}] {self.base_url} "{self.title}"' + return f'[{self.timestamp}] {self.url} "{self.title}"' def __post_init__(self): self.typecheck() diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index b79557a6..b10fc10b 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -469,7 +469,10 @@ def printable_folders(folders: Dict[str, Optional["Link"]], from .index.csv import links_to_csv return links_to_csv(folders.values(), cols=csv.split(','), header=True) - return '\n'.join(f'{folder} {link}' for folder, link in folders.items()) + return '\n'.join( + f'{folder} {link and link.url} "{link and link.title}"' + for folder, link in folders.items() + ) From 20e46bf375bf9e80e8dfca7b3ff271974a5717ca Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:52:08 -0400 Subject: [PATCH 20/21] 0.4.20 release --- archivebox.egg-info/PKG-INFO | 2 +- package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 3b2f2114..a7301407 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.19 +Version: 0.4.20 Summary: The self-hosted internet archive. Home-page: https://github.com/pirate/ArchiveBox Author: Nick Sweeting diff --git a/package.json b/package.json index b4c96da1..6b5c086d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.19", + "version": "0.4.20", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", From 104b6e2a7bad519fb8b23b4f156d674a3c173b6e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 18 Aug 2020 18:58:05 -0400 Subject: [PATCH 21/21] ignore font assets by default --- archivebox/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index fd424c2b..9887bde4 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -59,7 +59,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'}, 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, - 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2)(\?.*)?$'}, # to avoid downloading code assets as their own pages + 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages }, 'SERVER_CONFIG': {