From cc0fa747cedda5eb2f71382d7cadda657ee732c8 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 18 Aug 2020 10:34:28 -0500 Subject: [PATCH 1/4] feat: Add options to ease management of node related extractors --- README.md | 25 +++++++++++++++++-------- archivebox/config/__init__.py | 6 ++++-- package.json | 3 ++- tests/test_extractors.py | 8 ++++++++ 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c263932e..030313ab 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,12 @@

ArchiveBox
The open-source self-hosted web archive.

-▶️ Quickstart | -Demo | -Github | -Documentation | -Info & Motivation | -Community | +▶️ Quickstart | +Demo | +Github | +Documentation | +Info & Motivation | +Community | Roadmap
@@ -22,6 +22,7 @@
 
 
 
+
 
@@ -56,8 +57,8 @@ ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl, ## Quickstart -ArchiveBox is written in `python3.7` and has [3 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, and `youtube-dl`. -To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings. +ArchiveBox is written in `python3.7` and has [4 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, `youtube-dl` and `nodejs`. +To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. These dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings. 
```bash # Docker @@ -82,9 +83,16 @@ open http://127.0.0.1:8000 ```bash # Bare Metal # Use apt on Ubuntu/Debian, brew on mac, or pkg on BSD +# You may need to add a ppa with a more recent version of nodejs apt install python3 python3-pip git curl wget youtube-dl chromium-browser +curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ + && echo 'deb https://deb.nodesource.com/node_14.x buster main' >> /etc/apt/sources.list \ + && apt-get update -qq \ + && apt-get install -qq -y --no-install-recommends nodejs + pip install archivebox # install archivebox +npm run setup mkdir data && cd data # (doesn't have to be called data) archivebox init @@ -97,6 +105,7 @@ archivebox add https://getpocket.com/users/USERNAME/feed/all --depth=1 Once you've added your first links, open `data/index.html` in a browser to view the static archive. You can also start it as a server with a full web UI to manage your links: + ```bash archivebox manage createsuperuser archivebox server diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 066be01f..021939e0 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -112,6 +112,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_READABILITY': {'type': bool, 'default': True}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, + 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, 'CURL_BINARY': {'type': str, 'default': 'curl'}, @@ -275,11 +276,12 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()}, 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, + 'USE_NODE': {'default': lambda c: 
c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])}, 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']}, - 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE']}, - 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY']}, + 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE'] and c['USE_NODE']}, + 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, diff --git a/package.json b/package.json index 25ad24b1..08566ce7 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,8 @@ "author": "Nick Sweeting ", "license": "MIT", "scripts": { - "archivebox": "./bin/archive" + "setup": "node -e \"const {execSync} = require('child_process'); Object.entries(JSON.parse(fs.readFileSync('package.json')).dependencies).forEach(globaldep => execSync('npm i -g ' + globaldep[1]))\"", + "archivebox": "./bin/archive" }, "bin": { "archivebox": "./bin/archive" diff --git a/tests/test_extractors.py b/tests/test_extractors.py index e085d10e..28781569 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -53,3 +53,11 @@ def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict): archived_item_path = list(tmp_path.glob("archive/**/*"))[0] output_file = archived_item_path / "readability" / "content.html" assert output_file.exists() + +def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"}) + add_process = subprocess.run(['archivebox', 'add', 
'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + output_str = add_process.stdout.decode("utf-8") + assert "> singlefile" not in output_str + assert "> readability" not in output_str \ No newline at end of file From 44131f05ee313312f6429e7c837cf6dba9e2f3e1 Mon Sep 17 00:00:00 2001 From: Cristian Vargas Date: Tue, 18 Aug 2020 15:56:55 -0500 Subject: [PATCH 2/4] Update README.md Co-authored-by: Nick Sweeting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 030313ab..8725d9b2 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ open http://127.0.0.1:8000 apt install python3 python3-pip git curl wget youtube-dl chromium-browser curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ - && echo 'deb https://deb.nodesource.com/node_14.x buster main' >> /etc/apt/sources.list \ + && echo "deb https://deb.nodesource.com/node_14.x $(lsb_release -cs) main" >> /etc/apt/sources.list \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends nodejs From 6af08ae9f23ff23cf64ec6b86b8bbf22430d4a60 Mon Sep 17 00:00:00 2001 From: Cristian Vargas Date: Tue, 18 Aug 2020 16:08:03 -0500 Subject: [PATCH 3/4] Install archivebox as an npm package (README.md) Co-authored-by: Nick Sweeting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8725d9b2..13159664 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ && apt-get install -qq -y --no-install-recommends nodejs pip install archivebox # install archivebox -npm run setup +npm install -g 'git+https://github.com/pirate/ArchiveBox.git' mkdir data && cd data # (doesn't have to be called data) archivebox init From 21ae8c8777d329ed83d8e2bdaae9323a042c77a6 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 18 Aug 2020 16:15:58 -0500 Subject: [PATCH 4/4] 
fix: Remove setup script from package.json --- package.json | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/package.json b/package.json index 08566ce7..f511a549 100644 --- a/package.json +++ b/package.json @@ -1,18 +1,17 @@ { - "name": "archivebox", - "version": "0.4.14", - "description": "ArchiveBox: The self-hosted internet archive", - "author": "Nick Sweeting ", - "license": "MIT", - "scripts": { - "setup": "node -e \"const {execSync} = require('child_process'); Object.entries(JSON.parse(fs.readFileSync('package.json')).dependencies).forEach(globaldep => execSync('npm i -g ' + globaldep[1]))\"", - "archivebox": "./bin/archive" - }, - "bin": { - "archivebox": "./bin/archive" - }, - "dependencies": { - "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", - "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git" - } + "name": "archivebox", + "version": "0.4.14", + "description": "ArchiveBox: The self-hosted internet archive", + "author": "Nick Sweeting ", + "license": "MIT", + "scripts": { + "archivebox": "./bin/archive" + }, + "bin": { + "archivebox": "./bin/archive" + }, + "dependencies": { + "readability-extractor": "git+https://github.com/pirate/readability-extractor.git", + "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git" + } }