
Merge pull request #403 from cdvv7788/single-file

Nick Sweeting 2020-08-06 22:43:47 -04:00 committed by GitHub
commit c8e3aed647
16 changed files with 236 additions and 47 deletions

@@ -40,6 +40,22 @@ jobs:
        with:
          fetch-depth: 1
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+          repository: "gildas-lormeau/SingleFile"
+          ref: "master"
+          path: "singlefile"
+      - name: Install npm requirements for singlefile
+        run: npm install --prefix singlefile/cli
+      - name: Give singlefile execution permissions
+        run: chmod +x singlefile/cli/single-file
+      - name: Set SINGLEFILE_BINARY
+        run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file"
      - name: Set up Python ${{ matrix.python }}
        uses: actions/setup-python@v1
        with:
@@ -60,6 +76,14 @@ jobs:
          restore-keys: |
            ${{ runner.os }}-${{ matrix.python }}-venv-
+      - name: Use nodejs 14.7.0
+        uses: actions/setup-node@v1
+        with:
+          node-version: 14.7.0
+      - name: Debug
+        run: ls ./
      - name: Install dependencies
        run: |
          python -m pip install .
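
For context on the ::set-env step: it exports SINGLEFILE_BINARY into the environment of every later step, and ArchiveBox's config layer treats environment variables as overrides for the packaged defaults. A minimal sketch of that pickup (simplified; the real lookup lives in the config module shown below):

    import os

    # the env var wins over the baked-in default, mirroring CONFIG_DEFAULTS below
    SINGLEFILE_BINARY = os.environ.get('SINGLEFILE_BINARY', 'single-file')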

@@ -10,8 +10,8 @@
FROM python:3.8-slim-buster

LABEL name="archivebox" \
-    maintainer="Nick Sweeting <archivebox-git@sweeting.me>" \
-    description="All-in-one personal internet archiving container"
+    maintainer="Nick Sweeting <archivebox-git@sweeting.me>" \
+    description="All-in-one personal internet archiving container"

ENV TZ=UTC \
    LANGUAGE=en_US:en \
@@ -22,28 +22,40 @@ ENV TZ=UTC \
    APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
    CODE_PATH=/app \
    VENV_PATH=/venv \
-    DATA_PATH=/data
+    DATA_PATH=/data \
+    EXTRA_PATH=/extra

-# First install CLI utils and base deps, then Chrome + Fonts
+# First install CLI utils and base deps, then Chrome + Fonts + nodejs
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
    && apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends \
-        apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
-        dumb-init jq git wget curl youtube-dl ffmpeg \
+        apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
+        dumb-init jq git wget curl youtube-dl ffmpeg \
    && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
    && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && curl -sL https://deb.nodesource.com/setup_14.x | bash - \
    && apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends \
-        google-chrome-stable \
-        fontconfig \
-        fonts-ipafont-gothic \
-        fonts-wqy-zenhei \
-        fonts-thai-tlwg \
-        fonts-kacst \
-        fonts-symbola \
-        fonts-noto \
-        fonts-freefont-ttf \
-    && rm -rf /var/lib/apt/lists/*
+        google-chrome-stable \
+        fontconfig \
+        fonts-ipafont-gothic \
+        fonts-wqy-zenhei \
+        fonts-thai-tlwg \
+        fonts-kacst \
+        fonts-symbola \
+        fonts-noto \
+        fonts-freefont-ttf \
+        nodejs \
+        unzip \
+    && rm -rf /var/lib/apt/lists/*

+# Download SingleFile into $EXTRA_PATH and install its CLI deps so archivebox can find it
+WORKDIR "$EXTRA_PATH"
+RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \
+    && unzip -q SingleFile.zip \
+    && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \
+    && chmod +x SingleFile-master/cli/single-file

# Run everything from here on out as non-privileged user
RUN groupadd --system archivebox \
@@ -60,7 +72,8 @@ VOLUME "$DATA_PATH"
WORKDIR "$DATA_PATH"
EXPOSE 8000
ENV CHROME_BINARY=google-chrome \
-    CHROME_SANDBOX=False
+    CHROME_SANDBOX=False \
+    SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file"

RUN env ALLOW_ROOT=True archivebox version

@@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
    'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
    'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
    'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
+   'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
    'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
    'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
    'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
@@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
+       'USE_SINGLEFILE': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},
@@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},
+       'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'},
        'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
        'CHROME_BINARY': {'type': str, 'default': None},
    },
@@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
+   'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
+   'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+   'SAVE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
        'enabled': config['USE_WGET'],
        'is_valid': bool(config['WGET_VERSION']),
    },
+   'SINGLEFILE_BINARY': {
+       'path': bin_path(config['SINGLEFILE_BINARY']),
+       'version': config['SINGLEFILE_VERSION'],
+       'hash': bin_hash(config['SINGLEFILE_BINARY']),
+       'enabled': config['USE_SINGLEFILE'],
+       'is_valid': bool(config['SINGLEFILE_VERSION']),
+   },
    'GIT_BINARY': {
        'path': bin_path(config['GIT_BINARY']),
        'version': config['GIT_VERSION'],
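
A note on the derived defaults above: SINGLEFILE_VERSION doubles as the dependency health check, since is_valid is just bool(SINGLEFILE_VERSION). A standalone sketch of bin_version-style detection (simplified from the config helpers; that the single-file CLI answers --version like the other binaries is an assumption):

    import shutil
    import subprocess
    from typing import Optional

    def bin_version_sketch(binary: str) -> Optional[str]:
        """Return the first line of `<binary> --version`, or None if it can't run."""
        path = shutil.which(binary)  # resolves against PATH, like bin_path()
        if not path:
            return None
        try:
            result = subprocess.run([path, '--version'], capture_output=True, timeout=10)
            return result.stdout.decode().strip().split('\n')[0] or None
        except (subprocess.SubprocessError, OSError):
            return None

    # a None version marks the dependency as invalid in get_dependency_info()
    SINGLEFILE_VERSION = bin_version_sketch('single-file')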

@@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin):
            '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
            '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
            '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
+           '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
            '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
            '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
            '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
@@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin):
            *link_tuple(link, 'screenshot_path'),
            *link_tuple(link, 'dom_path'),
            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+           *link_tuple(link, 'singlefile_path'),
            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

@@ -25,6 +25,7 @@ from ..logging_util import (
from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
+from .singlefile import should_save_singlefile, save_singlefile
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
@@ -37,6 +38,7 @@ def get_default_archive_methods():
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
+       ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
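
Each entry registered above is a (name, should_save_predicate, save_function) triple, and the archiving loop walks the list in order, so singlefile now runs right after wget. A rough sketch of that dispatch pattern (simplified; the real loop also logs progress and records per-method errors):

    def run_archive_methods_sketch(link, out_dir, methods):
        """Run every extractor whose should_save() predicate approves."""
        results = {}
        for name, should_save, save in methods:
            # e.g. should_save_singlefile() returns False once
            # singlefile.html already exists in the snapshot folder
            if should_save(link, out_dir):
                results[name] = save(link, out_dir)
        return results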

@@ -0,0 +1,84 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+    chrome_args,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    output = Path(out_dir) / 'singlefile.html'
+    return SAVE_SINGLEFILE and (not output.exists())
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "singlefile.html")
+
+    browser_args = chrome_args(TIMEOUT=0)
+
+    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
+    cmd = [
+        SINGLEFILE_BINARY,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
+        link.url,
+        output,
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # keep the last few non-empty lines of stdout/stderr to surface as error hints
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if result.returncode > 0:
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
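
To make the command above concrete: for a typical snapshot the cmd list expands to roughly the following (illustrative values only; the real flags come from chrome_args() and local config, and browser_args[1:] skips the chrome binary itself):

    import json

    browser_args = ['--headless', '--window-size=1440,2000']  # illustrative subset
    cmd = [
        'single-file',
        '--browser-executable-path=google-chrome',
        '--browser-args="{}"'.format(json.dumps(browser_args)),
        'https://example.com',
        '/data/archive/1596772800.0/singlefile.html',
    ]

Note that the output name singlefile.html is the same file should_save_singlefile() checks for, so a snapshot is never re-archived by this extractor once it succeeds.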

@@ -365,6 +365,7 @@ class Link:
            'screenshot.png',
            'output.html',
            'media',
+           'singlefile.html'
        )
        return any(
@@ -376,7 +377,7 @@ class Link:
        """get the latest output that each archive method produced for link"""
        ARCHIVE_METHODS = (
-           'title', 'favicon', 'wget', 'warc', 'pdf',
+           'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
            'screenshot', 'dom', 'git', 'media', 'archive_org',
        )
        latest: Dict[str, ArchiveOutput] = {}
@@ -392,7 +393,6 @@ class Link:
                latest[archive_method] = history[0].output
            else:
                latest[archive_method] = None
-
        return latest
@@ -406,6 +406,7 @@ class Link:
            'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
            'wget_path': wget_output_path(self),
            'warc_path': 'warc',
+           'singlefile_path': 'singlefile.html',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
@@ -425,7 +426,7 @@ class Link:
                'pdf_path': static_path,
                'screenshot_path': static_path,
                'dom_path': static_path,
+               'singlefile_path': static_path,
            })
        return canonical
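
The is_archived-style check this feeds is purely filesystem-based. A tiny sketch of the same any() test with the output names used above (hypothetical standalone helper):

    from pathlib import Path

    def looks_archived_sketch(snapshot_dir: str) -> bool:
        """True if any known extractor output exists in the snapshot folder."""
        output_paths = ('screenshot.png', 'output.html', 'media', 'singlefile.html')
        return any((Path(snapshot_dir) / p).exists() for p in output_paths)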

@@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:

@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
+   version = None
    if dependency['enabled']:
        if dependency['is_valid']:
            color, symbol, note, version = 'green', '', 'valid', ''

@@ -79,6 +79,7 @@
        .card {
            overflow: hidden;
            box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+           margin-top: 10px;
        }
        .card h4 {
            font-size: 1.4vw;
@@ -335,6 +336,18 @@
                    </div>
                </div>
            </div>
+           <div class="col-lg-2">
+               <div class="card">
+                   <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                   <div class="card-body">
+                       <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                           <img src="../../static/external.png" class="external"/>
+                       </a>
+                       <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
+                       <p class="card-text">archive/singlefile.html</p>
+                   </div>
+               </div>
+           </div>
            <div class="col-lg-2">
                <div class="card">
                    <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>
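
The $singlefile_path placeholder above is filled from canonical_outputs() in schema.py, which maps it to singlefile.html inside the snapshot folder. A minimal sketch of that $-style substitution (Python's string.Template matches the placeholder syntax; whether the renderer uses it directly is an assumption):

    from string import Template

    card = Template('<a href="$singlefile_path" target="preview">SingleFile</a>')
    html = card.safe_substitute(singlefile_path='singlefile.html')
    # -> <a href="singlefile.html" target="preview">SingleFile</a>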

@@ -7,4 +7,19 @@ import pytest
def process(tmp_path):
    os.chdir(tmp_path)
    process = subprocess.run(['archivebox', 'init'], capture_output=True)
-   return process
+   return process
+
+@pytest.fixture
+def disable_extractors_dict():
+    env = os.environ.copy()
+    env.update({
+        "USE_WGET": "false",
+        "USE_SINGLEFILE": "false",
+        "SAVE_PDF": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_DOM": "false",
+        "USE_GIT": "false",
+        "SAVE_MEDIA": "false",
+        "SAVE_ARCHIVE_DOT_ORG": "false"
+    })
+    return env
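
The fixture above hands each test a full copy of os.environ with every extractor switched off, so a test re-enables exactly the one extractor it exercises, as test_singlefile_works does below. A minimal usage sketch (hypothetical test; subprocess comes in via the fixtures star-import):

    def test_only_wget_runs_sketch(tmp_path, process, disable_extractors_dict):
        # opt back in to the single extractor under test; the rest stay off
        disable_extractors_dict.update({"USE_WGET": "true"})
        subprocess.run(
            ['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
            capture_output=True,
            env=disable_extractors_dict,
        )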

@@ -3,25 +3,30 @@ import json

from .fixtures import *

-def test_depth_flag_is_accepted(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_is_accepted(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
    assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")

-def test_depth_flag_fails_if_it_is_not_0_or_1(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True)
+def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"],
+                                 capture_output=True, env=disable_extractors_dict)
    assert 'invalid choice' in arg_process.stderr.decode("utf-8")
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True)
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"],
+                                 capture_output=True, env=disable_extractors_dict)
    assert 'invalid choice' in arg_process.stderr.decode("utf-8")

-def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    with open(archived_item_path / "index.json", "r") as f:
        output_json = json.load(f)
    assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"

-def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True)
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"],
+                                 capture_output=True, env=disable_extractors_dict)
    with open(tmp_path / "index.json", "r") as f:
        archive_file = f.read()
    assert "http://127.0.0.1:8080/static/example.com.html" in archive_file

@@ -1,8 +1,10 @@
from .fixtures import *
from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title

-def test_wget_broken_pipe(tmp_path, process):
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
    assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")

def test_ignore_methods():
@@ -10,4 +12,12 @@ def test_ignore_methods():
    Takes the passed method out of the default methods list and returns that value
    """
    ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert should_save_title not in ignored
+
+def test_singlefile_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    output_file = archived_item_path / "singlefile.html"
+    assert output_file.exists()

@@ -18,9 +18,10 @@ def test_update(tmp_path, process):
    update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
    assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")

-def test_add_link(tmp_path, process):
+def test_add_link(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

    assert "index.json" in [x.name for x in archived_item_path.iterdir()]
@@ -33,9 +34,10 @@ def test_add_link(tmp_path, process):
        output_html = f.read()
    assert "Example Domain" in output_html

-def test_add_link_support_stdin(tmp_path, process):
+def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
-    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                     env=disable_extractors_dict)
    stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode())
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
@@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process):
        file_path = tmp_path / file
        assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS

-def test_correct_permissions_add_command_results(tmp_path, process):
+def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                                 env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    for path in archived_item_path.iterdir():
        assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS

@@ -2,13 +2,14 @@ from pathlib import Path

from .fixtures import *

-def test_oneshot_command_exists(tmp_path):
+def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
    os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True)
+    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict)
    assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8")

-def test_oneshot_commad_saves_page_in_right_folder(tmp_path):
-    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
+def test_oneshot_commad_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
+    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
+                             capture_output=True, env=disable_extractors_dict)
    items = ' '.join([str(x) for x in tmp_path.iterdir()])
    current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
    assert "index.json" in items

@@ -1,8 +1,8 @@
from .fixtures import *

-def test_remove_leaves_index_in_consistent_state(tmp_path, process):
+def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
-    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
    remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
    list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
    assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")

@@ -1,12 +1,13 @@
from .fixtures import *

-def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
    """
    https://github.com/pirate/ArchiveBox/issues/330
    Unencoded content should not be rendered as it facilitates xss injections
    and breaks the layout.
    """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
    with open(tmp_path / "index.html", "r") as f:
        output_html = f.read()