1
0
Fork 0
mirror of synced 2024-05-03 20:12:52 +12:00

fix: Fix mercury extractor test

This commit is contained in:
Cristian 2020-09-23 10:34:05 -05:00
parent 2bf496e7e9
commit 46b9e3d536
7 changed files with 79 additions and 108 deletions

View file

@ -855,7 +855,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
info['version'] or 'unable to detect version',
)
)
if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY'):
if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"',
f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
''), prefix=' ')

View file

@ -179,7 +179,7 @@ def wget_output_path(link: Link) -> Optional[str]:
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
return str(Path(search_dir.name) / html_files[0])
return str(html_files[0])
# Move up one directory level
search_dir = search_dir.parent

View file

@ -2,7 +2,7 @@
<td title="$timestamp">$bookmarked_date</td>
<td class="title-col">
<a href="$archive_path/index.html" class="link-url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
<a href="$archive_path/$wget_url" title="$title">
<a href="$wget_url" title="$title">
<span data-title-for="$url" data-archived="$is_archived">$title</span>
<small style="float:right">$tags</small>
</a>

170
package-lock.json generated
View file

@ -122,9 +122,9 @@
"integrity": "sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ=="
},
"@types/node": {
"version": "14.6.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-14.6.3.tgz",
"integrity": "sha512-pC/hkcREG6YfDfui1FBmj8e20jFU5Exjw4NYDm8kEdrW+mOh0T1Zve8DWKnS7ZIZvgncrctcNCXF4Q2I+loyww==",
"version": "14.11.2",
"resolved": "https://registry.npmjs.org/@types/node/-/node-14.11.2.tgz",
"integrity": "sha512-jiE3QIxJ8JLNcb1Ps6rDbysDhN4xa8DJJvuC9prr6w+1tIh+QAbYyNF3tyiZNLDBIuBCf4KEcV2UvQm/V60xfA==",
"optional": true
},
"@types/yauzl": {
@ -348,13 +348,13 @@
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
},
"cliui": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-6.0.0.tgz",
"integrity": "sha512-t6wbgtoCXvAzst7QgXxJYqPt0usEfbgQdftEPbLL/cvv6HPE5VgvqCuAIDR0NgU52ds6rFwqrgakNLrHEjCbrQ==",
"version": "7.0.1",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.1.tgz",
"integrity": "sha512-rcvHOWyGyid6I1WjT/3NatKj2kDt9OdSHSXpyLXaMWFbKpGACNW8pRhhdPUq9MWUOdwn8Rz9AVETjF4105rZZQ==",
"requires": {
"string-width": "^4.2.0",
"strip-ansi": "^6.0.0",
"wrap-ansi": "^6.2.0"
"wrap-ansi": "^7.0.0"
}
},
"color-convert": {
@ -448,11 +448,11 @@
}
},
"debug": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz",
"integrity": "sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==",
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.2.0.tgz",
"integrity": "sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg==",
"requires": {
"ms": "^2.1.1"
"ms": "2.1.2"
}
},
"decamelize": {
@ -475,6 +475,11 @@
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
},
"devtools-protocol": {
"version": "0.0.799653",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.799653.tgz",
"integrity": "sha512-t1CcaZbvm8pOlikqrsIM9GOa7Ipp07+4h/q9u0JXBWjPCjHdBl9KkddX87Vv9vBHoBGtwV79sYQNGnQM6iS5gg=="
},
"difflib": {
"version": "github:postlight/difflib.js#32e8e38c7fcd935241b9baab71bb432fd9b166ed",
"from": "github:postlight/difflib.js",
@ -520,9 +525,9 @@
}
},
"dompurify": {
"version": "2.0.14",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.0.14.tgz",
"integrity": "sha512-oqcjyCLHLjWugZ6VwK0YfmRND/DFy/CuZhdasmymMfnxbzaaQxBSA1ATZIXWESGDj/nvq1vKLmRa7rTdbGgrmQ=="
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.1.0.tgz",
"integrity": "sha512-wKExRhOwUnfm1icoISSXnlmM1P2l07W2tFQqbU+8oySnvy7tHwj2iHJ1kJQi8EfcTlojsHKESOJwCGVJmNUdPQ=="
},
"domutils": {
"version": "1.5.1",
@ -570,6 +575,11 @@
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
},
"escalade": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.0.tgz",
"integrity": "sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig=="
},
"escodegen": {
"version": "1.14.3",
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-1.14.3.tgz",
@ -1047,11 +1057,6 @@
"resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz",
"integrity": "sha1-7dFMgk4sycHgsKG0K7UhBRakJDg="
},
"mime": {
"version": "2.4.6",
"resolved": "https://registry.npmjs.org/mime/-/mime-2.4.6.tgz",
"integrity": "sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA=="
},
"mime-db": {
"version": "1.44.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.44.0.tgz",
@ -1193,6 +1198,14 @@
"resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
"integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns="
},
"pkg-dir": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz",
"integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==",
"requires": {
"find-up": "^4.0.0"
}
},
"pn": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/pn/-/pn-1.1.0.tgz",
@ -1288,14 +1301,15 @@
"integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A=="
},
"puppeteer-core": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-3.3.0.tgz",
"integrity": "sha512-hynQ3r0J/lkGrKeBCqu160jrj0WhthYLIzDQPkBxLzxPokjw4elk1sn6mXAian/kfD2NRzpdh9FSykxZyL56uA==",
"version": "5.3.1",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-5.3.1.tgz",
"integrity": "sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==",
"requires": {
"debug": "^4.1.0",
"devtools-protocol": "0.0.799653",
"extract-zip": "^2.0.0",
"https-proxy-agent": "^4.0.0",
"mime": "^2.0.3",
"pkg-dir": "^4.2.0",
"progress": "^2.0.1",
"proxy-from-env": "^1.0.0",
"rimraf": "^3.0.2",
@ -1319,7 +1333,7 @@
"from": "git+https://github.com/pirate/readability-extractor.git",
"requires": {
"@mozilla/readability": "^0.3.0",
"dompurify": "^2.0.14",
"dompurify": "^2.1.0",
"jsdom": "^16.4.0"
}
},
@ -1437,11 +1451,6 @@
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
"integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I="
},
"require-main-filename": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-2.0.0.tgz",
"integrity": "sha512-NKN5kMDylKuldxYLSUfrbo5Tuzh4hd+2E8NPPX02mZtn1VuREQToYe/ZdlJy+J3uCpfaiGF05e7B8W0iXbQHmg=="
},
"rimraf": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
@ -1493,18 +1502,13 @@
}
}
},
"set-blocking": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
"integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc="
},
"set-immediate-shim": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/set-immediate-shim/-/set-immediate-shim-1.0.1.tgz",
"integrity": "sha1-SysbJ+uAip+NzEgaWOXlb1mfP2E="
},
"single-file": {
"version": "git+https://github.com/gildas-lormeau/SingleFile.git#e2e15381a6cbb9c3a6ca0ea8ff7307174e98ad12",
"version": "git+https://github.com/gildas-lormeau/SingleFile.git#12671240d6ca699667915378034dd29ca80d4796",
"from": "git+https://github.com/gildas-lormeau/SingleFile.git",
"requires": {
"file-url": "^3.0.0",
@ -1517,38 +1521,11 @@
},
"dependencies": {
"iconv-lite": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.5.2.tgz",
"integrity": "sha512-kERHXvpSaB4aU3eANwidg79K8FlrN77m8G9V+0vOR3HYaRifrlwMEpT7ZBJqLSEIHnEgJTHcWK82wwLwwKwtag==",
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.2.tgz",
"integrity": "sha512-2y91h5OpQlolefMPmUlivelittSWy0rP+oYVpn6A7GwVHNE8AWzoYOBNmlwks3LobaJxgHCYZAnyNo2GgpNRNQ==",
"requires": {
"safer-buffer": ">= 2.1.2 < 3"
}
},
"request-promise-core": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.3.tgz",
"integrity": "sha512-QIs2+ArIGQVp5ZYbWD5ZLCY29D5CfWizP8eWnm8FoGD1TX61veauETVQbrV60662V0oFBkrDOuaBI8XgtuyYAQ==",
"requires": {
"lodash": "^4.17.15"
}
},
"request-promise-native": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/request-promise-native/-/request-promise-native-1.0.8.tgz",
"integrity": "sha512-dapwLGqkHtwL5AEbfenuzjTYg35Jd6KPytsC2/TLkVMz8rm+tNt72MGUWT1RP/aYawMpN6HqbNGBQaRcBtjQMQ==",
"requires": {
"request-promise-core": "1.1.3",
"stealthy-require": "^1.1.1",
"tough-cookie": "^2.3.3"
}
},
"tough-cookie": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
"integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
"requires": {
"psl": "^1.1.28",
"punycode": "^2.1.1"
"safer-buffer": ">= 2.1.2 < 3.0.0"
}
}
}
@ -1644,11 +1621,11 @@
}
},
"tar-stream": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.1.3.tgz",
"integrity": "sha512-Z9yri56Dih8IaK8gncVPx4Wqt86NDmQTSh49XLZgjWpGZL9GK9HKParS2scqHCC4w6X9Gh2jwaU45V47XTKwVA==",
"version": "2.1.4",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.1.4.tgz",
"integrity": "sha512-o3pS2zlG4gxr67GmFYBLlq+dM8gyRGUOvsrHclSkvtVtQbjV0s/+ZE8OpICbaj8clrX3tjeHngYGP7rweaBnuw==",
"requires": {
"bl": "^4.0.1",
"bl": "^4.0.3",
"end-of-stream": "^1.4.1",
"fs-constants": "^1.0.0",
"inherits": "^2.0.3",
@ -1970,29 +1947,24 @@
"integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g=="
},
"whatwg-url": {
"version": "8.2.1",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.2.1.tgz",
"integrity": "sha512-ZmVCr6nfBeaMxEHALLEGy0LszYjpJqf6PVNQUQ1qd9Et+q7Jpygd4rGGDXgHjD8e99yLFseD69msHDM4YwPZ4A==",
"version": "8.2.2",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.2.2.tgz",
"integrity": "sha512-PcVnO6NiewhkmzV0qn7A+UZ9Xx4maNTI+O+TShmfE4pqjoCMwUMjkvoNhNHPTvgR7QH9Xt3R13iHuWy2sToFxQ==",
"requires": {
"lodash.sortby": "^4.7.0",
"tr46": "^2.0.2",
"webidl-conversions": "^6.1.0"
}
},
"which-module": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz",
"integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho="
},
"word-wrap": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
"integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ=="
},
"wrap-ansi": {
"version": "6.2.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz",
"integrity": "sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==",
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
"requires": {
"ansi-styles": "^4.0.0",
"string-width": "^4.1.0",
@ -2028,36 +2000,28 @@
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
},
"y18n": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz",
"integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w=="
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.1.tgz",
"integrity": "sha512-/jJ831jEs4vGDbYPQp4yGKDYPSCCEQ45uZWJHE1AoYBzqdZi8+LDWas0z4HrmJXmKdpFsTiowSHXdxyFhpmdMg=="
},
"yargs": {
"version": "15.4.1",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-15.4.1.tgz",
"integrity": "sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==",
"version": "16.0.3",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-16.0.3.tgz",
"integrity": "sha512-6+nLw8xa9uK1BOEOykaiYAJVh6/CjxWXK/q9b5FpRgNslt8s22F2xMBqVIKgCRjNgGvGPBy8Vog7WN7yh4amtA==",
"requires": {
"cliui": "^6.0.0",
"decamelize": "^1.2.0",
"find-up": "^4.1.0",
"get-caller-file": "^2.0.1",
"cliui": "^7.0.0",
"escalade": "^3.0.2",
"get-caller-file": "^2.0.5",
"require-directory": "^2.1.1",
"require-main-filename": "^2.0.0",
"set-blocking": "^2.0.0",
"string-width": "^4.2.0",
"which-module": "^2.0.0",
"y18n": "^4.0.0",
"yargs-parser": "^18.1.2"
"y18n": "^5.0.1",
"yargs-parser": "^20.0.0"
}
},
"yargs-parser": {
"version": "18.1.3",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-18.1.3.tgz",
"integrity": "sha512-o50j0JeToy/4K6OZcaQmW6lyXXKhq7csREXcDwk2omFPJEwUNOVtJKvmDr9EI1fAJZUyZcRF7kxGBWmRXudrCQ==",
"requires": {
"camelcase": "^5.0.0",
"decamelize": "^1.2.0"
}
"version": "20.2.0",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.0.tgz",
"integrity": "sha512-2agPoRFPoIcFzOIp6656gcvsg2ohtscpw2OINr/q46+Sq41xz2OYLqx5HRHabmFU1OARIPAYH5uteICE7mn/5A=="
},
"yauzl": {
"version": "2.10.0",

View file

@ -10,7 +10,8 @@
"bin": {
"archivebox-node": "./bin/archive",
"single-file": "./node_modules/.bin/single-file",
"readability-extractor": "./node_modules/.bin/readability-extractor"
"readability-extractor": "./node_modules/.bin/readability-extractor",
"mercury-parser": "./node_modules/.bin/mercury-parser"
},
"dependencies": {
"@postlight/mercury-parser": "^2.2.0",

View file

@ -10,6 +10,12 @@ def index():
@route("/static/<filename>")
def static_path(filename):
template_path = abspath(getcwd()) / Path("tests/mock_server/templates")
response = static_file(filename, root=template_path)
return response
@route("/static_no_content_type/<filename>")
def static_no_content_type(filename):
template_path = abspath(getcwd()) / Path("tests/mock_server/templates")
response = static_file(filename, root=template_path)
response.set_header("Content-Type", "")

View file

@ -5,6 +5,6 @@ def test_download_url_downloads_content():
assert "Example Domain" in text
def test_download_url_gets_encoding_from_body():
text = util.download_url("http://127.0.0.1:8080/static/shift_jis.html")
text = util.download_url("http://127.0.0.1:8080/static_no_content_type/shift_jis.html")
assert "鹿児島のニュースMBC南日本放送" in text
assert "掲載された全ての記事・画像等の無断転載、二次利用をお断りいたします" in text