diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f2fea3ba..b350fb28 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,5 +1,5 @@ --- -name: Bug report +name: 🐞 Bug report about: Create a report to help us improve title: '' labels: '' diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md new file mode 100644 index 00000000..dc3c2741 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -0,0 +1,15 @@ +--- +name: 📑 Documentation change +about: Submit a suggestion for the Wiki documentation +title: '' +labels: '' +assignees: '' + +--- + +## Wiki Page URL + + +## Suggested Edit + +... diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 4489f7dc..0f9423f5 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,5 +1,5 @@ --- -name: Feature request +name: 💡 Feature request about: Suggest an idea for this project title: '' labels: '' diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 44e56a16..c903d1a9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,5 @@ +**IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes, I will close them with great prejudice. The PEP8 checks I don't follow are intentional. PRs for minor bugfixes, typos, etc are fine.** + # Summary e.g. This PR fixes ABC or adds the ability to do XYZ... diff --git a/archivebox/config.py b/archivebox/config.py index df634102..84e74710 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1,12 +1,14 @@ import os +import re import sys import shutil -from typing import Optional +from typing import Optional, Pattern from subprocess import run, PIPE, DEVNULL OUTPUT_DIR: str +URL_BLACKLIST: Optional[Pattern[str]] # ****************************************************************************** # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration @@ -24,6 +26,7 @@ TIMEOUT = int(os.getenv('TIMEOUT', '60')) MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) +URL_BLACKLIST = os.getenv('URL_BLACKLIST', None) FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true' FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true' @@ -58,6 +61,11 @@ CHROME_BINARY = os.getenv('CHROME_BINARY', None) CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' +try: + OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) +except Exception: + OUTPUT_DIR = None + # ****************************************************************************** ### Terminal Configuration @@ -95,6 +103,9 @@ TEMPLATES_DIR = os.path.join(PYTHON_DIR, 'templates') if COOKIES_FILE: COOKIES_FILE = os.path.abspath(COOKIES_FILE) +URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE) + +########################### Environment & Dependencies ######################### VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip() GIT_SHA = VERSION.split('+')[1] diff --git a/archivebox/links.py b/archivebox/links.py index ffb4d415..8844ef9b 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -8,6 +8,9 @@ from .util import ( merge_links, ) +from config import ( + URL_BLACKLIST, +) def validate_links(links: Iterable[Link]) -> Iterable[Link]: links = archivable_links(links) # remove chrome://, about:, mailto: etc. @@ -22,11 +25,11 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]: def archivable_links(links: Iterable[Link]) -> Iterable[Link]: """remove chrome://, about:// or other schemed links that cant be archived""" - return ( - link - for link in links - if scheme(link.url) in ('http', 'https', 'ftp') - ) + for link in links: + scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp') + not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True + if scheme_is_valid and not_blacklisted: + yield link def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]: @@ -87,3 +90,5 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str: new_timestamp = '{}.{}'.format(timestamp, nonce) return new_timestamp + +