
Merge branch 'master' into dev

Nick Sweeting 2019-03-30 15:36:54 -04:00
commit 5d0185b6dd
6 changed files with 41 additions and 8 deletions

View file

@@ -1,5 +1,5 @@
 ---
-name: Bug report
+name: 🐞 Bug report
 about: Create a report to help us improve
 title: ''
 labels: ''

View file

@@ -0,0 +1,15 @@
+---
+name: 📑 Documentation change
+about: Submit a suggestion for the Wiki documentation
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+## Wiki Page URL
+
+## Suggested Edit
+
+...
+

View file

@@ -1,5 +1,5 @@
 ---
-name: Feature request
+name: 💡 Feature request
 about: Suggest an idea for this project
 title: ''
 labels: ''

View file

@@ -1,3 +1,5 @@
 **IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes, I will close them with great prejudice. The PEP8 checks I don't follow are intentional. PRs for minor bugfixes, typos, etc are fine.**
 
+# Summary
+
 e.g. This PR fixes ABC or adds the ability to do XYZ...

View file

@@ -1,12 +1,14 @@
 import os
+import re
 import sys
 import shutil
 
-from typing import Optional
+from typing import Optional, Pattern
 from subprocess import run, PIPE, DEVNULL
 
 OUTPUT_DIR: str
+URL_BLACKLIST: Optional[Pattern[str]]
 
 # ******************************************************************************
 # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
@@ -24,6 +26,7 @@ TIMEOUT = int(os.getenv('TIMEOUT', '60'))
 MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
 OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' )
 FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
+URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
 
 FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
@@ -58,6 +61,11 @@ CHROME_BINARY = os.getenv('CHROME_BINARY', None)
 CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
 
+try:
+    OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
+except Exception:
+    OUTPUT_DIR = None
+
 # ******************************************************************************
 ### Terminal Configuration
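
A note on the new OUTPUT_DIR block: os.getenv('OUTPUT_DIR') returns None when the variable is unset, and os.path.abspath(None) raises a TypeError, so the except clause falls back to None instead of crashing at import time. A minimal standalone sketch of that behavior (resolve_output_dir is a hypothetical helper, not part of the codebase):

import os
from typing import Optional

def resolve_output_dir() -> Optional[str]:
    # os.getenv() returns None when OUTPUT_DIR is unset, and
    # os.path.abspath(None) raises a TypeError, hence the guard.
    try:
        return os.path.abspath(os.getenv('OUTPUT_DIR'))
    except Exception:
        return None

os.environ.pop('OUTPUT_DIR', None)
assert resolve_output_dir() is None              # unset -> falls back to None

os.environ['OUTPUT_DIR'] = './archive'
assert resolve_output_dir().endswith('archive')  # set -> resolved to an absolute path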
@@ -95,6 +103,9 @@ TEMPLATES_DIR = os.path.join(PYTHON_DIR, 'templates')
 if COOKIES_FILE:
     COOKIES_FILE = os.path.abspath(COOKIES_FILE)
 
+URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
+
+
 ########################### Environment & Dependencies #########################
 VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
 GIT_SHA = VERSION.split('+')[1]
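
Taken together, the three config.py hunks give URL_BLACKLIST its full lifecycle: declared as Optional[Pattern[str]], read from the environment as a plain string defaulting to None, and finally compiled into a case-insensitive regex. The `URL_BLACKLIST and re.compile(...)` idiom leaves None untouched and compiles any non-empty string. A minimal sketch of the resulting behavior; the example pattern is hypothetical:

import os
import re
from typing import Optional, Pattern

# Hypothetical pattern: skip youtube.com and vimeo.com links.
os.environ['URL_BLACKLIST'] = r'http(s)?://(.+\.)?(youtube|vimeo)\.com/.*'

raw = os.getenv('URL_BLACKLIST', None)

# `x and re.compile(x)` keeps None as None and compiles any non-empty string.
URL_BLACKLIST: Optional[Pattern[str]] = raw and re.compile(raw, re.IGNORECASE)

assert URL_BLACKLIST.match('https://www.YouTube.com/watch?v=abc')   # re.IGNORECASE
assert URL_BLACKLIST.match('https://example.org/post') is None      # not blacklisted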

View file

@@ -8,6 +8,9 @@ from .util import (
     merge_links,
 )
+from config import (
+    URL_BLACKLIST,
+)
 
 
 def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
@@ -22,11 +25,11 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
 
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if scheme(link.url) in ('http', 'https', 'ftp')
-    )
+    for link in links:
+        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
+        not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True
+        if scheme_is_valid and not_blacklisted:
+            yield link
 
 
 def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
@@ -87,3 +90,5 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     new_timestamp = '{}.{}'.format(timestamp, nonce)
 
     return new_timestamp
+
+
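
The rewritten archivable_links generator can be exercised in isolation. The sketch below stubs out the pieces that don't appear in this diff: Link is stood in for by a namedtuple, scheme() by urllib.parse, and the blacklist pattern is a hypothetical example:

import re
from collections import namedtuple
from typing import Iterable, Optional, Pattern
from urllib.parse import urlparse

Link = namedtuple('Link', ['url'])                # stand-in for ArchiveBox's Link type
URL_BLACKLIST: Optional[Pattern[str]] = re.compile(r'.*\.example\.com', re.IGNORECASE)

def scheme(url: str) -> str:                      # stand-in for util.scheme()
    return urlparse(url).scheme

def archivable_links(links):
    # same logic as the rewritten generator above
    for link in links:
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True
        if scheme_is_valid and not_blacklisted:
            yield link

links = [
    Link('https://example.org/page'),        # kept: valid scheme, not blacklisted
    Link('chrome://settings'),               # dropped: unarchivable scheme
    Link('https://ads.example.com/track'),   # dropped: matches URL_BLACKLIST
]
assert [l.url for l in archivable_links(links)] == ['https://example.org/page']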