diff --git a/.gitignore b/.gitignore index 790b36d5..f9e2b17d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,3 @@ -# Pocket archive output folder -pocket/ -bookmarks/ -pinboard/ -html/ -downloads/ - -# Byte-compiled / optimized / DLL files +output/ __pycache__/ - -# Virtualenv -env/ -ENV/ -venv/ -VENV/ -.venv/ -.env/ -.python-version -.env +archiver/venv diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index becd4abb..00000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,46 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. - -## Our Standards - -Examples of behavior that contributes to creating a positive environment include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at git@nicksweeting.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] - -[homepage]: http://contributor-covenant.org -[version]: http://contributor-covenant.org/version/1/4/ diff --git a/README.md b/README.md index 76c689f7..3a21a5a5 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,6 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc **Shell Options:** - colorize console ouput: `USE_COLOR` value: [`True`]/`False` - show progress bar: `SHOW_PROGRESS` value: [`True`]/`False` - - archive output directory: `ARCHIVE_DIR` value: [`.`]/`'/var/www/archive'`/`...` - archive permissions: `ARCHIVE_PERMISSIONS` values: [`755`]/`644`/`...` **Dependency Options:** diff --git a/archiver/.DS_Store b/archiver/.DS_Store new file mode 100644 index 00000000..d37f27fe Binary files /dev/null and b/archiver/.DS_Store differ diff --git a/__init__.py b/archiver/__init__.py similarity index 100% rename from __init__.py rename to archiver/__init__.py diff --git a/archive b/archiver/archive.py similarity index 93% rename from archive rename to archiver/archive.py index dc694751..b878a764 100755 --- a/archive +++ b/archiver/archive.py @@ -19,7 +19,7 @@ from index import ( ) from config import ( ARCHIVE_PERMISSIONS, - HTML_FOLDER, + OUTPUT_DIR, ANSI, TIMEOUT, ) @@ -36,10 +36,10 @@ def print_help(): print(__DESCRIPTION__) print("Documentation: {}\n".format(__DOCUMENTATION__)) print("Usage:") - print(" ./archive.py ~/Downloads/bookmarks_export.html\n") + print(" ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n") -def merge_links(archive_path=HTML_FOLDER, import_path=None): +def merge_links(archive_path=OUTPUT_DIR, import_path=None): """get new links from file and optionally append them to links in existing archive""" all_links = [] if import_path: @@ -116,7 +116,7 @@ def update_archive(archive_path, links, source=None, resume=None, append=True): if __name__ == '__main__': argc = len(sys.argv) - if set(sys.argv).intersection('-h', '--help', 'help'): + if set(sys.argv).intersection(('-h', '--help', 'help')): print_help() raise SystemExit(0) @@ -139,11 +139,11 @@ if __name__ == '__main__': raise SystemExit(1) # See if archive folder already exists - for out_dir in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'): + for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'): if os.path.exists(out_dir): break else: - out_dir = HTML_FOLDER + out_dir = OUTPUT_DIR # Step 0: Download url to local file (only happens if a URL is specified instead of local path) if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')): diff --git a/archive_methods.py b/archiver/archive_methods.py similarity index 99% rename from archive_methods.py rename to archiver/archive_methods.py index cb4e19da..5a923673 100644 --- a/archive_methods.py +++ b/archiver/archive_methods.py @@ -11,7 +11,6 @@ from peekable import Peekable from index import wget_output_path, parse_json_link_index, write_link_index from links import links_after_timestamp from config import ( - ARCHIVE_DIR, CHROME_BINARY, FETCH_WGET, FETCH_WGET_REQUISITES, @@ -33,6 +32,7 @@ from util import ( check_dependencies, progress, chmod_file, + pretty_path, ) @@ -123,7 +123,7 @@ def log_link_archive(link_dir, link, update_existing): **ANSI, )) - print(' > {}{}'.format(link_dir, '' if update_existing else ' (new)')) + print(' > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)')) if link['type']: print(' i {}'.format(link['type'])) diff --git a/config.py b/archiver/config.py similarity index 93% rename from config.py rename to archiver/config.py index ceae6c75..16c0f781 100644 --- a/config.py +++ b/archiver/config.py @@ -25,7 +25,6 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' RESOLUTION = os.getenv('RESOLUTION', '1440,1200' ) CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' ARCHIVE_PERMISSIONS = os.getenv('ARCHIVE_PERMISSIONS', '755' ) -ARCHIVE_DIR = os.getenv('ARCHIVE_DIR', '') CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser' ) # change to google-chrome browser if using google-chrome WGET_BINARY = os.getenv('WGET_BINARY', 'wget' ) WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', None) @@ -37,16 +36,17 @@ INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_ro TEMPLATE_STATICFILES = os.getenv('TEMPLATE_STATICFILES', 'templates/static') FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) -### Output Paths -ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__)) -HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html') -os.chdir(ROOT_FOLDER) +### Paths +REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) +OUTPUT_DIR = os.path.abspath(os.path.join(REPO_DIR, 'output')) +SOURCES_DIR = os.path.abspath(os.path.join(OUTPUT_DIR, 'sources')) # ****************************************************************************** # ********************** Do not edit below this point ************************** # ****************************************************************************** ### Terminal Configuration +os.chdir(os.path.join(REPO_DIR, 'archiver')) TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns ANSI = { 'reset': '\033[00;00m', @@ -65,7 +65,7 @@ if not USE_COLOR: ### Confirm Environment Setup try: - GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode() + GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() except Exception: GIT_SHA = None print('[!] Warning, you need git installed for some archiving features to save correct version numbers!') diff --git a/index.py b/archiver/index.py similarity index 96% rename from index.py rename to archiver/index.py index 44d993f5..d74e2a06 100644 --- a/index.py +++ b/archiver/index.py @@ -11,7 +11,6 @@ from config import ( LINK_INDEX_TEMPLATE, TEMPLATE_STATICFILES, ARCHIVE_PERMISSIONS, - ARCHIVE_DIR, ANSI, GIT_SHA, FOOTER_INFO, @@ -20,6 +19,7 @@ from util import ( chmod_file, wget_output_path, derived_link_info, + pretty_path, ) @@ -37,8 +37,8 @@ def write_links_index(out_dir, links): print('{green}[√] [{}] Updated main index files:{reset}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S'), **ANSI)) - print(' > {}/index.json'.format(out_dir)) - print(' > {}/index.html'.format(out_dir)) + print(' > {}/index.json'.format(pretty_path(out_dir))) + print(' > {}/index.html'.format(pretty_path(out_dir))) def write_json_links_index(out_dir, links): """write the json link index to a given path""" diff --git a/links.py b/archiver/links.py similarity index 100% rename from links.py rename to archiver/links.py diff --git a/parse.py b/archiver/parse.py similarity index 99% rename from parse.py rename to archiver/parse.py index 682ee403..8adebd55 100644 --- a/parse.py +++ b/archiver/parse.py @@ -98,7 +98,7 @@ def parse_json_export(json_file): 'url': erg['href'], 'domain': domain(erg['href']), 'base_url': base_url(erg['href']), - 'timestamp': str(time.timestamp()), + 'timestamp': erg.get('timestamp', str(time.timestamp())), 'tags': erg['tags'], 'title': erg['description'].replace(' — Readability', ''), 'sources': [json_file.name], diff --git a/peekable.py b/archiver/peekable.py similarity index 100% rename from peekable.py rename to archiver/peekable.py diff --git a/templates/index.html b/archiver/templates/index.html similarity index 100% rename from templates/index.html rename to archiver/templates/index.html diff --git a/templates/index_row.html b/archiver/templates/index_row.html similarity index 100% rename from templates/index_row.html rename to archiver/templates/index_row.html diff --git a/templates/link_index.html b/archiver/templates/link_index.html similarity index 100% rename from templates/link_index.html rename to archiver/templates/link_index.html diff --git a/templates/link_index_fancy.html b/archiver/templates/link_index_fancy.html similarity index 100% rename from templates/link_index_fancy.html rename to archiver/templates/link_index_fancy.html diff --git a/templates/static/archive.png b/archiver/templates/static/archive.png similarity index 100% rename from templates/static/archive.png rename to archiver/templates/static/archive.png diff --git a/templates/static/external.png b/archiver/templates/static/external.png similarity index 100% rename from templates/static/external.png rename to archiver/templates/static/external.png diff --git a/templates/static/spinner.gif b/archiver/templates/static/spinner.gif similarity index 100% rename from templates/static/spinner.gif rename to archiver/templates/static/spinner.gif diff --git a/util.py b/archiver/util.py similarity index 95% rename from util.py rename to archiver/util.py index d18f2159..115aad96 100644 --- a/util.py +++ b/archiver/util.py @@ -13,8 +13,9 @@ from urllib.parse import quote from config import ( IS_TTY, ARCHIVE_PERMISSIONS, - HTML_FOLDER, - ARCHIVE_DIR, + REPO_DIR, + SOURCES_DIR, + OUTPUT_DIR, TIMEOUT, TERM_WIDTH, SHOW_PROGRESS, @@ -165,22 +166,25 @@ def progress(seconds=TIMEOUT, prefix=''): return end +def pretty_path(path): + """convert paths like .../bookmark-archiver/archiver/../output/abc into output/abc""" + return path.replace(REPO_DIR, '') + def download_url(url): """download a given url's content into downloads/domain.txt""" - download_dir = os.path.join(ARCHIVE_DIR, 'downloads') + if not os.path.exists(SOURCES_DIR): + os.makedirs(SOURCES_DIR) - if not os.path.exists(download_dir): - os.makedirs(download_dir) + ts = str(datetime.now().timestamp()).split('.', 1)[0] + + source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts)) - url_domain = url.split('/', 3)[2] - output_path = os.path.join(download_dir, '{}.txt'.format(url_domain)) - print('[*] [{}] Downloading {} > {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S'), url, - output_path, + pretty_path(source_path), )) end = progress(TIMEOUT, prefix=' ') try: @@ -192,10 +196,10 @@ def download_url(url): print(' ', e) raise SystemExit(1) - with open(output_path, 'w', encoding='utf-8') as f: + with open(source_path, 'w', encoding='utf-8') as f: f.write(downloaded_xml) - return output_path + return source_path def str_between(string, start, end=None): """(12345, , ) -> 12345""" @@ -258,7 +262,7 @@ def find_link(folder, links): timestamp = folder.split('.')[0] for link in links: if link['timestamp'].startswith(timestamp): - if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, 'html/archive', folder)): + if link['domain'] in os.listdir(os.path.join(OUTPUT_DIR, 'archive', folder)): return link # careful now, this isn't safe for most ppl if link['domain'] in parse_url(folder): return link @@ -267,7 +271,7 @@ def find_link(folder, links): def parse_url(folder): """for a given archive folder, figure out what url it's for""" - link_json = os.path.join(ARCHIVE_DIR, 'html/archive', folder, 'index.json') + link_json = os.path.join(OUTPUT_DIR, 'archive', folder, 'index.json') if os.path.exists(link_json): with open(link_json, 'r') as f: try: @@ -278,7 +282,7 @@ def parse_url(folder): except ValueError: print('File contains invalid JSON: {}!'.format(link_json)) - archive_org_txt = os.path.join(ARCHIVE_DIR, 'html/archive' + folder, 'archive.org.txt') + archive_org_txt = os.path.join(OUTPUT_DIR, 'archive', folder, 'archive.org.txt') if os.path.exists(archive_org_txt): with open(archive_org_txt, 'r') as f: original_link = f.read().strip().split('/http', 1)[-1] @@ -413,7 +417,7 @@ def wget_output_path(link, look_in=None): # instead of trying to emulate it here, we just look in the output folder # to see what html file wget actually created as the output wget_folder = link['base_url'].rsplit('/', 1)[0].split('/') - look_in = os.path.join(HTML_FOLDER, 'archive', link['timestamp'], *wget_folder) + look_in = os.path.join(OUTPUT_DIR, 'archive', link['timestamp'], *wget_folder) if look_in and os.path.exists(look_in): html_files = [ diff --git a/bin/.DS_Store b/bin/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/bin/.DS_Store differ diff --git a/bin/bookmark-archiver b/bin/bookmark-archiver new file mode 100755 index 00000000..1d5521c5 --- /dev/null +++ b/bin/bookmark-archiver @@ -0,0 +1,6 @@ +#!/bin/bash +# Bookmark Archiver Shortcut + +REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"; cd .. && pwd )" + +python3 "$REPO_DIR/archiver/archive.py" "$@" diff --git a/bin/export-browser-history b/bin/export-browser-history new file mode 100755 index 00000000..95e489ee --- /dev/null +++ b/bin/export-browser-history @@ -0,0 +1,35 @@ +#!/bin/bash + +REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"; cd .. && pwd )" + +if [[ "$1" == "--chrome" ]]; then + # Google Chrome / Chromium + default=$(ls ~/Library/Application\ Support/Google/Chrome/Default/History) + if [[ -e "$2" ]]; then + cp "$2" "$REPO_DIR/output/sources/chrome_history.db.tmp" + else + echo "Defaulting to history db: $default" + echo "Optionally specify the path to a different sqlite history database as the 2nd argument." + cp "$default" "$REPO_DIR/output/sources/chrome_history.db.tmp" + fi + sqlite3 "$REPO_DIR/output/sources/chrome_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', last_visit_time, 'description', title, 'href', url)) || \"]\" FROM urls;" > "$REPO_DIR/output/sources/chrome_history.json" + rm "$REPO_DIR/output/sources/chrome_history.db.tmp" + echo "Chrome history exported to:" + echo " output/sources/chrome_history.json" +fi + +if [[ "$1" == "--firefox" ]]; then + # Firefox + default=$(ls ~/Library/Application\ Support/Firefox/Profiles/*.default/places.sqlite) + if [[ -e "$2" ]]; then + cp "$2" "$REPO_DIR/output/sources/firefox_history.db.tmp" + else + echo "Defaulting to history db: $default" + echo "Optionally specify the path to a different sqlite history database as the 2nd argument." + cp "$default" "$REPO_DIR/output/sources/firefox_history.db.tmp" + fi + sqlite3 "$REPO_DIR/output/sources/firefox_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', last_visit_date, 'description', title, 'href', url)) || \"]\" FROM moz_places;" > "$REPO_DIR/output/sources/firefox_history.json" + rm "$REPO_DIR/output/sources/firefox_history.db.tmp" + echo "Firefox history exported to:" + echo " output/sources/firefox_history.json" +fi diff --git a/setup.sh b/bin/setup-bookmark-archiver similarity index 100% rename from setup.sh rename to bin/setup-bookmark-archiver diff --git a/screenshot.png b/screenshot.png deleted file mode 100644 index 0358d442..00000000 Binary files a/screenshot.png and /dev/null differ diff --git a/screenshot_mobile.png b/screenshot_mobile.png deleted file mode 100644 index 1f218740..00000000 Binary files a/screenshot_mobile.png and /dev/null differ