diff --git a/README.md b/README.md
index 4f62d409..8b84a54c 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,9 @@ apt update; apt install google-chrome-beta python3 wget
 
 # Check:
 google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
+
+# BeautifulSoup4, plus the html5lib parser that archive.py asks it to use
+sudo pip3 install beautifulsoup4 html5lib
 ```
 
 **2. Run the archive script:**
diff --git a/archive.py b/archive.py
index f03e81f7..617bcae6 100755
--- a/archive.py
+++ b/archive.py
@@ -9,7 +9,9 @@ import os
 import sys
 import json
 
-from datetime import datetime
+from bs4 import BeautifulSoup
+
+from datetime import datetime, timezone
 import time
 
 from subprocess import run, PIPE, DEVNULL
@@ -108,6 +110,39 @@ def parse_pinboard_export(html):
         info['type'] = get_link_type(info)
         yield info
 
+def parse_bookmarks_export(html):
+    """Yield one link dict for each <a> in a bookmarks HTML export.
+
+    `html` is handed straight to BeautifulSoup. Anchors with no href, or
+    with a missing or non-integer add_date, are skipped instead of aborting
+    the whole import: every link needs a url and a timestamp to sort on.
+    """
+    soup = BeautifulSoup(html, "html5lib")
+    for link in soup.find_all('a'):
+        url = link.get('href')
+        secs = link.get('add_date')
+        if not url:
+            continue
+
+        try:
+            dt = datetime.fromtimestamp(int(secs))
+        except (TypeError, ValueError):
+            continue
+
+        info = {
+            'url': url,
+            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
+            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
+            'time': dt,
+            'timestamp': secs,
+            'tags': link.get('tags'),
+            # link.string is None for empty anchors or anchors wrapping other tags
+            'title': link.get_text().strip(),
+        }
+
+        info['type'] = get_link_type(info)
+        yield info
+
 
 ### ACHIVING FUNCTIONS
 
@@ -278,6 +313,8 @@ def create_archive(export_file, service, resume=None):
             links = parse_pocket_export(f)
         elif service == "pinboard":
             links = parse_pinboard_export(f)
+        elif service == "bookmarks":
+            links = parse_bookmarks_export(f)
     links = list(reversed(sorted(links, key=lambda l: l['timestamp']))) # most recent first
     if resume:
         links = [link for link in links if link['timestamp'] >= resume]