diff --git a/README.md b/README.md index 8b84a54c..4f62d409 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,6 @@ apt update; apt install google-chrome-beta python3 wget # Check: google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed." - -# BeautifulSoup4 -sudo pip3 install beautifulsoup4 ``` **2. Run the archive script:** diff --git a/archive.py b/archive.py index 617bcae6..c3d42f97 100755 --- a/archive.py +++ b/archive.py @@ -9,9 +9,7 @@ import os import sys import json -from bs4 import BeautifulSoup - -from datetime import datetime, timezone +from datetime import datetime import time from subprocess import run, PIPE, DEVNULL @@ -111,25 +109,26 @@ def parse_pinboard_export(html): yield info def parse_bookmarks_export(html): - soup = BeautifulSoup(html, "html5lib") - for link in soup.find_all('a'): + pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE) + for line in html: + match = pattern.search(line) + if match: + url = match.group(1) + secs = match.group(2) + dt = datetime.fromtimestamp(int(secs)) - url = link.get('href') - secs = link.get('add_date') - dt = datetime.fromtimestamp(int(secs)) + info = { + 'url': url, + 'domain': url.replace('http://', '').replace('https://', '').split('/')[0], + 'base_url': url.replace('https://', '').replace('http://', '').split('?')[0], + 'time': dt, + 'timestamp': secs, + 'tags': "", + 'title': match.group(3) + } - info = { - 'url': url, - 'domain': url.replace('http://', '').replace('https://', '').split('/')[0], - 'base_url': url.replace('https://', '').replace('http://', '').split('?')[0], - 'time': dt, - 'timestamp': secs, - 'tags': link.get('tags'), - 'title': link.string.strip(), - } - - info['type'] = get_link_type(info) - yield info + info['type'] = get_link_type(info) + yield info ### ACHIVING FUNCTIONS