Removed BeautifulSoup from dependencies

2024-05-29 08:41:05 +12:00 · 2017-06-11 19:45:22 +01:00 · 2017-06-11 19:45:22 +01:00 · 666760fe06
parent 350803a4c8
commit 666760fe06
2 changed files with 19 additions and 23 deletions
--- a/README.md
+++ b/README.md
@@ -33,9 +33,6 @@ apt update; apt install google-chrome-beta python3 wget
 # Check:
 google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
-# BeautifulSoup4
-sudo pip3 install beautifulsoup4
 ```
 **2. Run the archive script:**
--- a/archive.py
+++ b/archive.py
@@ -9,9 +9,7 @@ import os
 import sys
 import json
-from bs4 import BeautifulSoup
+from datetime import datetime
 from datetime import datetime, timezone
 import time
 from subprocess import run, PIPE, DEVNULL
@@ -111,25 +109,26 @@ def parse_pinboard_export(html):
            yield info
 def parse_bookmarks_export(html):
-    soup = BeautifulSoup(html, "html5lib")
+    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
-    for link in soup.find_all('a'):
+    for line in html:
        match = pattern.search(line)
        if match:
            url = match.group(1)
            secs = match.group(2)
            dt = datetime.fromtimestamp(int(secs))
-        url = link.get('href')
+            info = {
-        secs = link.get('add_date')
+                'url': url,
-        dt = datetime.fromtimestamp(int(secs))
+                'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
                'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
                'time': dt,
                'timestamp': secs,
                'tags': "",
                'title': match.group(3)
            }
-        info = {
+            info['type'] = get_link_type(info)
-            'url': url,
+            yield info
            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
            'time': dt,
            'timestamp': secs,
            'tags': link.get('tags'),
            'title': link.string.strip(),
        }
        info['type'] = get_link_type(info)
        yield info
 ### ACHIVING FUNCTIONS