1
0
Fork 0
mirror of synced 2024-05-29 08:41:05 +12:00

Removed BeautifulSoup from dependencies

This commit is contained in:
Arcady Chumachenko 2017-06-11 19:45:22 +01:00
parent 350803a4c8
commit 666760fe06
2 changed files with 19 additions and 23 deletions

View file

@@ -33,9 +33,6 @@ apt update; apt install google-chrome-beta python3 wget
# Check: # Check:
google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed." google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
# BeautifulSoup4
sudo pip3 install beautifulsoup4
``` ```
**2. Run the archive script:** **2. Run the archive script:**

View file

@@ -9,9 +9,7 @@ import os
import sys import sys
import json import json
from bs4 import BeautifulSoup from datetime import datetime
from datetime import datetime, timezone
import time import time
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE, DEVNULL
@@ -111,25 +109,26 @@ def parse_pinboard_export(html):
yield info yield info
def parse_bookmarks_export(html): def parse_bookmarks_export(html):
soup = BeautifulSoup(html, "html5lib") pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
for link in soup.find_all('a'): for line in html:
match = pattern.search(line)
if match:
url = match.group(1)
secs = match.group(2)
dt = datetime.fromtimestamp(int(secs))
url = link.get('href') info = {
secs = link.get('add_date') 'url': url,
dt = datetime.fromtimestamp(int(secs)) 'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
'time': dt,
'timestamp': secs,
'tags': "",
'title': match.group(3)
}
info = { info['type'] = get_link_type(info)
'url': url, yield info
'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
'time': dt,
'timestamp': secs,
'tags': link.get('tags'),
'title': link.string.strip(),
}
info['type'] = get_link_type(info)
yield info
### ARCHIVING FUNCTIONS ### ARCHIVING FUNCTIONS