1
0
Fork 0
mirror of synced 2024-05-16 10:23:51 +12:00

Merge pull request #21 from ilvar/issue-18-rm-bs4-dependency

Removed BeautifulSoup from dependencies
This commit is contained in:
Nick Sweeting 2017-06-12 21:39:54 -05:00 committed by GitHub
commit 5c4f17f5ad
2 changed files with 19 additions and 23 deletions

View file

@ -33,9 +33,6 @@ apt update; apt install google-chrome-beta python3 wget
# Check:
google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
# BeautifulSoup4
sudo pip3 install beautifulsoup4
```
**2. Run the archive script:**

View file

@ -9,9 +9,7 @@ import os
import sys
import json
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from datetime import datetime
import time
from subprocess import run, PIPE, DEVNULL
@ -111,25 +109,26 @@ def parse_pinboard_export(html):
yield info
def parse_bookmarks_export(html):
    """Yield one link-info dict per bookmark anchor found in ``html``.

    ``html`` may be an iterable of text lines (e.g. an open text file) or a
    single string; a string is split into lines first so that it is not
    iterated character by character.

    A line is recognised when it contains an anchor of the form
    ``<a href="URL" add_date="DIGITS" ...>TITLE</a>`` (matched
    case-insensitively).  Only the first such anchor on a line is used;
    lines without one are skipped.

    Each yielded dict has the keys:

    * ``url``       - the ``href`` value as written in the export
    * ``domain``    - ``url`` without its http/https scheme, up to the first ``/``
    * ``base_url``  - ``url`` without its http/https scheme, up to the first ``?``
    * ``time``      - naive ``datetime`` built from ``add_date`` (local time)
    * ``timestamp`` - the raw ``add_date`` digits, kept as a string
    * ``tags``      - always ``""``; this parser does not extract tags
    * ``title``     - the text between the opening and closing anchor tags
    * ``type``      - whatever ``get_link_type(info)`` returns
    """
    pattern = re.compile(
        r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>',
        re.UNICODE | re.IGNORECASE,
    )

    # Iterating a str directly would walk it one character at a time and
    # never match, so normalise it to a list of lines.
    lines = html.splitlines() if isinstance(html, str) else html

    for line in lines:
        match = pattern.search(line)
        if not match:
            continue

        url = match.group(1)
        secs = match.group(2)
        dt = datetime.fromtimestamp(int(secs))

        info = {
            'url': url,
            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
            'time': dt,
            'timestamp': secs,
            'tags': "",
            'title': match.group(3),
        }

        info['type'] = get_link_type(info)
        yield info
### ARCHIVING FUNCTIONS