1
0
Fork 0
mirror of synced 2024-06-20 19:30:15 +12:00

#18 - Parse browser bookmarks

This commit is contained in:
Arcady Chumachenko 2017-06-08 21:52:39 +01:00
parent be0abf11fd
commit 8488c57026
2 changed files with 29 additions and 1 deletions

View file

@ -33,6 +33,9 @@ apt update; apt install google-chrome-beta python3 wget
# Check: # Check:
google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed." google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
# BeautifulSoup4 and the html5lib parser it is used with
sudo pip3 install beautifulsoup4 html5lib
``` ```
**2. Run the archive script:** **2. Run the archive script:**

View file

@ -9,7 +9,9 @@ import os
import sys import sys
import json import json
from datetime import datetime from bs4 import BeautifulSoup
from datetime import datetime, timezone
import time import time
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE, DEVNULL
@ -108,6 +110,27 @@ def parse_pinboard_export(html):
info['type'] = get_link_type(info) info['type'] = get_link_type(info)
yield info yield info
def parse_bookmarks_export(html):
    """Yield one link-info dict per usable ``<a>`` tag in a bookmarks export.

    Args:
        html: Markup to parse; handed straight to ``BeautifulSoup`` with the
            ``html5lib`` parser (the ``html5lib`` package must be installed).
            NOTE(review): the caller appears to pass the open export file
            rather than a string -- BeautifulSoup accepts either.

    Yields:
        dict with the keys ``url``, ``domain``, ``base_url``, ``time`` (a
        naive local ``datetime`` built from ``add_date``), ``timestamp``
        (the ``add_date`` attribute as a string), ``tags`` (raw ``tags``
        attribute, or ``None`` when absent), ``title`` and ``type`` (the
        result of ``get_link_type``).

    Anchors without an ``href`` or without a usable numeric ``add_date`` are
    skipped instead of aborting the whole import: they have no URL to archive
    or no timestamp to sort and resume by.
    """
    soup = BeautifulSoup(html, "html5lib")
    for link in soup.find_all('a'):
        url = link.get('href')
        secs = (link.get('add_date') or '').strip()
        if not url or not secs:
            continue

        try:
            dt = datetime.fromtimestamp(int(secs))
        except (ValueError, OverflowError, OSError):
            # Non-numeric or out-of-range add_date: no usable timestamp.
            continue

        info = {
            'url': url,
            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
            'time': dt,
            'timestamp': secs,
            'tags': link.get('tags'),
            # link.string is None for empty anchors or anchors with nested
            # markup; get_text() always returns a str.
            'title': link.get_text().strip(),
        }
        info['type'] = get_link_type(info)
        yield info
### ARCHIVING FUNCTIONS ### ARCHIVING FUNCTIONS
@ -278,6 +301,8 @@ def create_archive(export_file, service, resume=None):
links = parse_pocket_export(f) links = parse_pocket_export(f)
elif service == "pinboard": elif service == "pinboard":
links = parse_pinboard_export(f) links = parse_pinboard_export(f)
elif service == "bookmarks":
links = parse_bookmarks_export(f)
links = list(reversed(sorted(links, key=lambda l: l['timestamp']))) # most recent first links = list(reversed(sorted(links, key=lambda l: l['timestamp']))) # most recent first
if resume: if resume:
links = [link for link in links if link['timestamp'] >= resume] links = [link for link in links if link['timestamp'] >= resume]