1
0
Fork 0
mirror of synced 2024-06-26 18:10:24 +12:00

Merge pull request #528 from mAAdhaTTah/pull-from-pocket-api

Add parser for Pocket API
This commit is contained in:
Nick Sweeting 2020-12-04 22:58:41 -05:00 committed by GitHub
commit f467435797
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 122 additions and 1 deletions

View file

@ -159,6 +159,9 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'NODE_BINARY': {'type': str, 'default': 'node'},
'CHROME_BINARY': {'type': str, 'default': None},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
},
}
@ -386,7 +389,7 @@ def load_config_val(key: str,
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
return int(val)
elif type is list:
elif type is list or type is dict:
return json.loads(val)
raise Exception('Config values can only be str, bool, int or json')

View file

@ -32,6 +32,7 @@ from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
from .pocket_html import parse_pocket_html_export
from .pocket_api import parse_pocket_api_export
from .pinboard_rss import parse_pinboard_rss_export
from .wallabag_atom import parse_wallabag_atom_export
from .shaarli_rss import parse_shaarli_rss_export
@ -44,6 +45,7 @@ from .generic_txt import parse_generic_txt_export
PARSERS = (
# Specialized parsers
('Pocket API', parse_pocket_api_export),
('Wallabag ATOM', parse_wallabag_atom_export),
('Pocket HTML', parse_pocket_html_export),
('Pinboard RSS', parse_pinboard_rss_export),

View file

@ -0,0 +1,115 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable, Optional
from datetime import datetime
from configparser import ConfigParser
from pathlib import Path
from pocket import Pocket
import requests
from ..index.schema import Link
from ..util import (
enforce_types,
)
from ..config import (
SOURCES_DIR
)
# Number of articles requested per Pocket API call; the same value is the
# stride used to turn a page index into an `offset`.
_COUNT_PER_PAGE = 500
# ConfigParser-format file that keeps, per Pocket username, the last `since`
# value written by write_since() and read back by read_since().
_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
# (matches a leading "http:/" or "https:/" that is NOT followed by a second slash)
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
def get_pocket_articles(api: Pocket, since=None, page=0):
    """Yield every article the Pocket API returns, one page at a time.

    Pages of ``_COUNT_PER_PAGE`` items are requested with ``state='archive'``
    and ``sort='oldest'``, starting at ``page``. A response holding a different
    number of items than was requested is treated as the final page; the
    ``since`` value of that response is then stored on ``api.last_since`` so
    the caller can persist it.

    Args:
        api: Client whose ``get(...)`` returns a ``(body, headers)`` pair.
        since: Passed through unchanged as the ``since`` argument of ``api.get``.
        page: Zero-based index of the first page to request.

    Yields:
        The items of each response's ``body['list']``, in the order returned.
    """
    # Paginate with a loop instead of recursing once per page, so a long
    # history neither nests generators nor approaches the recursion limit.
    while True:
        body, _headers = api.get(
            state='archive',
            sort='oldest',
            since=since,
            count=_COUNT_PER_PAGE,
            offset=page * _COUNT_PER_PAGE,
        )

        # 'list' is accepted either as a dict (its values are the articles)
        # or as a plain sequence of articles.
        listing = body['list']
        articles = listing.values() if isinstance(listing, dict) else listing
        returned_count = len(articles)

        yield from articles

        if returned_count != _COUNT_PER_PAGE:
            break
        page += 1

    # Only set once the final page has been fully consumed.
    api.last_since = body['since']
def link_from_article(article: dict, sources: list):
    """Build a ``Link`` from one article dict returned by the Pocket API.

    The URL is taken from ``resolved_url`` when present and non-empty,
    otherwise from ``given_url``. A scheme written with a single slash
    (``http:/host`` or ``https:/host``) is repaired to use ``://``. The title
    falls back from ``resolved_title`` to ``given_title`` to the URL itself.

    Args:
        article: Mapping with at least ``given_url`` and ``time_read``.
        sources: Stored unchanged as the link's ``sources``.

    Returns:
        A ``Link`` whose timestamp is the article's ``time_read``.

    Raises:
        KeyError: If ``given_url`` is needed but missing, or ``time_read`` is missing.
    """
    # Missing resolved_* keys are treated like empty ones, which already fall
    # back to the given_* values.
    url: str = article.get('resolved_url') or article['given_url']

    # The pattern is anchored at the start of the string, so only the scheme
    # separator is rewritten. A plain str.replace would also corrupt any
    # well-formed "http://" embedded later in the URL (e.g. in a query string).
    url = _BROKEN_PROTOCOL_RE.sub(r'\1://', url)

    title = article.get('resolved_title') or article.get('given_title') or url

    return Link(
        url=url,
        timestamp=article['time_read'],
        title=title,
        tags=article.get('tags'),
        sources=sources,
    )
def write_since(username: str, since: str):
    """Store ``since`` under the ``username`` section of the state file.

    The file at ``_API_DB_PATH`` is created empty if it does not exist, loaded
    with ``ConfigParser``, the ``username`` section is replaced by a single
    ``since`` option, and the whole file is written back.

    Args:
        username: Section name to write.
        since: Value stored as that section's ``since`` option.
    """
    from ..system import atomic_write

    db_missing = not _API_DB_PATH.exists()
    if db_missing:
        atomic_write(_API_DB_PATH, '')

    state = ConfigParser()
    # Keep option names exactly as written instead of lower-casing them.
    state.optionxform = str
    state.read(_API_DB_PATH)

    state[username] = {'since': since}

    with open(_API_DB_PATH, 'w+') as handle:
        state.write(handle)
def read_since(username: str) -> Optional[str]:
    """Return the ``since`` value stored for ``username``, or ``None``.

    The file at ``_API_DB_PATH`` is created empty if it does not exist before
    it is loaded with ``ConfigParser``.

    Args:
        username: Section name to look up.

    Returns:
        The section's ``since`` option, or ``None`` when it is not present.
    """
    from ..system import atomic_write

    db_missing = not _API_DB_PATH.exists()
    if db_missing:
        atomic_write(_API_DB_PATH, '')

    state = ConfigParser()
    # Keep option names exactly as written instead of lower-casing them.
    state.optionxform = str
    state.read(_API_DB_PATH)

    stored = state.get(username, 'since', fallback=None)
    return stored
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Return True when ``text`` begins with the ``pocket://`` scheme."""
    scheme = 'pocket://'
    return text[:len(scheme)] == scheme
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Pocket API

    Every line of ``input_buffer`` that starts with ``pocket://`` and is
    followed by a username made of word characters is handled in turn: a
    ``Pocket`` client is built from ``POCKET_CONSUMER_KEY`` and that user's
    entry in ``POCKET_ACCESS_TOKENS``, articles are fetched starting from the
    ``since`` value previously stored for the user, one ``Link`` is yielded
    per article, and the new ``since`` value is stored afterwards.

    Lines that do not start with ``pocket://``, or that carry no username
    after it, are skipped.

    Args:
        input_buffer: Seekable text stream; it is rewound before reading.
        **_kwargs: Accepted and ignored.

    Yields:
        One ``Link`` per article, with the originating line as its source.

    Raises:
        KeyError: If the username has no entry in ``POCKET_ACCESS_TOKENS``.
    """
    input_buffer.seek(0)
    # Raw string: "\/" and "\w" are regex escapes, not Python string escapes.
    pattern = re.compile(r"^pocket:\/\/(\w+)")

    for line in input_buffer:
        if not should_parse_as_pocket_api(line):
            continue

        username_match = pattern.search(line)
        if username_match is None:
            # e.g. a bare "pocket://" with nothing usable after the scheme
            continue

        # Imported lazily, only once a pocket:// line is actually found.
        from ..config import (
            POCKET_CONSUMER_KEY,
            POCKET_ACCESS_TOKENS,
        )

        username = username_match.group(1)
        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
        api.last_since = None

        for article in get_pocket_articles(api, since=read_since(username)):
            yield link_from_article(article, sources=[line])

        # Reached only after the article generator is exhausted, by which
        # point it has set api.last_since.
        write_since(username, api.last_since)

View file

@ -59,6 +59,7 @@ setuptools.setup(
"python-crontab==2.5.1",
"croniter==0.3.34",
"w3lib==1.22.0",
"pocket==0.3.6",
# Some/all of these will likely be added in the future:
# wpull
# pywb