diff --git a/archivebox/config.py b/archivebox/config.py
index 3b1211dd..2ea9f196 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -223,6 +223,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
         'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
         'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
+
+        'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
     },
 }
 
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 99d11a1b..c6f2f382 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -34,6 +34,7 @@ from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
 
 from . import pocket_api
+from . import readwise_reader_api
 from . import wallabag_atom
 from . import pocket_html
 from . import pinboard_rss
@@ -51,6 +52,7 @@ from . import url_list
 PARSERS = {
     # Specialized parsers
     pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
+    readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
     wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
     pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
     pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
diff --git a/archivebox/parsers/readwise_reader_api.py b/archivebox/parsers/readwise_reader_api.py
new file mode 100644
index 00000000..a2a0c29a
--- /dev/null
+++ b/archivebox/parsers/readwise_reader_api.py
@@ -0,0 +1,154 @@
+__package__ = "archivebox.parsers"
+
+
+import re
+import requests
+from datetime import datetime
+
+from typing import IO, Iterable, Optional
+from configparser import ConfigParser
+
+from pathlib import Path
+
+from ..index.schema import Link
+from ..util import enforce_types
+from ..system import atomic_write
+from ..config import (
+    SOURCES_DIR,
+    READWISE_READER_TOKENS,
+)
+
+
+API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
+
+# Seconds to wait on the Readwise API before giving up rather than hanging.
+API_TIMEOUT = 60
+
+
+class ReadwiseReaderAPI:
+    """Minimal client for the Readwise Reader document-list endpoint."""
+
+    cursor: Optional[str]
+
+    def __init__(self, api_token, cursor=None) -> None:
+        self.api_token = api_token
+        self.cursor = cursor
+
+    def get_archive(self):
+        """Fetch one page of archived documents starting at ``self.cursor``.
+
+        Returns the ``requests.Response``; raises ``requests.HTTPError`` on
+        an error status.
+        """
+        response = requests.get(
+            url="https://readwise.io/api/v3/list/",
+            # Authenticate with the token this client was built with;
+            # credentials must never be hard-coded in source.
+            headers={"Authorization": f"Token {self.api_token}"},
+            params={
+                "location": "archive",
+                "pageCursor": self.cursor,
+            },
+            timeout=API_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response
+
+
+def get_readwise_reader_articles(api: ReadwiseReaderAPI) -> Iterable[dict]:
+    """Yield every article from the archive, following pagination cursors.
+
+    ``api.cursor`` is advanced to each non-empty ``nextPageCursor``, so once
+    the generator is exhausted it holds the cursor of the last page fetched.
+    """
+    # Loop instead of recursing so a long archive cannot exhaust the
+    # interpreter's recursion limit.
+    while True:
+        body = api.get_archive().json()
+        yield from body["results"]
+
+        next_cursor = body.get("nextPageCursor")
+        if not next_cursor:
+            break
+        api.cursor = next_cursor
+
+
+def link_from_article(article: dict, sources: list) -> Link:
+    """Build a Link from one article dict returned by the API."""
+    url: str = article['source_url']
+    # Fall back to the URL when the article has no title.
+    title = article["title"] or url
+    timestamp = datetime.fromisoformat(article['updated_at']).timestamp()
+
+    return Link(
+        url=url,
+        timestamp=str(timestamp),
+        title=title,
+        tags="",
+        sources=sources,
+    )
+
+
+def write_cursor(username: str, since: str):
+    """Store ``since`` as the saved page cursor for ``username``."""
+    if not API_DB_PATH.exists():
+        atomic_write(API_DB_PATH, "")
+
+    since_file = ConfigParser()
+    # Keep option names case-sensitive (the default lower-cases them).
+    since_file.optionxform = str
+    since_file.read(API_DB_PATH)
+
+    since_file[username] = {"since": since}
+
+    with open(API_DB_PATH, "w+") as new:
+        since_file.write(new)
+
+
+def read_cursor(username: str) -> Optional[str]:
+    """Return the saved page cursor for ``username``, or None if there is none."""
+    if not API_DB_PATH.exists():
+        atomic_write(API_DB_PATH, "")
+
+    config_file = ConfigParser()
+    config_file.optionxform = str
+    config_file.read(API_DB_PATH)
+
+    return config_file.get(username, "since", fallback=None)
+
+
+@enforce_types
+def should_parse_as_readwise_reader_api(text: str) -> bool:
+    """Return True when ``text`` starts with the ``readwise-reader://`` scheme."""
+    return text.startswith("readwise-reader://")
+
+
+@enforce_types
+def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
+    """Parse bookmarks from the Readwise Reader API"""
+
+    input_buffer.seek(0)
+    pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
+    for line in input_buffer:
+        if not should_parse_as_readwise_reader_api(line):
+            continue
+
+        match = pattern.search(line)
+        if match is None:
+            # Scheme present but no username after it, so there is no
+            # token to look up; skip instead of crashing on ``None``.
+            continue
+
+        username = match.group(1)
+        api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
+
+        for article in get_readwise_reader_articles(api):
+            yield link_from_article(article, sources=[line])
+
+        if api.cursor:
+            write_cursor(username, api.cursor)
+
+
+KEY = "readwise_reader_api"
+NAME = "Readwise Reader API"
+PARSER = parse_readwise_reader_api_export