1
0
Fork 0
mirror of synced 2024-06-28 11:00:35 +12:00

refactor: All parsers return snapshot instead of link

This commit is contained in:
Cristian 2021-01-04 09:31:14 -05:00
parent 15d88be229
commit b8efaa5b6a
11 changed files with 72 additions and 53 deletions

View file

@ -6,7 +6,8 @@ import re
from typing import IO, Iterable, Optional from typing import IO, Iterable, Optional
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -29,7 +30,7 @@ class HrefParser(HTMLParser):
@enforce_types @enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]: def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Model]:
"""Parse Generic HTML for href tags and use only the url (support for title coming later)""" """Parse Generic HTML for href tags and use only the url (support for title coming later)"""
from core.models import Snapshot from core.models import Snapshot

View file

@ -5,7 +5,8 @@ import json
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -13,8 +14,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
from core.models import Snapshot
json_file.seek(0) json_file.seek(0)
links = json.load(json_file) links = json.load(json_file)
@ -56,10 +58,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
elif link.get('name'): elif link.get('name'):
title = link['name'].strip() title = link['name'].strip()
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=ts_str, timestamp=ts_str,
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '', #tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name], #sources=[json_file.name],
) )

View file

@ -4,7 +4,8 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -12,8 +13,9 @@ from ..util import (
) )
@enforce_types @enforce_types
def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse RSS XML-format files into links""" """Parse RSS XML-format files into links"""
from core.models import Snapshot
rss_file.seek(0) rss_file.seek(0)
items = rss_file.read().split('<item>') items = rss_file.read().split('<item>')
@ -40,10 +42,10 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip() title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=None, #tags=None,
sources=[rss_file.name], #sources=[rss_file.name],
) )

View file

@ -7,7 +7,8 @@ from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -16,7 +17,7 @@ from ..util import (
@enforce_types @enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse raw links from each line in a text file""" """Parse raw links from each line in a text file"""
# TODO: Check if we should add sources list to the database # TODO: Check if we should add sources list to the database
from core.models import Snapshot from core.models import Snapshot
@ -29,12 +30,12 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
# if the line is a local file path that resolves, then we can archive it # if the line is a local file path that resolves, then we can archive it
try: try:
if Path(line).exists(): if Path(line).exists():
yield Link( yield Snapshot(
url=line, url=line,
timestamp=str(datetime.now().timestamp()), timestamp=str(datetime.now().timestamp()),
title=None, title=None,
tags=None, #tags=None,
sources=[text_file.name], #sources=[text_file.name],
) )
except (OSError, PermissionError): except (OSError, PermissionError):
# nvm, not a valid path... # nvm, not a valid path...

View file

@ -4,9 +4,10 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from django.db.models import Model
from xml.etree import ElementTree from xml.etree import ElementTree
from ..index.schema import Link
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -14,8 +15,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Medium RSS feed files into links""" """Parse Medium RSS feed files into links"""
from core.models import Snapshot
rss_file.seek(0) rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot() root = ElementTree.parse(rss_file).getroot()
@ -26,10 +28,10 @@ def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
ts_str = item.find("pubDate").text # type: ignore ts_str = item.find("pubDate").text # type: ignore
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=None, #tags=None,
sources=[rss_file.name], #sources=[rss_file.name],
) )

View file

@ -6,7 +6,8 @@ import re
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -14,8 +15,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse netscape-format bookmarks export files (produced by all browsers)""" """Parse netscape-format bookmarks export files (produced by all browsers)"""
from core.models import Snapshot
html_file.seek(0) html_file.seek(0)
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE) pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
@ -29,11 +31,11 @@ def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
time = datetime.fromtimestamp(float(match.group(2))) time = datetime.fromtimestamp(float(match.group(2)))
title = match.group(3).strip() title = match.group(3).strip()
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=None, #tags=None,
sources=[html_file.name], #sources=[html_file.name],
) )

View file

@ -4,9 +4,10 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from django.db.models import Model
from xml.etree import ElementTree from xml.etree import ElementTree
from ..index.schema import Link
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -14,8 +15,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Pinboard RSS feed files into links""" """Parse Pinboard RSS feed files into links"""
from core.models import Snapshot
rss_file.seek(0) rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot() root = ElementTree.parse(rss_file).getroot()
@ -38,10 +40,10 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
else: else:
time = datetime.now() time = datetime.now()
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=htmldecode(tags) or None, #tags=htmldecode(tags) or None,
sources=[rss_file.name], #sources=[rss_file.name],
) )

View file

@ -9,7 +9,6 @@ from configparser import ConfigParser
from pathlib import Path from pathlib import Path
from ..vendor.pocket import Pocket from ..vendor.pocket import Pocket
from ..index.schema import Link
from ..util import enforce_types from ..util import enforce_types
from ..system import atomic_write from ..system import atomic_write
from ..config import ( from ..config import (
@ -46,19 +45,21 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
api.last_since = body['since'] api.last_since = body['since']
def link_from_article(article: dict, sources: list): def snapshot_from_article(article: dict, sources: list):
from core.models import Snapshot
url: str = article['resolved_url'] or article['given_url'] url: str = article['resolved_url'] or article['given_url']
broken_protocol = _BROKEN_PROTOCOL_RE.match(url) broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
if broken_protocol: if broken_protocol:
url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://') url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
title = article['resolved_title'] or article['given_title'] or url title = article['resolved_title'] or article['given_title'] or url
return Link( return Snapshot(
url=url, url=url,
timestamp=article['time_read'], timestamp=article['time_read'],
title=title, title=title,
tags=article.get('tags'), #tags=article.get('tags'),
sources=sources #sources=sources
) )
@ -108,6 +109,6 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
api.last_since = None api.last_since = None
for article in get_pocket_articles(api, since=read_since(username)): for article in get_pocket_articles(api, since=read_since(username)):
yield link_from_article(article, sources=[line]) yield snapshot_from_article(article, sources=[line])
write_since(username, api.last_since) write_since(username, api.last_since)

View file

@ -6,7 +6,8 @@ import re
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -14,8 +15,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
from core.models import Snapshot
html_file.seek(0) html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE) pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
@ -29,10 +31,10 @@ def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
tags = match.group(3) tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=tags or '', #tags=tags or '',
sources=[html_file.name], #sources=[html_file.name],
) )

View file

@ -4,7 +4,8 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -13,8 +14,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Shaarli-specific RSS XML-format files into links""" """Parse Shaarli-specific RSS XML-format files into links"""
from core.models import Snapshot
rss_file.seek(0) rss_file.seek(0)
entries = rss_file.read().split('<entry>')[1:] entries = rss_file.read().split('<entry>')[1:]
@ -41,10 +43,10 @@ def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
ts_str = str_between(get_row('published'), '<published>', '</published>') ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=None, #tags=None,
sources=[rss_file.name], #sources=[rss_file.name],
) )

View file

@ -4,7 +4,8 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from django.db.models import Model
from ..util import ( from ..util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
@ -13,8 +14,9 @@ from ..util import (
@enforce_types @enforce_types
def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Wallabag Atom files into links""" """Parse Wallabag Atom files into links"""
from core.models import Snapshot
rss_file.seek(0) rss_file.seek(0)
entries = rss_file.read().split('<entry>')[1:] entries = rss_file.read().split('<entry>')[1:]
@ -48,10 +50,10 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
except: except:
tags = None tags = None
yield Link( yield Snapshot(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(time.timestamp()), timestamp=str(time.timestamp()),
title=htmldecode(title) or None, title=htmldecode(title) or None,
tags=tags or '', #tags=tags or '',
sources=[rss_file.name], #sources=[rss_file.name],
) )