diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py
index 99aebf5c..b3e019e2 100644
--- a/archivebox/parsers/generic_html.py
+++ b/archivebox/parsers/generic_html.py
@@ -6,7 +6,8 @@ import re
from typing import IO, Iterable, Optional
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -29,7 +30,7 @@ class HrefParser(HTMLParser):
@enforce_types
-def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
+def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Model]:
"""Parse Generic HTML for href tags and use only the url (support for title coming later)"""
from core.models import Snapshot
diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py
index e6ed6772..d0371069 100644
--- a/archivebox/parsers/generic_json.py
+++ b/archivebox/parsers/generic_json.py
@@ -5,7 +5,8 @@ import json
from typing import IO, Iterable
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -13,8 +14,9 @@ from ..util import (
@enforce_types
-def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
+ from core.models import Snapshot
json_file.seek(0)
links = json.load(json_file)
@@ -56,10 +58,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
elif link.get('name'):
title = link['name'].strip()
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
- tags=htmldecode(link.get('tags')) or '',
- sources=[json_file.name],
+ #tags=htmldecode(link.get('tags')) or '',
+ #sources=[json_file.name],
)
diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py
index 28318444..49e38000 100644
--- a/archivebox/parsers/generic_rss.py
+++ b/archivebox/parsers/generic_rss.py
@@ -4,7 +4,8 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -12,8 +13,9 @@ from ..util import (
)
@enforce_types
-def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse RSS XML-format files into links"""
+ from core.models import Snapshot
rss_file.seek(0)
items = rss_file.read().split('- ')
@@ -40,10 +42,10 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), ' Iterable[Link]:
+def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse raw links from each line in a text file"""
# TODO: Check if we should add sources list to the database
from core.models import Snapshot
@@ -29,12 +30,12 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
# if the line is a local file path that resolves, then we can archive it
try:
if Path(line).exists():
- yield Link(
+ yield Snapshot(
url=line,
timestamp=str(datetime.now().timestamp()),
title=None,
- tags=None,
- sources=[text_file.name],
+ #tags=None,
+ #sources=[text_file.name],
)
except (OSError, PermissionError):
# nvm, not a valid path...
diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py
index 8f14f773..5afa4985 100644
--- a/archivebox/parsers/medium_rss.py
+++ b/archivebox/parsers/medium_rss.py
@@ -4,9 +4,10 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
+from django.db.models import Model
+
from xml.etree import ElementTree
-from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
@@ -14,8 +15,9 @@ from ..util import (
@enforce_types
-def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Medium RSS feed files into links"""
+ from core.models import Snapshot
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
@@ -26,10 +28,10 @@ def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
ts_str = item.find("pubDate").text # type: ignore
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
- tags=None,
- sources=[rss_file.name],
+ #tags=None,
+ #sources=[rss_file.name],
)
diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py
index a063023c..fe47cf39 100644
--- a/archivebox/parsers/netscape_html.py
+++ b/archivebox/parsers/netscape_html.py
@@ -6,7 +6,8 @@ import re
from typing import IO, Iterable
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -14,8 +15,9 @@ from ..util import (
@enforce_types
-def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
+ from core.models import Snapshot
html_file.seek(0)
pattern = re.compile("]*>(.+)", re.UNICODE | re.IGNORECASE)
@@ -29,11 +31,11 @@ def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
time = datetime.fromtimestamp(float(match.group(2)))
title = match.group(3).strip()
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
- tags=None,
- sources=[html_file.name],
+ #tags=None,
+ #sources=[html_file.name],
)
diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py
index 98ff14a3..fd3550af 100644
--- a/archivebox/parsers/pinboard_rss.py
+++ b/archivebox/parsers/pinboard_rss.py
@@ -4,9 +4,10 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
+from django.db.models import Model
+
from xml.etree import ElementTree
-from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
@@ -14,8 +15,9 @@ from ..util import (
@enforce_types
-def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Pinboard RSS feed files into links"""
+ from core.models import Snapshot
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
@@ -38,10 +40,10 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
else:
time = datetime.now()
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
- tags=htmldecode(tags) or None,
- sources=[rss_file.name],
+ #tags=htmldecode(tags) or None,
+ #sources=[rss_file.name],
)
diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py
index bf3a292b..d79ace94 100644
--- a/archivebox/parsers/pocket_api.py
+++ b/archivebox/parsers/pocket_api.py
@@ -9,7 +9,6 @@ from configparser import ConfigParser
from pathlib import Path
from ..vendor.pocket import Pocket
-from ..index.schema import Link
from ..util import enforce_types
from ..system import atomic_write
from ..config import (
@@ -46,19 +45,21 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
api.last_since = body['since']
-def link_from_article(article: dict, sources: list):
+def snapshot_from_article(article: dict, sources: list):
+ from core.models import Snapshot
+
url: str = article['resolved_url'] or article['given_url']
broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
if broken_protocol:
url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
title = article['resolved_title'] or article['given_title'] or url
- return Link(
+ return Snapshot(
url=url,
timestamp=article['time_read'],
title=title,
- tags=article.get('tags'),
- sources=sources
+ #tags=article.get('tags'),
+ #sources=sources
)
@@ -108,6 +109,6 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
api.last_since = None
for article in get_pocket_articles(api, since=read_since(username)):
- yield link_from_article(article, sources=[line])
+ yield snapshot_from_article(article, sources=[line])
write_since(username, api.last_since)
diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py
index 653f21b8..91fd9a75 100644
--- a/archivebox/parsers/pocket_html.py
+++ b/archivebox/parsers/pocket_html.py
@@ -6,7 +6,8 @@ import re
from typing import IO, Iterable
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -14,8 +15,9 @@ from ..util import (
@enforce_types
-def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
+ from core.models import Snapshot
html_file.seek(0)
pattern = re.compile("^\\s*(.+)", re.UNICODE)
@@ -29,10 +31,10 @@ def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
- tags=tags or '',
- sources=[html_file.name],
+ #tags=tags or '',
+ #sources=[html_file.name],
)
diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py
index 4a925f46..8ad16e74 100644
--- a/archivebox/parsers/shaarli_rss.py
+++ b/archivebox/parsers/shaarli_rss.py
@@ -4,7 +4,8 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -13,8 +14,9 @@ from ..util import (
@enforce_types
-def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Shaarli-specific RSS XML-format files into links"""
+ from core.models import Snapshot
rss_file.seek(0)
entries = rss_file.read().split('')[1:]
@@ -41,10 +43,10 @@ def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
ts_str = str_between(get_row('published'), '', '')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
- tags=None,
- sources=[rss_file.name],
+ #tags=None,
+ #sources=[rss_file.name],
)
diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py
index 0d77869f..c3e6971d 100644
--- a/archivebox/parsers/wallabag_atom.py
+++ b/archivebox/parsers/wallabag_atom.py
@@ -4,7 +4,8 @@ __package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
-from ..index.schema import Link
+from django.db.models import Model
+
from ..util import (
htmldecode,
enforce_types,
@@ -13,8 +14,9 @@ from ..util import (
@enforce_types
-def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
+def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
"""Parse Wallabag Atom files into links"""
+ from core.models import Snapshot
rss_file.seek(0)
entries = rss_file.read().split('')[1:]
@@ -48,10 +50,10 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
except:
tags = None
- yield Link(
+ yield Snapshot(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
- tags=tags or '',
- sources=[rss_file.name],
+ #tags=tags or '',
+ #sources=[rss_file.name],
)