ArchiveBox/archivebox/extractors/__init__.py

__package__ = 'archivebox.extractors'

import os
from pathlib import Path

from typing import Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet

from ..index.schema import Link
from ..index.sql import write_link_to_sql_index
from ..index import (
    load_link_details,
    write_link_details,
)
from ..util import enforce_types
from ..logging_util import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)
from ..search import write_search_index

from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .singlefile import should_save_singlefile, save_singlefile
from .readability import should_save_readability, save_readability
from .mercury import should_save_mercury, save_mercury
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers


def get_default_archive_methods():
    """Return the default archive methods as a new list of
    (name, should_save function, save function) tuples.

    archive_link() walks the list from first to last, so the position of an
    entry decides when that method runs relative to the others.
    """
    default_methods = (
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('headers', should_save_headers, save_headers),
        ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('wget', should_save_wget, save_wget),
        # keep readability below wget and singlefile, as it depends on them
        ('readability', should_save_readability, save_readability),
        ('mercury', should_save_mercury, save_mercury),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    )
    # every call hands back its own list object
    return list(default_methods)

# (method name, rank) pairs.
# NOTE(review): presumably a lower rank means that method's output is preferred
# when indexing -- confirm against the code that consumes this constant.
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
    ('readability', 1),
    ('singlefile', 2),
    ('dom', 3),
    ('wget', 4),
]

@enforce_types
def ignore_methods(to_ignore: List[str]):
    """Return the names of the default archive methods that are not in to_ignore.

    The names keep the order they have in get_default_archive_methods().
    """
    return [
        entry[0]
        for entry in get_default_archive_methods()
        if entry[0] not in to_ignore
    ]

@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
    """Run the archive methods for a single link and persist their results.

    The Snapshot matching ``link.url`` is looked up (or created through
    write_link_to_sql_index() when none exists), the output directory is
    created if missing, and each selected archive method is run in order.
    Every result is appended to ``link.history``, passed to the search index,
    and stored as an ArchiveResult row attached to the Snapshot.

    Args:
        link: the Link to archive.
        overwrite: forwarded to each method's ``should_save_*`` check.
        methods: optional names of the archive methods to consider; when
            empty or None every entry of get_default_archive_methods() is
            considered.
        out_dir: directory to archive into; defaults to ``link.link_dir``.

    Returns:
        The Link after archiving, with its ``updated`` time replaced and, when
        the newest fetched title is non-empty and at least as long as the
        current one, its title replaced.

    Raises:
        Exception: wraps any exception raised while checking or running an
            individual archive method (chained to the original error).
        KeyboardInterrupt: re-raised after a best-effort attempt to write the
            link details to disk.
    """

    # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
    from core.models import Snapshot, ArchiveResult
    try:
        snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
    except Snapshot.DoesNotExist:
        snapshot = write_link_to_sql_index(link)

    ARCHIVE_METHODS = get_default_archive_methods()

    if methods:
        # restrict the run to the explicitly requested method names
        ARCHIVE_METHODS = [
            method for method in ARCHIVE_METHODS
            if method[0] in methods
        ]

    out_dir = out_dir or Path(link.link_dir)
    try:
        is_new = not Path(out_dir).exists()
        if is_new:
            # exist_ok: the directory may appear between the check above and this call
            os.makedirs(out_dir, exist_ok=True)

        link = load_link_details(link, out_dir=out_dir)
        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
        log_link_archiving_started(link, out_dir, is_new)
        link = link.overwrite(updated=datetime.now(timezone.utc))
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, out_dir, overwrite):
                    log_archive_method_started(method_name)

                    result = method_function(link=link, out_dir=out_dir)

                    link.history[method_name].append(result)

                    stats[result.status] += 1
                    log_archive_method_finished(result)
                    write_search_index(link=link, texts=result.index_texts)
                    ArchiveResult.objects.create(
                        snapshot=snapshot,
                        extractor=method_name,
                        cmd=result.cmd,
                        cmd_version=result.cmd_version,
                        output=result.output,
                        pwd=result.pwd,
                        start_ts=result.start_ts,
                        end_ts=result.end_ts,
                        status=result.status,
                    )

                    # bump the updated time on the main Snapshot here, this is critical
                    # to be able to cache summaries of the ArchiveResults for a given
                    # snapshot without having to load all the results from the DB each time.
                    # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
                    # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
                    snapshot.save()
                else:
                    # print('{black}      X {}{reset}'.format(method_name, **ANSI))
                    stats['skipped'] += 1
            except Exception as e:
                # re-raise with the method name and URL attached, keeping the original as the cause
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e

        # print('    ', stats)

        # Best effort: adopt the newest fetched title when it is non-empty and
        # not shorter than the current one. Any failure here (no 'title'
        # history, empty history, output without .strip(), ...) is ignored on purpose.
        try:
            latest_title = link.history['title'][-1].output.strip()
            if latest_title and len(latest_title) >= len(link.title or ''):
                link = link.overwrite(title=latest_title)
        except Exception:
            pass

        write_link_details(link, out_dir=out_dir, skip_sql_index=False)

        log_link_archiving_finished(link, link.link_dir, is_new, stats)

    except KeyboardInterrupt:
        # Best-effort flush of what we have so far before propagating the interrupt.
        # Only ordinary errors are suppressed, so a second Ctrl-C or a SystemExit
        # raised during the write is no longer silently swallowed.
        # NOTE(review): this writes to link.link_dir rather than out_dir -- confirm intended.
        try:
            write_link_details(link, out_dir=link.link_dir)
        except Exception:
            pass
        raise

    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link

@enforce_types
def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
    """Archive every entry of all_links, one after another, via archive_link().

    Args:
        all_links: either a QuerySet whose rows provide ``as_link()``, or a
            sized iterable of Link objects.
        overwrite: forwarded to archive_link().
        methods: forwarded to archive_link().
        out_dir: not read by this function; each entry is archived into its
            own ``link_dir``.

    Returns:
        An empty list when there is nothing to archive, otherwise the same
        all_links object that was passed in.

    Raises:
        SystemExit: with exit code 0 when archiving is interrupted by a
            KeyboardInterrupt.
    """

    # isinstance (rather than an exact type match) so QuerySet subclasses take this path too
    if isinstance(all_links, QuerySet):
        num_links: int = all_links.count()
        get_link = lambda x: x.as_link()
        # iterate through a separate name so the caller's QuerySet, not a
        # spent iterator, is what gets returned at the end
        links_iter = all_links.iterator()
    else:
        num_links = len(all_links)
        get_link = lambda x: x
        links_iter = all_links

    if num_links == 0:
        return []

    log_archiving_started(num_links)
    idx: int = 0
    link = None  # stays None if we are interrupted before the first entry is fetched
    try:
        for link in links_iter:
            idx += 1
            to_archive = get_link(link)
            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
    except KeyboardInterrupt:
        # without an entry in hand there is no timestamp to report
        if link is not None:
            log_archiving_paused(num_links, idx, link.timestamp)
        raise SystemExit(0)
    except BaseException:
        print()
        raise

    log_archiving_finished(num_links)
    return all_links
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`__package__ = 'archivebox.extractors'`

			`import os`
test: Fix tests post-rebase 2020-09-16 07:05:48 +12:00			`from pathlib import Path`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
lint: Remove unused import 2020-09-15 06:38:32 +12:00			`from typing import Optional, List, Iterable, Union`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 20:19:30 +12:00			`from datetime import datetime, timezone`
refactor: Change archive_links check to focus on queryset, so it allows other iterables and not just lists 2020-09-09 02:36:06 +12:00			`from django.db.models import QuerySet`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
			`from ..index.schema import Link`
fix: Handle case when update tries to re-add a link that is not in the sql index 2020-11-05 09:02:54 +13:00			`from ..index.sql import write_link_to_sql_index`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`from ..index import (`
			`load_link_details,`
			`write_link_details,`
			`)`
			`from ..util import enforce_types`
fix: Rename logging folder to avoid naming conflicts (and circular import issues) 2020-07-23 04:02:13 +12:00			`from ..logging_util import (`
fix depth flag and tweak logging 2020-07-14 03:26:30 +12:00			`log_archiving_started,`
			`log_archiving_paused,`
			`log_archiving_finished,`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`log_link_archiving_started,`
			`log_link_archiving_finished,`
			`log_archive_method_started,`
			`log_archive_method_finished,`
			`)`
Initial implementation 2020-11-18 12:42:57 +13:00			`from ..search import write_search_index`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
			`from .title import should_save_title, save_title`
			`from .favicon import should_save_favicon, save_favicon`
			`from .wget import should_save_wget, save_wget`
feat: Add initial support for singlefile extractor 2020-07-31 06:23:10 +12:00			`from .singlefile import should_save_singlefile, save_singlefile`
feat: Initial version of readability extractor 2020-08-08 01:05:17 +12:00			`from .readability import should_save_readability, save_readability`
fix: add mercury-parser to extractors list 2020-09-22 20:55:14 +12:00			`from .mercury import should_save_mercury, save_mercury`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`from .pdf import should_save_pdf, save_pdf`
			`from .screenshot import should_save_screenshot, save_screenshot`
			`from .dom import should_save_dom, save_dom`
			`from .git import should_save_git, save_git`
			`from .media import should_save_media, save_media`
			`from .archive_org import should_save_archive_dot_org, save_archive_dot_org`
Added headers extractor 2020-09-12 02:06:52 +12:00			`from .headers import should_save_headers, save_headers`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
Add ArchiveResult Manager and sorted indexable filter 2020-11-24 07:04:38 +13:00
fix: Remove title from extractors for oneshot 2020-08-01 03:24:58 +12:00			`def get_default_archive_methods():`
			`return [`
add overwrite flag to add command to force re-archiving 2020-08-18 20:37:54 +12:00			`('title', should_save_title, save_title),`
			`('favicon', should_save_favicon, save_favicon),`
bump Snapshot.updated time after each extractor, change extractor order 2021-02-17 09:52:18 +13:00			`('headers', should_save_headers, save_headers),`
add overwrite flag to add command to force re-archiving 2020-08-18 20:37:54 +12:00			`('singlefile', should_save_singlefile, save_singlefile),`
			`('pdf', should_save_pdf, save_pdf),`
			`('screenshot', should_save_screenshot, save_screenshot),`
			`('dom', should_save_dom, save_dom),`
bump Snapshot.updated time after each extractor, change extractor order 2021-02-17 09:52:18 +13:00			`('wget', should_save_wget, save_wget),`
			`('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them`
fix: add mercury-parser to extractors list 2020-09-22 20:55:14 +12:00			`('mercury', should_save_mercury, save_mercury),`
add overwrite flag to add command to force re-archiving 2020-08-18 20:37:54 +12:00			`('git', should_save_git, save_git),`
			`('media', should_save_media, save_media),`
			`('archive_org', should_save_archive_dot_org, save_archive_dot_org),`
			`]`
fix: Remove title from extractors for oneshot 2020-08-01 03:24:58 +12:00
Add ArchiveResult Manager and sorted indexable filter 2020-11-24 07:04:38 +13:00			`ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]`

fix: Remove title from extractors for oneshot 2020-08-01 03:24:58 +12:00			`@enforce_types`
			`def ignore_methods(to_ignore: List[str]):`
			`ARCHIVE_METHODS = get_default_archive_methods()`
			`methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)`
fix: oneshot command not running extractors 2020-09-25 04:24:34 +12:00			`methods = map(lambda x: x[0], methods)`
fix: Remove title from extractors for oneshot 2020-08-01 03:24:58 +12:00			`return list(methods)`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00
			`@enforce_types`
refactor: Remove `skip_index` from archive related functions 2020-12-09 12:42:01 +13:00			`def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""`

feat: Create ArchiveResult after finishing an extractor process 2020-11-05 05:22:55 +13:00			`# TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.`
refactor: Remove `skip_index` from archive related functions 2020-12-09 12:42:01 +13:00			`from core.models import Snapshot, ArchiveResult`
			`try:`
			`snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot`
			`except Snapshot.DoesNotExist:`
			`snapshot = write_link_to_sql_index(link)`
feat: Create ArchiveResult after finishing an extractor process 2020-11-05 05:22:55 +13:00
fix: Remove title from extractors for oneshot 2020-08-01 03:24:58 +12:00			`ARCHIVE_METHODS = get_default_archive_methods()`

fix pull title not working 2020-08-19 00:49:26 +12:00			`if methods:`
accept methods argument to filder archive_link 2020-07-28 21:58:38 +12:00			`ARCHIVE_METHODS = [`
			`method for method in ARCHIVE_METHODS`
fix pull title not working 2020-08-19 00:49:26 +12:00			`if method[0] in methods`
accept methods argument to filder archive_link 2020-07-28 21:58:38 +12:00			`]`
fix config file atomic writing bugs 2020-06-30 18:04:16 +12:00
test: Fix tests post-rebase 2020-09-16 07:05:48 +12:00			`out_dir = out_dir or Path(link.link_dir)`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`try:`
replaced os.path in init extractors 2020-10-01 08:43:35 +13:00			`is_new = not Path(out_dir).exists()`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`if is_new:`
			`os.makedirs(out_dir)`

			`link = load_link_details(link, out_dir=out_dir)`
refactor: Remove `skip_index` from archive related functions 2020-12-09 12:42:01 +13:00			`write_link_details(link, out_dir=out_dir, skip_sql_index=False)`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`log_link_archiving_started(link, out_dir, is_new)`
add timezone support, tons of CSS and layout improvements, more detailed snapshot admin form info, ability to sort by recently updated, better grid view styling, better table layouts, better dark mode support 2021-04-10 20:19:30 +12:00			`link = link.overwrite(updated=datetime.now(timezone.utc))`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}`

			`for method_name, should_run, method_function in ARCHIVE_METHODS:`
			`try:`
			`if method_name not in link.history:`
			`link.history[method_name] = []`
fix config file atomic writing bugs 2020-06-30 18:04:16 +12:00
Refactor `should_save_extractor` methods to accept `overwrite` parameter 2021-01-22 10:45:11 +13:00			`if should_run(link, out_dir, overwrite):`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`log_archive_method_started(method_name)`

			`result = method_function(link=link, out_dir=out_dir)`

			`link.history[method_name].append(result)`

			`stats[result.status] += 1`
			`log_archive_method_finished(result)`
refactor: Remove `skip_index` from archive related functions 2020-12-09 12:42:01 +13:00			`write_search_index(link=link, texts=result.index_texts)`
			`ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,`
feat: Create ArchiveResult after finishing an extractor process 2020-11-05 05:22:55 +13:00			`output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)`
add more explanation about snapshot.save timestamp bump 2021-02-18 07:34:46 +13:00

			`# bump the updated time on the main Snapshot here, this is critical`
			`# to be able to cache summaries of the ArchiveResults for a given`
			`# snapshot without having to load all the results from the DB each time.`
			`# (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume`
			`# ArchiveResults are unchanged as long as the updated timestamp is unchanged)`
			`snapshot.save()`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`else:`
dont show skipped extractors to reduce visual noise 2020-08-19 00:13:35 +12:00			`# print('{black} X {}{reset}'.format(method_name, **ANSI))`
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`stats['skipped'] += 1`
			`except Exception as e:`
			`raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(`
			`method_name,`
			`link.url,`
			`)) from e`

			`# print(' ', stats)`

update Snapshot.title to latest_title after fetching 2020-07-28 21:55:09 +12:00			`try:`
			`latest_title = link.history['title'][-1].output.strip()`
			`if latest_title and len(latest_title) >= len(link.title or ''):`
			`link = link.overwrite(title=latest_title)`
			`except Exception:`
			`pass`

refactor: Remove `skip_index` from archive related functions 2020-12-09 12:42:01 +13:00			`write_link_details(link, out_dir=out_dir, skip_sql_index=False)`
fix config file atomic writing bugs 2020-06-30 18:04:16 +12:00
move everything out of legacy folder 2019-04-28 09:26:24 +12:00			`log_link_archiving_finished(link, link.link_dir, is_new, stats)`

			`except KeyboardInterrupt:`
			`try:`
			`write_link_details(link, out_dir=link.link_dir)`
			`except:`
			`pass`
			`raise`

			`except Exception as err:`
			`print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))`
			`raise`

			`return link`
fix depth flag and tweak logging 2020-07-14 03:26:30 +12:00
			`@enforce_types`
test: Fix tests post-rebase 2020-09-16 07:05:48 +12:00			`def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:`
feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00
refactor: Change archive_links check to focus on queryset, so it allows other iterables and not just lists 2020-09-09 02:36:06 +12:00			`if type(all_links) is QuerySet:`
feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00			`num_links: int = all_links.count()`
			`get_link = lambda x: x.as_link()`
refactor: Remove get_iter lambda from archive_links 2020-09-09 03:39:01 +12:00			`all_links = all_links.iterator()`
refactor: Change archive_links check to focus on queryset, so it allows other iterables and not just lists 2020-09-09 02:36:06 +12:00			`else:`
			`num_links: int = len(all_links)`
			`get_link = lambda x: x`
feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00
			`if num_links == 0:`
fix depth flag and tweak logging 2020-07-14 03:26:30 +12:00			`return []`

feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00			`log_archiving_started(num_links)`
fix depth flag and tweak logging 2020-07-14 03:26:30 +12:00			`idx: int = 0`
			`try:`
refactor: Remove get_iter lambda from archive_links 2020-09-09 03:39:01 +12:00			`for link in all_links:`
feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00			`idx += 1`
			`to_archive = get_link(link)`
test: Fix tests post-rebase 2020-09-16 07:05:48 +12:00			`archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))`
fix depth flag and tweak logging 2020-07-14 03:26:30 +12:00			`except KeyboardInterrupt:`
feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00			`log_archiving_paused(num_links, idx, link.timestamp)`
fix depth flag and tweak logging 2020-07-14 03:26:30 +12:00			`raise SystemExit(0)`
			`except BaseException:`
			`print()`
			`raise`

feat: Refactor add method to use querysets 2020-08-22 02:57:29 +12:00			`log_archiving_finished(num_links)`
			`return all_links`