
Optionally import only new links

When periodically importing a huge list of links (for example from a
big dump of links exported by a bookmark service) that contains a lot
of broken links, those links will always be rechecked. To skip this,
the environment variable ONLY_NEW can be used to import only new links
and skip the rest altogether. This partially fixes #95.
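The behaviour, as a rough standalone sketch (the implementation added in `links.py` below uses a plain loop over both lists; link dicts with a `'url'` key are assumed, as in the rest of the codebase, and the function name here is purely illustrative):

```python
# Sketch of the ONLY_NEW idea: keep only imported links whose URL is not
# already present in the existing index, so old broken links are not rechecked.
def filter_new(imported_links, existing_links):
    existing_urls = {link['url'] for link in existing_links}
    return [link for link in imported_links if link['url'] not in existing_urls]
```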
Aaron Fischer 2018-10-19 21:28:38 +02:00
parent bf6e8f03e4
commit 69c007ce85
4 changed files with 37 additions and 4 deletions

View file

@@ -140,6 +140,11 @@ You can run it in parallel by using the `resume` feature, or by manually splitti
 ```
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
+If you already imported a huge list of bookmarks and want to import only new
+bookmarks, you can use the `ONLY_NEW` environment variable. This is useful if
+you want to import a bookmark dump periodically and want to skip broken links
+which are already in the index.
 ## Configuration
 You can tweak parameters via environment variables, or by editing `config.py` directly:
@@ -158,6 +163,7 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
 **Archive Options:**
 - maximum allowed download time per link: `TIMEOUT` values: [`60`]/`30`/`...`
+- import only new links: `ONLY_NEW` values `True`/[`False`]
 - archive methods (values: [`True`]/`False`):
   - fetch page with wget: `FETCH_WGET`
   - fetch images/css/js with wget: `FETCH_WGET_REQUISITES` (True is highly recommended)
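In practice the flag is set like the other options documented above, e.g. `env ONLY_NEW=True ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html` (the invocation follows `print_help` in the script below). Driving the same periodic re-import from Python might look like this sketch; the script path and bookmark file are assumptions:

```python
import os
import subprocess

# Hypothetical periodic re-import: with ONLY_NEW set, only links that are not
# already in the index get archived, so old broken links are not rechecked.
env = dict(os.environ, ONLY_NEW='True')
subprocess.run(
    ['./bin/bookmark-archiver', os.path.expanduser('~/Downloads/bookmarks_export.html')],
    env=env,
    check=True,
)
```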

View file

@@ -10,7 +10,10 @@ from datetime import datetime
 from subprocess import run
 from parse import parse_links
-from links import validate_links
+from links import (
+    new_links,
+    validate_links
+)
 from archive_methods import archive_links, _RESULTS_TOTALS
 from index import (
     write_links_index,
@@ -19,6 +22,7 @@ from index import (
     parse_json_link_index,
 )
 from config import (
+    ONLY_NEW,
     OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     ANSI,
@@ -45,7 +49,7 @@ def print_help():
     print("    ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n")
-def merge_links(archive_path=OUTPUT_DIR, import_path=None):
+def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     """get new links from file and optionally append them to links in existing archive"""
     all_links = []
     if import_path:
@@ -76,6 +80,9 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
         # **ANSI,
         # ))
+    if only_new:
+        return new_links(all_links, existing_links)
     return all_links
 def update_archive(archive_path, links, source=None, resume=None, append=True):
@@ -158,7 +165,7 @@ if __name__ == '__main__':
         source = download_url(source)
     # Step 1: Parse the links and dedupe them with existing archive
-    links = merge_links(archive_path=out_dir, import_path=source)
+    links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
@@ -167,4 +174,8 @@ if __name__ == '__main__':
     # cleanup_archive(out_dir, links)
     # Step 4: Run the archive methods for each link
-    update_archive(out_dir, links, source=source, resume=resume, append=True)
+    if ONLY_NEW:
+        new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
+        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
+    else:
+        update_archive(out_dir, links, source=source, resume=resume, append=True)

View file

@@ -13,6 +13,7 @@ from subprocess import run, PIPE
 IS_TTY = sys.stdout.isatty()
 USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
 SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
+ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
 FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
 FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true'
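Like the neighbouring flags, the new setting only evaluates to `True` when the raw environment string is exactly `true` after lower-casing; values such as `1` or `yes` leave it disabled. A quick standalone check of that parsing rule, using the same expression as `config.py`:

```python
import os

# config.py pattern: only the string 'true' (case-insensitive) enables the flag.
for raw in ('True', 'TRUE', 'true', '1', 'yes', 'False'):
    os.environ['ONLY_NEW'] = raw
    only_new = os.getenv('ONLY_NEW', 'False').lower() == 'true'
    print('ONLY_NEW=%r -> %s' % (raw, only_new))  # True only for the first three
```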

View file

@@ -74,6 +74,21 @@ def validate_links(links):
     return list(links)
+def new_links(imported_links, existing_links):
+    """
+    Return all links which are in the imported_links but not in the existing_links.
+    This is used to determine which links are new and not indexed yet. Set the
+    ONLY_NEW environment variable to activate this filter mechanism.
+    """
+    new_links = []
+    for i_link in imported_links:
+        found_link_in_existing_links = False
+        for e_link in existing_links:
+            if i_link['url'] == e_link['url']:
+                found_link_in_existing_links = True
+        if not found_link_in_existing_links:
+            new_links.append(i_link)
+    return new_links
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""