diff --git a/Dockerfile b/Dockerfile index c9820e4d..56a6f936 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server # Multi-arch build: # docker buildx create --use -# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev +# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev # # Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development). @@ -194,10 +194,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T && playwright install --with-deps chromium \ && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \ else \ - # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.) - apt-get install -qq -y -t bookworm-backports --no-install-recommends \ - chromium \ - && export CHROME_BINARY="$(which chromium)"; \ + # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.) 
+ # apt-get install -qq -y -t bookworm-backports --no-install-recommends \ + # chromium \ + # && export CHROME_BINARY="$(which chromium)"; \ + echo 'armv7 no longer supported in versions after v0.7.3' \ + && exit 1; \ fi \ && rm -rf /var/lib/apt/lists/* \ && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \ @@ -275,7 +277,6 @@ ENV IN_DOCKER=True \ GOOGLE_DEFAULT_CLIENT_SECRET=no \ ALLOWED_HOSTS=* ## No need to set explicitly, these values will be autodetected by archivebox in docker: - # CHROME_SANDBOX=False \ # WGET_BINARY="wget" \ # YOUTUBEDL_BINARY="yt-dlp" \ # CHROME_BINARY="/usr/bin/chromium-browser" \ diff --git a/README.md b/README.md index a961cb47..4d1bcf0d 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Without active preservation effort, everything on the internet eventually dissap

-📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. +📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), and more. See Input Formats for a full list of supported input formats...
@@ -141,21 +141,20 @@ curl -fsSL 'https://get.archivebox.io' | sh ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102): -- 🗞️ **Journalists:** +- **Journalists:** `crawling during research`, `preserving cited pages`, `fact-checking & review` -- ⚖️ **Lawyers:** +- **Lawyers:** `collecting & preserving evidence`, `detecting changes`, `tagging & review` -- 🔬 **Researchers:** +- **Researchers:** `analyzing social media trends`, `getting LLM training data`, `crawling pipelines` -- 👩🏽 **Individuals:** +- **Individuals:** `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` +- **Governments:** + `snapshotting public service sites`, `recordkeeping compliance` -> ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally. We offer services such as:* -> -> - setup & support, hosting, custom features, security, hashing & audit logging for chain-of-custody, etc. -> - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... - -*We are a 🏛️ 501(c)(3) nonprofit and all our work goes towards supporting open-source development.* +> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* +> We offer: setup & support, hosting, custom features, security, hashing & audit logging/chain-of-custody, etc. +> *ArchiveBox has 🏛️ 501(c)(3) [nonprofit status](https://hackclub.com/hcb/) and all our work supports open-source development.*
@@ -498,7 +497,7 @@ docker run -it -v $PWD:/data archivebox/archivebox help
-curl sh automatic setup script CLI Usage Examples (non-Docker) +curl sh automatic setup script CLI Usage Examples: non-Docker

 # make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
@@ -515,7 +514,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
 
-Docker Docker Compose CLI Usage Examples +Docker CLI Usage Examples: Docker Compose

 # make sure you have `docker-compose.yml` from the Quickstart instructions first
@@ -533,7 +532,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
 
-Docker Docker CLI Usage Examples +Docker CLI Usage Examples: Docker

 # make sure you create and cd into in a new empty directory first  
@@ -655,13 +654,13 @@ docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://exampl
   ArchiveBox supports injesting URLs in [any text-based format](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file).
 
 -  From manually exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (in Netscape format)  
-  See instructions for: Chrome, Firefox, Safari, IE, Opera, and more...
+  Instructions: Chrome, Firefox, Safari, IE, Opera, and more...
 
 -  From URLs visited through a [MITM Proxy](https://mitmproxy.org/) with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy)  
   Provides [realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy.
 
 -  From bookmarking services or social media (e.g. Twitter bookmarks, Reddit saved posts, etc.)  
-  See instructions for: Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved, Wallabag, Unmark.it, OneTab, Firefox Sync, and more...
+  Instructions: Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved, Wallabag, Unmark.it, OneTab, Firefox Sync, and more...
 
 
 
@@ -1018,7 +1017,7 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
 
 
@@ -1061,7 +1060,6 @@ Improved support for saving multiple snapshots of a single URL without this hash
 

- ### Storage Requirements Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. There are also some special requirements when using filesystems like NFS/SMB/FUSE. @@ -1071,17 +1069,16 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co Click to learn more about ArchiveBox's filesystem and hosting requirements...
- -**ArchiveBox can use anywhere from ~1gb per 1000 articles, to ~50gb per 1000 articles**, mostly dependent on whether you're saving audio & video using `SAVE_MEDIA=True` and whether you lower `MEDIA_MAX_SIZE=750mb`. - -Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind). - -**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `data/archive/` folder. - -**Try to keep the `data/index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `data/archive/` folder can be on a network mount or slower HDD. - -If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server. - +
    +
  • ArchiveBox can use anywhere from ~1gb per 1000 Snapshots, to ~50gb per 1000 Snapshots, mostly dependent on whether you're saving audio & video using SAVE_MEDIA=True and whether you lower MEDIA_MAX_SIZE=750mb.
  • +
  • Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractor methods you don't need. You can also deduplicate content with a tool like fdupes or rdfind. +
  • +
  • Don't store large collections on older filesystems like EXT3/FAT as they may not be able to handle more than 50k directory entries in the data/archive/ folder. +
  • +
  • Try to keep the data/index.sqlite3 file on local drive (not a network mount) or SSD for maximum performance, however the data/archive/ folder can be on a network mount or slower HDD.
  • +
  • If using Docker or NFS/SMB/FUSE for the data/archive/ folder, you may need to set PUID & PGID and disable root_squash on your fileshare server. +
  • +

Learn More

@@ -1163,19 +1160,23 @@ ArchiveBox aims to enable more of the internet to be saved from deterioration by Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity. -Whether it's to resist censorship by saving articles before they get taken down or edited, or just to save a collection of early 2010's flash games you love to play, having the tools to archive internet content enables to you save the stuff you care most about before it disappears. +Whether it's to resist censorship by saving news articles before they get taken down or edited, or just to save a collection of early 2010's flash games you loved to play, having the tools to archive internet content enables you to save the stuff you care most about before it disappears.

Image from Perma.cc...
+The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about, just like libraries do. Without the work of archivists saving physical books, manuscripts, and paintings we wouldn't have any knowledge of our ancestors' history. I believe archiving the web is just as important to provide the same benefit to future generations. -The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about. +ArchiveBox's stance is that duplication of other people's content is only ethical if it: -Because modern websites are complicated and often rely on dynamic content, -ArchiveBox archives the sites in **several different formats** beyond what public archiving services like Archive.org/Archive.is save. Using multiple methods and the market-dominant browser to execute JS ensures we can save even the most complex, finicky websites in at least a few high-quality, long-term data formats. +- A. doesn't deprive the original creators of revenue and +- B. is responsibly curated by an individual/institution. +In the U.S., libraries, researchers, and archivists are allowed to duplicate copyrighted materials under "fair use" for private study, scholarship, or research. Archive.org's preservation work is covered under this exemption, as they are a non-profit providing a public service, and they respond to unethical content/DMCA/GDPR removal requests. + +As long as you A. 
don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use software like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal counsel if you plan to host a public instance (start by putting your DMCA/GDPR contact info in FOOTER_INFO and changing your instance's branding using CUSTOM_TEMPLATES_DIR).

@@ -1188,7 +1189,7 @@ ArchiveBox archives the sites in **several different formats** beyond what publi > **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of web archiving tools and orgs.** -A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity archive collection over time. +A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity collection over time.
@@ -1576,10 +1577,10 @@ Extractors take the URL of a page to archive, write their output to the filesyst -- [ArchiveBox.io Homepage](https://archivebox.io) / [Source Code (Github)](https://github.com/ArchiveBox/ArchiveBox) / [Demo Server](https://demo.archivebox.io) -- [Documentation Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs](https://docs.archivebox.io) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) -- [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) -- Find us on social media: [Twitter](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) +- [ArchiveBox.io Website](https://archivebox.io) / [ArchiveBox Github (Source Code)](https://github.com/ArchiveBox/ArchiveBox) / [ArchiveBox Demo Server](https://demo.archivebox.io) +- [Documentation (Github Wiki)](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs (ReadTheDocs)](https://docs.archivebox.io) / [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) +- [Bug Tracker (Github Issues)](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions (Github Discussions)](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) +- Find us on social media: [Twitter `@ArchiveBoxApp`](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), 
[Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) --- @@ -1598,7 +1599,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst    
-ArchiveBox was started by Nick Sweeting in 2017, and has grown steadily with help from our amazing contributors. +ArchiveBox was started by Nick Sweeting in 2017, and has grown steadily with help from our amazing contributors.
✨ Have spare CPU/disk/bandwidth after all your 网站存档爬 and want to help the world?
Check out our Good Karma Kit...
diff --git a/archivebox/config.py b/archivebox/config.py index 2cc586d7..4dab8d04 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -355,6 +355,7 @@ ALLOWED_IN_OUTPUT_DIR = { 'static', 'sonic', 'search.sqlite3', + 'crontabs', ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, @@ -1039,10 +1040,10 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': config['LOGS_DIR'].exists(), }, - 'PERSONAS': { - 'path': config['PERSONAS'].resolve(), + 'PERSONAS_DIR': { + 'path': config['PERSONAS_DIR'].resolve(), 'enabled': True, - 'is_valid': config['PERSONAS'].exists(), + 'is_valid': config['PERSONAS_DIR'].exists(), }, 'ARCHIVE_DIR': { 'path': config['ARCHIVE_DIR'].resolve(), diff --git a/archivebox/core/__init__.py b/archivebox/core/__init__.py index c09affb6..203ef6be 100644 --- a/archivebox/core/__init__.py +++ b/archivebox/core/__init__.py @@ -1,3 +1,3 @@ __package__ = 'archivebox.core' -default_app_config = 'core.apps.CoreAppConfig' \ No newline at end of file +# default_app_config = 'core.apps.CoreAppConfig' diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index b5cc569d..0a89d099 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -49,6 +49,60 @@ GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel +class ArchiveBoxAdmin(admin.AdminSite): + site_header = 'ArchiveBox' + index_title = 'Links' + site_title = 'Index' + namespace = 'admin' + + def get_urls(self): + return [ + path('core/snapshot/add/', self.add_view, name='Add'), + ] + super().get_urls() + + def add_view(self, request): + if not request.user.is_authenticated: + return redirect(f'/admin/login/?next={request.path}') + + request.current_app = self.name + context = { + **self.each_context(request), + 'title': 'Add URLs', + } + + if request.method == 'GET': + context['form'] = AddLinkForm() + + elif 
request.method == 'POST': + form = AddLinkForm(request.POST) + if form.is_valid(): + url = form.cleaned_data["url"] + print(f'[+] Adding URL: {url}') + depth = 0 if form.cleaned_data["depth"] == "0" else 1 + input_kwargs = { + "urls": url, + "depth": depth, + "update_all": False, + "out_dir": OUTPUT_DIR, + } + add_stdout = StringIO() + with redirect_stdout(add_stdout): + add(**input_kwargs) + print(add_stdout.getvalue()) + + context.update({ + "stdout": ansi_to_html(add_stdout.getvalue().strip()), + "form": AddLinkForm() + }) + else: + context["form"] = form + + return render(template_name='add.html', request=request, context=context) + +archivebox_admin = ArchiveBoxAdmin() +archivebox_admin.register(get_user_model()) +archivebox_admin.disable_action('delete_selected') + class ArchiveResultInline(admin.TabularInline): model = ArchiveResult @@ -58,11 +112,11 @@ class TagInline(admin.TabularInline): from django.contrib.admin.helpers import ActionForm from django.contrib.admin.widgets import AutocompleteSelectMultiple -# WIP: broken by Django 3.1.2 -> 4.0 migration class AutocompleteTags: model = Tag search_fields = ['name'] name = 'tags' + remote_field = TagInline class AutocompleteTagsAdminStub: name = 'admin' @@ -72,7 +126,6 @@ class SnapshotActionForm(ActionForm): tags = forms.ModelMultipleChoiceField( queryset=Tag.objects.all(), required=False, - # WIP: broken by Django 3.1.2 -> 4.0 migration widget=AutocompleteSelectMultiple( AutocompleteTags(), AutocompleteTagsAdminStub(), @@ -91,6 +144,7 @@ class SnapshotActionForm(ActionForm): # ) +@admin.register(Snapshot, site=archivebox_admin) class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'added', 'files') @@ -178,6 +232,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): obj.id, ) + @admin.display( + description='Title', + ordering='title', + ) def title_str(self, obj): canon = 
obj.as_link().canonical_outputs() tags = ''.join( @@ -199,12 +257,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' ) + mark_safe(f' {tags}') + @admin.display( + description='Files Saved', + ordering='archiveresult_count', + ) def files(self, obj): return snapshot_icons(obj) - files.admin_order_field = 'archiveresult_count' - files.short_description = 'Files Saved' + @admin.display( + ordering='archiveresult_count' + ) def size(self, obj): archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size if archive_size: @@ -219,8 +282,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): size_txt, ) - size.admin_order_field = 'archiveresult_count' + @admin.display( + description='Original URL', + ordering='url', + ) def url_str(self, obj): return format_html( '{}', @@ -257,65 +323,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): # print('[*] Got request', request.method, request.POST) # return super().changelist_view(request, extra_context=None) + @admin.action( + description="Pull" + ) def update_snapshots(self, request, queryset): archive_links([ snapshot.as_link() for snapshot in queryset ], out_dir=OUTPUT_DIR) - update_snapshots.short_description = "Pull" + @admin.action( + description="⬇️ Title" + ) def update_titles(self, request, queryset): archive_links([ snapshot.as_link() for snapshot in queryset ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) - update_titles.short_description = "⬇️ Title" + @admin.action( + description="Re-Snapshot" + ) def resnapshot_snapshot(self, request, queryset): for snapshot in queryset: timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds') new_url = snapshot.url.split('#')[0] + f'#{timestamp}' add(new_url, tag=snapshot.tags_str()) - resnapshot_snapshot.short_description = "Re-Snapshot" + @admin.action( + description="Reset" + ) def 
overwrite_snapshots(self, request, queryset): archive_links([ snapshot.as_link() for snapshot in queryset ], overwrite=True, out_dir=OUTPUT_DIR) - overwrite_snapshots.short_description = "Reset" + @admin.action( + description="Delete" + ) def delete_snapshots(self, request, queryset): remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) - delete_snapshots.short_description = "Delete" + @admin.action( + description="+" + ) def add_tags(self, request, queryset): tags = request.POST.getlist('tags') print('[+] Adding tags', tags, 'to Snapshots', queryset) for obj in queryset: obj.tags.add(*tags) - add_tags.short_description = "+" + @admin.action( + description="–" + ) def remove_tags(self, request, queryset): tags = request.POST.getlist('tags') print('[-] Removing tags', tags, 'to Snapshots', queryset) for obj in queryset: obj.tags.remove(*tags) - remove_tags.short_description = "–" - title_str.short_description = 'Title' - url_str.short_description = 'Original URL' - - title_str.admin_order_field = 'title' - url_str.admin_order_field = 'url' + +@admin.register(Tag, site=archivebox_admin) class TagAdmin(admin.ModelAdmin): list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id') sort_fields = ('id', 'name', 'slug') @@ -346,6 +423,7 @@ class TagAdmin(admin.ModelAdmin): ) + (f'
and {total_count-10} more...' if obj.snapshot_set.count() > 10 else '')) +@admin.register(ArchiveResult, site=archivebox_admin) class ArchiveResultAdmin(admin.ModelAdmin): list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') @@ -358,6 +436,9 @@ class ArchiveResultAdmin(admin.ModelAdmin): ordering = ['-start_ts'] list_per_page = SNAPSHOTS_PER_PAGE + @admin.display( + description='snapshot' + ) def snapshot_str(self, obj): return format_html( '[{}]
' @@ -367,6 +448,9 @@ class ArchiveResultAdmin(admin.ModelAdmin): obj.snapshot.url[:128], ) + @admin.display( + description='tags' + ) def tags_str(self, obj): return obj.snapshot.tags_str() @@ -383,64 +467,3 @@ class ArchiveResultAdmin(admin.ModelAdmin): obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', obj.output, ) - - tags_str.short_description = 'tags' - snapshot_str.short_description = 'snapshot' - -class ArchiveBoxAdmin(admin.AdminSite): - site_header = 'ArchiveBox' - index_title = 'Links' - site_title = 'Index' - - def get_urls(self): - return [ - path('core/snapshot/add/', self.add_view, name='Add'), - ] + super().get_urls() - - def add_view(self, request): - if not request.user.is_authenticated: - return redirect(f'/admin/login/?next={request.path}') - - request.current_app = self.name - context = { - **self.each_context(request), - 'title': 'Add URLs', - } - - if request.method == 'GET': - context['form'] = AddLinkForm() - - elif request.method == 'POST': - form = AddLinkForm(request.POST) - if form.is_valid(): - url = form.cleaned_data["url"] - print(f'[+] Adding URL: {url}') - depth = 0 if form.cleaned_data["depth"] == "0" else 1 - input_kwargs = { - "urls": url, - "depth": depth, - "update_all": False, - "out_dir": OUTPUT_DIR, - } - add_stdout = StringIO() - with redirect_stdout(add_stdout): - add(**input_kwargs) - print(add_stdout.getvalue()) - - context.update({ - "stdout": ansi_to_html(add_stdout.getvalue().strip()), - "form": AddLinkForm() - }) - else: - context["form"] = form - - return render(template_name='add.html', request=request, context=context) - -admin.site = ArchiveBoxAdmin() -admin.site.register(get_user_model()) -admin.site.register(Group) -admin.site.register(Permission) -admin.site.register(Snapshot, SnapshotAdmin) -admin.site.register(Tag, TagAdmin) -admin.site.register(ArchiveResult, ArchiveResultAdmin) -admin.site.disable_action('delete_selected') diff --git 
a/archivebox/core/apps.py b/archivebox/core/apps.py index 71c004d4..7b5a1f11 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -3,11 +3,13 @@ from django.apps import AppConfig class CoreAppConfig(AppConfig): name = 'core' + # label = 'Archive Data' verbose_name = "Archive Data" # WIP: broken by Django 3.1.2 -> 4.0 migration - default_auto_field = 'django.db.models.UUIDField' + # default_auto_field = 'django.db.models.UUIDField' + def ready(self): from .auth import register_signals diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 93fd0e6c..aa793c0e 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -316,9 +316,6 @@ AUTH_PASSWORD_VALIDATORS = [ {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'}, ] -# WIP: broken by Django 3.1.2 -> 4.0 migration -DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField' - ################################################################################ ### Shell Settings ################################################################################ @@ -337,7 +334,6 @@ if IS_SHELL: LANGUAGE_CODE = 'en-us' USE_I18N = True -USE_L10N = True USE_TZ = True DATETIME_FORMAT = 'Y-m-d g:iA' SHORT_DATETIME_FORMAT = 'Y-m-d h:iA' diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index da9cfb52..ebd8c74c 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -1,4 +1,4 @@ -from django.contrib import admin +from .admin import archivebox_admin from django.urls import path, include from django.views import static @@ -33,7 +33,7 @@ urlpatterns = [ path('accounts/', include('django.contrib.auth.urls')), - path('admin/', admin.site.urls), + path('admin/', archivebox_admin.urls), path('health/', HealthCheckView.as_view(), name='healthcheck'), path('error/', lambda _: 1/0), diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 553c9f8d..1d5275dd 100644 --- a/archivebox/extractors/singlefile.py +++ 
b/archivebox/extractors/singlefile.py @@ -21,6 +21,7 @@ from ..config import ( SINGLEFILE_ARGS, SINGLEFILE_EXTRA_ARGS, CHROME_BINARY, + COOKIES_FILE, ) from ..logging_util import TimedProgress @@ -50,10 +51,11 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) # later options take precedence options = [ + '--browser-executable-path={}'.format(CHROME_BINARY), + *(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []), + browser_args, *SINGLEFILE_ARGS, *SINGLEFILE_EXTRA_ARGS, - browser_args, - '--browser-executable-path={}'.format(CHROME_BINARY), ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 9912b4c7..fb3688f3 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: """parse and load existing index with any new links from import_path merged in""" from core.models import Snapshot try: - return Snapshot.objects.all() + return Snapshot.objects.all().only('id') except (KeyboardInterrupt, SystemExit): raise SystemExit(0) diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh index 20385137..ef974406 100755 --- a/bin/docker_entrypoint.sh +++ b/bin/docker_entrypoint.sh @@ -164,6 +164,17 @@ else fi fi +# symlink etc crontabs into place +mkdir -p "$DATA_DIR/crontabs" +if ! 
test -L /var/spool/cron/crontabs; then + # copy files from old location into new data dir location (glob instead of parsing ls output, so odd filenames and a missing/empty dir are handled) + for file in /var/spool/cron/crontabs/*; do + if [ -e "$file" ]; then cp "$file" "$DATA_DIR/crontabs"; fi + done + # replace old system path with symlink to data dir location + rm -Rf /var/spool/cron/crontabs + ln -s "$DATA_DIR/crontabs" /var/spool/cron/crontabs +fi # set DBUS_SYSTEM_BUS_ADDRESS & DBUS_SESSION_BUS_ADDRESS # (dbus is not actually needed, it makes chrome log fewer warnings but isn't worth making our docker images bigger) diff --git a/docker-compose.yml b/docker-compose.yml index 226d306b..8ef4f5f4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,35 +8,26 @@ # Documentation: # https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose ---- -version: "3.9" services: archivebox: - #image: ${DOCKER_IMAGE:-archivebox/archivebox:dev} - image: archivebox:test - # image: archivebox/archivebox:dev - command: server --quick-init 0.0.0.0:8000 + image: archivebox/archivebox:latest ports: - 8000:8000 volumes: - ./data:/data - - /Volumes/OPT/browsertrix:/browsertrix:z - # - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs - # - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox) - # build: . 
# uncomment this to build the image from source code at buildtime (for developers working on archivebox) environment: - - ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name - # - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list - # - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content - # - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive # - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo # - ADMIN_PASSWORD=SomeSecretPassword + - ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name + - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list + - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content + - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive + - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search + - SEARCH_BACKEND_HOST_NAME=sonic + - SEARCH_BACKEND_PASSWORD=SomeSecretPassword # - PUID=911 # set to your host user's UID & GID if you encounter permissions issues # - PGID=911 - # - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search - # - SEARCH_BACKEND_HOST_NAME=sonic - # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword # - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files # - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out # - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs) @@ -44,38 +35,82 @@ services: # ... 
# add further configuration options from archivebox/config.py as needed (to apply them only to this container) # or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers) - # For ad-blocking during archiving, uncomment this section and pihole service section below # networks: # - dns # dns: # - 172.20.0.53 - browsertrix: - image: webrecorder/browsertrix-crawler:latest - command: /bin/docker_ipc_listener.py - expose: - - 2222 - volumes: - - /Volumes/OPT/browsertrix:/crawls:z - - ./bin/docker_ipc_listener.py:/bin/docker_ipc_listener.py - ######## Optional Addons: tweak examples below as needed for your specific use case ######## - ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg - # $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg - # After starting, backfill any existing Snapshots into the full-text index: + ### Runs regularly scheduled archiving tasks in the background, add new scheduled tasks like so: + # $ docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml' + # then restart the scheduler container to apply the changes to the schedule + # $ docker compose restart archivebox_scheduler + + archivebox_scheduler: + image: archivebox/archivebox:latest + command: schedule --foreground + environment: + - TIMEOUT=120 # increase if you see timeouts often during archiving / on slow networks + - ONLY_NEW=True # set to False to retry previously failed URLs when re-adding instead of skipping them + # - PUID=502 # set to your host user's UID & GID if you encounter permissions issues + # - PGID=20 + volumes: + - ./data:/data + # cpus: 2 # uncomment / edit these values to limit container resource consumption + # mem_limit: 2048m + # shm_size: 1024m + + + ### Runs the Sonic full-text search backend, config file is auto-downloaded into sonic.cfg: + # After starting, backfill any existing
Snapshots into the full-text index: # $ docker-compose run archivebox update --index-only - # sonic: - # image: valeriansaliou/sonic:latest - # expose: - # - 1491 - # environment: - # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword - # volumes: - # - ./sonic.cfg:/etc/sonic.cfg:ro - # - ./data/sonic:/var/lib/sonic/store + sonic: + image: valeriansaliou/sonic:latest + build: + # custom build just auto-downloads archivebox's default sonic.cfg as a convenience + # not needed if you already have /etc/sonic.cfg + dockerfile_inline: | + FROM quay.io/curl/curl:latest AS setup + RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg + FROM valeriansaliou/sonic:latest + COPY --from=setup /tmp/sonic.cfg /etc/sonic.cfg + expose: + - 1491 + environment: + - SEARCH_BACKEND_PASSWORD=SomeSecretPassword + volumes: + - ./etc/sonic.cfg:/etc/sonic.cfg + - ./data/sonic:/var/lib/sonic/store + + + ### Example: Watch the ArchiveBox browser in realtime as it archives things, + # or remote control it to set up logins and credentials for sites you want to archive.
+ # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile + + novnc: + image: theasp/novnc:latest + environment: + - DISPLAY_WIDTH=1920 + - DISPLAY_HEIGHT=1080 + - RUN_XTERM=no + ports: + # to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html + - "8080:8080" + + + ### Example: Put Nginx in front of the ArchiveBox server for SSL termination + + # nginx: + # image: nginx:alpine + # ports: + # - 443:443 + # - 80:80 + # volumes: + # - ./etc/nginx.conf:/etc/nginx/nginx.conf + # - ./data:/var/www ### Example: To run pihole in order to block ad/tracker requests during archiving, @@ -99,57 +134,6 @@ services: # - ./etc/dnsmasq:/etc/dnsmasq.d - ### Example: Enable ability to run regularly scheduled archiving tasks by uncommenting this container - # $ docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml' - # then restart the scheduler container to apply the changes to the schedule - # $ docker compose restart archivebox_scheduler - - # archivebox_scheduler: - # image: ${DOCKER_IMAGE:-archivebox/archivebox:dev} - # command: schedule --foreground - # environment: - # - MEDIA_MAX_SIZE=750m # increase this number to allow archiving larger audio/video files - # # - TIMEOUT=60 # increase if you see timeouts often during archiving / on slow networks - # # - ONLY_NEW=True # set to False to retry previously failed URLs when re-adding instead of skipping them - # # - CHECK_SSL_VALIDITY=True # set to False to allow saving URLs w/ broken SSL certs - # # - SAVE_ARCHIVE_DOT_ORG=True # set to False to disable submitting URLs to Archive.org when archiving - # # - PUID=502 # set to your host user's UID & GID if you encounter permissions issues - # # - PGID=20 - # volumes: - # - ./data:/data - # - ./etc/crontabs:/var/spool/cron/crontabs - # # cpus: 2 # uncomment / edit these values to limit container resource consumption - # # mem_limit: 2048m - # # shm_size: 1024m - - - ### 
Example: Put Nginx in front of the ArchiveBox server for SSL termination - - # nginx: - # image: nginx:alpine - # ports: - # - 443:443 - # - 80:80 - # volumes: - # - ./etc/nginx.conf:/etc/nginx/nginx.conf - # - ./data:/var/www - - - ### Example: Watch the ArchiveBox browser in realtime as it archives things, - # or remote control it to set up logins and credentials for sites you want to archive. - # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile - - novnc: - image: theasp/novnc:latest - environment: - - DISPLAY_WIDTH=1920 - - DISPLAY_HEIGHT=1080 - - RUN_XTERM=no - ports: - # to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html - - "8080:8080" - - ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel # wireguard: @@ -166,6 +150,13 @@ services: # - ./wireguard.conf:/config/wg0.conf:ro + ### Example: Run browsertrix in parallel with ArchiveBox + + # browsertrix: + # image: webrecorder/browsertrix-crawler:latest + # volumes: + # - ./browsertrix:/crawls:z + ### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox # pywb: diff --git a/package.json b/package.json index 129ff897..5dc0ec2a 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,6 @@ { "@postlight/parser": "^2.2.3", "readability-extractor": "github:ArchiveBox/readability-extractor", - "single-file-cli": "^1.1.46" + "single-file-cli": "^1.1.54" } } diff --git a/pyproject.toml b/pyproject.toml index a5887570..7090efe2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,15 @@ dependencies = [ "requests>=2.24.0", "croniter>=0.3.34", "dateparser>=1.0.0", +<<<<<<< HEAD +======= + "django-extensions>=3.2.3", + "django>=4.2.0,<5.0", + "setuptools>=69.0.3", + "feedparser>=6.0.11", + "ipython>5.0.0", + "mypy-extensions>=0.4.3", +>>>>>>> dev "python-crontab>=2.5.1", "django>=3.1.3,<3.2", "django-extensions>=3.0.3", @@ -22,9 +31,13 @@ dependencies = [ "w3lib>=1.22.0", "yt-dlp>=2024.3.10", # dont add 
playwright becuase packages without sdists cause trouble on many build systems that refuse to install wheel-only packages +<<<<<<< HEAD # "playwright>=1.39.0; platform_machine != 'armv7l'", "mypy-extensions>=0.4.3", # "django-stubs-ext>=4.2.7", +======= + "playwright>=1.39.0; platform_machine != 'armv7l'", +>>>>>>> dev ] classifiers = [ @@ -65,11 +78,11 @@ classifiers = [ sonic = [ # echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list # curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg + # apt install sonic "sonic-client>=0.0.5", ] ldap = [ # apt install libldap2-dev libsasl2-dev python3-ldap - "setuptools>=69.0.3", "python-ldap>=3.4.3", "django-auth-ldap>=4.1.0", ] @@ -84,7 +97,6 @@ ldap = [ [tool.pdm.dev-dependencies] dev = [ # building - "setuptools>=69.0.3", "wheel", "pdm", "homebrew-pypi-poet>=0.10.0",