
Merge branch 'dev' into plugins-browsertrix

Nick Sweeting 2024-03-26 15:29:02 -07:00
commit b5311d2c57
14 changed files with 270 additions and 230 deletions

Dockerfile

@ -10,7 +10,7 @@
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
# docker buildx create --use
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing ArchiveBox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
@ -194,10 +194,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& playwright install --with-deps chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
else \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
apt-get install -qq -y -t bookworm-backports --no-install-recommends \
chromium \
&& export CHROME_BINARY="$(which chromium)"; \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
# apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# chromium \
# && export CHROME_BINARY="$(which chromium)"; \
echo 'armv7 no longer supported in versions after v0.7.3' \
&& exit 1; \
fi \
&& rm -rf /var/lib/apt/lists/* \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
@ -275,7 +277,6 @@ ENV IN_DOCKER=True \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# CHROME_SANDBOX=False \
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
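These values are autodetected inside the container, but any of them can still be overridden at runtime; a hedged sketch using `docker run -e ...` (the hostname and values below are placeholders, not recommendations):

```bash
# Sketch: override autodetected/default config via environment variables at runtime
docker run -it -v "$PWD/data":/data -p 8000:8000 \
    -e CHROME_SANDBOX=False \
    -e ALLOWED_HOSTS=archive.example.com \
    archivebox/archivebox server 0.0.0.0:8000
```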

README.md

@ -35,7 +35,7 @@ Without active preservation effort, everything on the internet eventually dissap
<hr/>
<br/>
📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more.
📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), and more.
<sub>See <a href="#input-formats">Input Formats</a> for a full list of supported input formats...</sub>
<br/>
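For example, a hedged sketch of the two most common ways to feed it URLs (the feed URL below is a placeholder):

```bash
# archive a single URL right now
archivebox add 'https://example.com'

# or pull new URLs to archive from an RSS feed on a regular schedule
archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml'
```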
@ -141,21 +141,20 @@ curl -fsSL 'https://get.archivebox.io' | sh
ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102):
- 🗞️ **Journalists:**
- **Journalists:**
`crawling during research`, `preserving cited pages`, `fact-checking & review`
- ⚖️ **Lawyers:**
- **Lawyers:**
`collecting & preserving evidence`, `detecting changes`, `tagging & review`
- 🔬 **Researchers:**
- **Researchers:**
`analyzing social media trends`, `getting LLM training data`, `crawling pipelines`
- 👩🏽 **Individuals:**
- **Individuals:**
`saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival`
- **Governments:**
`snapshotting public service sites`, `recordkeeping compliance`
> ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally. We offer services such as:*
>
> - setup & support, hosting, custom features, security, hashing & audit logging for chain-of-custody, etc.
> - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more...
*We are a 🏛️ 501(c)(3) nonprofit and all our work goes towards supporting open-source development.*
> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.*
> We offer: setup & support, hosting, custom features, security, hashing & audit logging/chain-of-custody, etc.
> *ArchiveBox has 🏛️ 501(c)(3) [nonprofit status](https://hackclub.com/hcb/) and all our work supports open-source development.*
<br/>
@ -498,7 +497,7 @@ docker run -it -v $PWD:/data archivebox/archivebox help
<br/>
<details>
<summary><img src="https://user-images.githubusercontent.com/511499/117456282-08665e80-af16-11eb-91a1-8102eff54091.png" alt="curl sh automatic setup script" height="22px" align="top"/> <b>CLI Usage Examples (non-Docker)</b></summary>
<summary><img src="https://user-images.githubusercontent.com/511499/117456282-08665e80-af16-11eb-91a1-8102eff54091.png" alt="curl sh automatic setup script" height="22px" align="top"/> <b>CLI Usage Examples: non-Docker</b></summary>
<br/>
<pre lang="bash"><code style="white-space: pre-line">
# make sure you have pip-installed ArchiveBox and it's available in your $PATH first
@ -515,7 +514,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
<br/>
<details>
<summary><img src="https://user-images.githubusercontent.com/511499/117447182-29758200-af0b-11eb-97bd-58723fee62ab.png" alt="Docker" height="22px" align="top"/> <b>Docker Compose CLI Usage Examples</b></summary>
<summary><img src="https://user-images.githubusercontent.com/511499/117447182-29758200-af0b-11eb-97bd-58723fee62ab.png" alt="Docker" height="22px" align="top"/> <b>CLI Usage Examples: Docker Compose</b></summary>
<br/>
<pre lang="bash"><code style="white-space: pre-line">
# make sure you have `docker-compose.yml` from the Quickstart instructions first
@ -533,7 +532,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
<br/>
<details>
<summary><img src="https://user-images.githubusercontent.com/511499/117447182-29758200-af0b-11eb-97bd-58723fee62ab.png" alt="Docker" height="22px" align="top"/> <b>Docker CLI Usage Examples</b></summary>
<summary><img src="https://user-images.githubusercontent.com/511499/117447182-29758200-af0b-11eb-97bd-58723fee62ab.png" alt="Docker" height="22px" align="top"/> <b>CLI Usage Examples: Docker</b></summary>
<br/>
<pre lang="bash"><code style="white-space: pre-line">
# make sure you create and cd into in a new empty directory first
@ -655,13 +654,13 @@ docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://exampl
<i>ArchiveBox supports ingesting URLs in [any text-based format](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file); see the example after this list.</i>
- <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/32b494e6-4de1-4984-8d88-dc02f18e5c34" height="22px"/> From manually exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (in Netscape format)
<i>See instructions for: <a href="https://support.google.com/chrome/answer/96816?hl=en">Chrome</a>, <a href="https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer">Firefox</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a">Safari</a>, <a href="https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows">IE</a>, <a href="https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom">Opera</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive">and more...</a></i>
<i>Instructions: <a href="https://support.google.com/chrome/answer/96816?hl=en">Chrome</a>, <a href="https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer">Firefox</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a">Safari</a>, <a href="https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows">IE</a>, <a href="https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom">Opera</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive">and more...</a></i>
- <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/4f7bd318-265c-4235-ad25-38be89946b12" height="22px"/> From URLs visited through a [MITM Proxy](https://mitmproxy.org/) with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy)
<i>Provides [realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy.</i>
- <img src="https://getpocket.com/favicon.ico" height="22px"/> From bookmarking services or social media (e.g. Twitter bookmarks, Reddit saved posts, etc.)
<i>See instructions for: <a href="https://getpocket.com/export">Pocket</a>, <a href="https://pinboard.in/export/">Pinboard</a>, <a href="https://www.instapaper.com/user">Instapaper</a>, <a href="https://shaarli.readthedocs.io/en/master/Usage/#importexport">Shaarli</a>, <a href="https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/">Delicious</a>, <a href="https://github.com/csu/export-saved-reddit">Reddit Saved</a>, <a href="https://doc.wallabag.org/en/user/import/wallabagv2.html">Wallabag</a>, <a href="http://help.unmark.it/import-export">Unmark.it</a>, <a href="https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/">OneTab</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/issues/648">Firefox Sync</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive">and more...</a></i>
<i>Instructions: <a href="https://getpocket.com/export">Pocket</a>, <a href="https://pinboard.in/export/">Pinboard</a>, <a href="https://www.instapaper.com/user">Instapaper</a>, <a href="https://shaarli.readthedocs.io/en/master/Usage/#importexport">Shaarli</a>, <a href="https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/">Delicious</a>, <a href="https://github.com/csu/export-saved-reddit">Reddit Saved</a>, <a href="https://doc.wallabag.org/en/user/import/wallabagv2.html">Wallabag</a>, <a href="http://help.unmark.it/import-export">Unmark.it</a>, <a href="https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/">OneTab</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/issues/648">Firefox Sync</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive">and more...</a></i>
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/e1e5bd78-b0b6-45dc-914c-e1046fee4bc4" width="330px" align="right" style="float: right"/>
@ -1018,7 +1017,7 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
<ul>
<li>Set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#curl_user_agent"><code>CHROME_USER_AGENT</code>, <code>WGET_USER_AGENT</code>, <code>CURL_USER_AGENT</code></a> to impersonate a real browser; with the default user agent settings, ArchiveBox reveals that it's a bot (see the example below this list)</li>
<li>Set up a logged-in browser session for archiving using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile"><code>CHROME_DATA_DIR</code> &amp; <code>COOKIES_FILE</code></a></li>
<li>Set up a logged-in browser session for archiving using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile"><code>CHROME_USER_DATA_DIR</code> &amp; <code>COOKIES_FILE</code></a></li>
<li>Rewrite your URLs before archiving to swap in an alternative frontend that's more bot-friendly, e.g.<br>
<code>reddit.com/some/url</code> -&gt; <code>teddit.net/some/url</code>: <a href="https://github.com/mendel5/alternative-front-ends">https://github.com/mendel5/alternative-front-ends</a></li>
</ul>
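A hedged sketch of the user-agent and cookies settings mentioned above, set via `archivebox config --set` (the user-agent string and cookies path are placeholders):

```bash
# impersonate a regular desktop browser instead of announcing the default bot user agent
archivebox config --set CHROME_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
archivebox config --set WGET_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# point extractors at a cookies.txt exported from a logged-in browser session
archivebox config --set COOKIES_FILE=/data/cookies.txt
```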
@ -1061,7 +1060,6 @@ Improved support for saving multiple snapshots of a single URL without this hash
</details>
<br/>
### Storage Requirements
Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. There are also some special requirements when using filesystems like NFS/SMB/FUSE.
@ -1071,17 +1069,16 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co
<summary><i>Click to learn more about ArchiveBox's filesystem and hosting requirements...</i></summary>
<br/>
**ArchiveBox can use anywhere from ~1gb per 1000 articles, to ~50gb per 1000 articles**, mostly dependent on whether you're saving audio & video using `SAVE_MEDIA=True` and whether you lower `MEDIA_MAX_SIZE=750mb`.
Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind).
**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `data/archive/` folder.
**Try to keep the `data/index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `data/archive/` folder can be on a network mount or slower HDD.
If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server.
<ul>
<li><strong>ArchiveBox can use anywhere from ~1gb per 1000 Snapshots, to ~50gb per 1000 Snapshots</strong>, mostly dependent on whether you're saving audio & video using <code>SAVE_MEDIA=True</code> and whether you lower <code>MEDIA_MAX_SIZE=750mb</code>.</li>
<li>Disk usage can be reduced by using a compressed/<a href="https://www.ixsystems.com/blog/ixsystems-and-klara-systems-celebrate-valentines-day-with-a-heartfelt-donation-of-fast-dedupe-to-openzfs-and-truenas/">deduplicated</a> filesystem like <a href="https://www.reddit.com/r/zfs/comments/t9cexx/a_simple_real_world_zfs_compression_speed_an/">ZFS</a>/BTRFS, or by turning off extractor methods you don't need (see the example after this list). You can also deduplicate content with a tool like <a href="https://github.com/adrianlopezroche/fdupes"><code>fdupes</code></a> or <a href="https://github.com/pauldreik/rdfind"><code>rdfind</code></a>.
</li>
<li><strong>Don't store large collections on older filesystems like EXT3/FAT</strong> as they may not be able to handle more than 50k directory entries in the <code>data/archive/</code> folder.
</li>
<li><strong>Try to keep the <code>data/index.sqlite3</code> file on a local drive (not a network mount)</strong> or SSD for maximum performance; the <code>data/archive/</code> folder, however, can be on a network mount or slower HDD.</li>
<li>If using Docker or NFS/SMB/FUSE for the <code>data/archive/</code> folder, you may need to set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid"><code>PUID</code> & <code>PGID</code></a> and <a href="https://github.com/ArchiveBox/ArchiveBox/issues/1304">disable <code>root_squash</code></a> on your fileshare server.
</li>
</ul>
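For the media and extractor settings mentioned above, a hedged sketch of dialing back disk usage (`SAVE_WGET` is just one example of an extractor toggle):

```bash
# skip audio/video downloads entirely, or keep them but cap the per-file size
archivebox config --set SAVE_MEDIA=False
archivebox config --set MEDIA_MAX_SIZE=250m

# turn off individual extractor methods you don't need
archivebox config --set SAVE_WGET=False
```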
<h4>Learn More</h4>
@ -1163,19 +1160,23 @@ ArchiveBox aims to enable more of the internet to be saved from deterioration by
Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity.
Whether it's to resist censorship by saving articles before they get taken down or edited, or just to save a collection of early 2010's flash games you love to play, having the tools to archive internet content enables to you save the stuff you care most about before it disappears.
Whether it's to resist censorship by saving news articles before they get taken down or edited, or just to save a collection of early 2010s Flash games you loved to play, having the tools to archive internet content enables you to save the stuff you care most about before it disappears.
<div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/71e36bc5-1c94-44e2-92b6-405fa898c734" width="40%"/><br/>
<sup><i>Image from <a href="https://perma.cc/">Perma.cc</a>...</i><br/></sup>
</div>
The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about, just like libraries do. Without the work of archivists saving physical books, manuscrips, and paintings we wouldn't have any knowledge of our ancestors' history. I believe archiving the web is just as important to provide the same benefit to future generations.
The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about.
ArchiveBox's stance is that duplication of other people's content is only ethical if it:
Because modern websites are complicated and often rely on dynamic content,
ArchiveBox archives the sites in **several different formats** beyond what public archiving services like Archive.org/Archive.is save. Using multiple methods and the market-dominant browser to execute JS ensures we can save even the most complex, finicky websites in at least a few high-quality, long-term data formats.
- A. doesn't deprive the original creators of revenue and
- B. is responsibly curated by an individual/institution.
In the U.S., <a href="https://guides.library.oregonstate.edu/copyright/libraries">libraries, researchers, and archivists</a> are allowed to duplicate copyrighted materials under <a href="https://libguides.ala.org/copyright/fairuse">"fair use"</a> for <a href="https://guides.cuny.edu/cunyfairuse/librarians#:~:text=One%20of%20these%20specified%20conditions,may%20be%20liable%20for%20copyright">private study, scholarship, or research</a>. Archive.org's preservation work is covered under this exemption, as they are a non-profit providing a public service, and they respond to <a href="https://cardozoaelj.com/2015/03/20/use-of-copyright-law-to-take-down-revenge-porn/">unethical content</a>/<a href="https://help.archive.org/help/rights/">DMCA</a>/<a href="https://gdpr.eu/right-to-be-forgotten/#:~:text=An%20individual%20has%20the%20right,that%20individual%20withdraws%20their%20consent.">GDPR</a> removal requests.
As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use software like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal counsel if you plan to host a public instance (start by putting your DMCA/GDPR contact info in <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#footer_info"><code>FOOTER_INFO</code></a> and changing your instance's branding using <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#custom_templates_dir"><code>CUSTOM_TEMPLATES_DIR</code></a>).
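A hedged sketch of the two config options mentioned at the end of that paragraph (the contact address and template path are placeholders):

```bash
# publish takedown/removal contact info in the footer of a public instance
archivebox config --set FOOTER_INFO='Hosted for research; contact takedown@example.com for removal requests.'

# point ArchiveBox at a directory of custom templates to change your instance's branding
archivebox config --set CUSTOM_TEMPLATES_DIR=/data/templates
```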
</details>
<br/>
@ -1188,7 +1189,7 @@ ArchiveBox archives the sites in **several different formats** beyond what publi
> **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of web archiving tools and orgs.**
A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity archive collection over time.
A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity collection over time.
<br/>
<details>
@ -1576,10 +1577,10 @@ Extractors take the URL of a page to archive, write their output to the filesyst
<img src="https://raw.githubusercontent.com/Monadical-SAS/redux-time/HEAD/examples/static/jeremy.jpg" width="100px" align="right"/>
- [ArchiveBox.io Homepage](https://archivebox.io) / [Source Code (Github)](https://github.com/ArchiveBox/ArchiveBox) / [Demo Server](https://demo.archivebox.io)
- [Documentation Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs](https://docs.archivebox.io) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases)
- [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io)
- Find us on social media: [Twitter](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/)
- [ArchiveBox.io Website](https://archivebox.io) / [ArchiveBox Github (Source Code)](https://github.com/ArchiveBox/ArchiveBox) / [ArchiveBox Demo Server](https://demo.archivebox.io)
- [Documentation (Github Wiki)](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs (ReadTheDocs)](https://docs.archivebox.io) / [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases)
- [Bug Tracker (Github Issues)](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions (Github Discussions)](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io)
- Find us on social media: [Twitter `@ArchiveBoxApp`](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/)
---
@ -1598,7 +1599,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst
<a href="https://twitter.com/ArchiveBoxApp"><img src="https://img.shields.io/badge/Tweet-%40ArchiveBoxApp-blue.svg?style=flat"/></a>&nbsp;
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>&nbsp;
<a href="https://zulip.archivebox.io/"><img src="https://img.shields.io/badge/Join_Our_Community-Zulip_Forum-%23B7EDFE.svg"/></a><br/>
<sup>ArchiveBox was started by <a href="https://docs.sweeting.me/s/blog#About">Nick Sweeting</a> in 2017, and has grown steadily with help from our <a href="https://github.com/ArchiveBox/ArchiveBox/graphs/contributors">amazing contributors</a>.</sup>
<sup>ArchiveBox was started by <a href="https://docs.sweeting.me/s/blog#About">Nick Sweeting</a> in 2017, and has <a href="https://star-history.com/#archivebox/archivebox&Date">grown steadily</a> with help from our <a href="https://github.com/ArchiveBox/ArchiveBox/graphs/contributors">amazing contributors</a>.</sup>
<hr/>
<i>✨ Have spare CPU/disk/bandwidth after all your web-archive crawling and want to help the world?<br/>Check out our <a href="https://github.com/ArchiveBox/good-karma-kit">Good Karma Kit</a>...</i>
</div>

archivebox/config.py

@ -355,6 +355,7 @@ ALLOWED_IN_OUTPUT_DIR = {
'static',
'sonic',
'search.sqlite3',
'crontabs',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
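For context, a hedged sketch of what this whitelist corresponds to on disk: the new `crontabs/` entry is the folder created by the updated `bin/docker_entrypoint.sh` later in this diff, and the other names come from the list above:

```bash
ls ./data/
# archive/  sources/  logs/  static/  sonic/  crontabs/  index.sqlite3  search.sqlite3
```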
@ -1039,10 +1040,10 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': config['LOGS_DIR'].exists(),
},
'PERSONAS': {
'path': config['PERSONAS'].resolve(),
'PERSONAS_DIR': {
'path': config['PERSONAS_DIR'].resolve(),
'enabled': True,
'is_valid': config['PERSONAS'].exists(),
'is_valid': config['PERSONAS_DIR'].exists(),
},
'ARCHIVE_DIR': {
'path': config['ARCHIVE_DIR'].resolve(),

archivebox/core/__init__.py

@ -1,3 +1,3 @@
__package__ = 'archivebox.core'
default_app_config = 'core.apps.CoreAppConfig'
# default_app_config = 'core.apps.CoreAppConfig'

archivebox/core/admin.py

@ -49,6 +49,60 @@ GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE,
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
namespace = 'admin'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add.html', request=request, context=context)
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model())
archivebox_admin.disable_action('delete_selected')
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult
@ -58,11 +112,11 @@ class TagInline(admin.TabularInline):
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple
# WIP: broken by Django 3.1.2 -> 4.0 migration
class AutocompleteTags:
model = Tag
search_fields = ['name']
name = 'tags'
remote_field = TagInline
class AutocompleteTagsAdminStub:
name = 'admin'
@ -72,7 +126,6 @@ class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
queryset=Tag.objects.all(),
required=False,
# WIP: broken by Django 3.1.2 -> 4.0 migration
widget=AutocompleteSelectMultiple(
AutocompleteTags(),
AutocompleteTagsAdminStub(),
@ -91,6 +144,7 @@ class SnapshotActionForm(ActionForm):
# )
@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'added', 'files')
@ -178,6 +232,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
obj.id,
)
@admin.display(
description='Title',
ordering='title',
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
@ -199,12 +257,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
description='Files Saved',
ordering='archiveresult_count',
)
def files(self, obj):
return snapshot_icons(obj)
files.admin_order_field = 'archiveresult_count'
files.short_description = 'Files Saved'
@admin.display(
ordering='archiveresult_count'
)
def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
@ -219,8 +282,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
size_txt,
)
size.admin_order_field = 'archiveresult_count'
@admin.display(
description='Original URL',
ordering='url',
)
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
@ -257,65 +323,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
description="Pull"
)
def update_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Pull"
@admin.action(
description="⬇️ Title"
)
def update_titles(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "⬇️ Title"
@admin.action(
description="Re-Snapshot"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
add(new_url, tag=snapshot.tags_str())
resnapshot_snapshot.short_description = "Re-Snapshot"
@admin.action(
description="Reset"
)
def overwrite_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Reset"
@admin.action(
description="Delete"
)
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
@admin.action(
description="+"
)
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
add_tags.short_description = "+"
@admin.action(
description=""
)
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'from Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
remove_tags.short_description = ""
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
sort_fields = ('id', 'name', 'slug')
@ -346,6 +423,7 @@ class TagAdmin(admin.ModelAdmin):
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))
@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
@ -358,6 +436,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
ordering = ['-start_ts']
list_per_page = SNAPSHOTS_PER_PAGE
@admin.display(
description='snapshot'
)
def snapshot_str(self, obj):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
@ -367,6 +448,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
obj.snapshot.url[:128],
)
@admin.display(
description='tags'
)
def tags_str(self, obj):
return obj.snapshot.tags_str()
@ -383,64 +467,3 @@ class ArchiveResultAdmin(admin.ModelAdmin):
obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
obj.output,
)
tags_str.short_description = 'tags'
snapshot_str.short_description = 'snapshot'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add.html', request=request, context=context)
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Group)
admin.site.register(Permission)
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')

archivebox/core/apps.py

@ -3,11 +3,13 @@ from django.apps import AppConfig
class CoreAppConfig(AppConfig):
name = 'core'
# label = 'Archive Data'
verbose_name = "Archive Data"
# WIP: broken by Django 3.1.2 -> 4.0 migration
default_auto_field = 'django.db.models.UUIDField'
# default_auto_field = 'django.db.models.UUIDField'
def ready(self):
from .auth import register_signals

archivebox/core/settings.py

@ -316,9 +316,6 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
# WIP: broken by Django 3.1.2 -> 4.0 migration
DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
################################################################################
### Shell Settings
################################################################################
@ -337,7 +334,6 @@ if IS_SHELL:
LANGUAGE_CODE = 'en-us'
USE_I18N = True
USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'

archivebox/core/urls.py

@ -1,4 +1,4 @@
from django.contrib import admin
from .admin import archivebox_admin
from django.urls import path, include
from django.views import static
@ -33,7 +33,7 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', admin.site.urls),
path('admin/', archivebox_admin.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),

archivebox/extractors/singlefile.py

@ -21,6 +21,7 @@ from ..config import (
SINGLEFILE_ARGS,
SINGLEFILE_EXTRA_ARGS,
CHROME_BINARY,
COOKIES_FILE,
)
from ..logging_util import TimedProgress
@ -50,10 +51,11 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
options = [
'--browser-executable-path={}'.format(CHROME_BINARY),
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
browser_args,
*SINGLEFILE_ARGS,
*SINGLEFILE_EXTRA_ARGS,
browser_args,
'--browser-executable-path={}'.format(CHROME_BINARY),
]
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],

archivebox/index/__init__.py

@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot
try:
return Snapshot.objects.all()
return Snapshot.objects.all().only('id')
except (KeyboardInterrupt, SystemExit):
raise SystemExit(0)

bin/docker_entrypoint.sh

@ -164,6 +164,17 @@ else
fi
fi
# symlink etc crontabs into place
mkdir -p "$DATA_DIR/crontabs"
if ! test -L /var/spool/cron/crontabs; then
# copy files from old location into new data dir location
for file in $(ls /var/spool/cron/crontabs); do
cp /var/spool/cron/crontabs/"$file" "$DATA_DIR/crontabs"
done
# replace old system path with symlink to data dir location
rm -Rf /var/spool/cron/crontabs
ln -s "$DATA_DIR/crontabs" /var/spool/cron/crontabs
fi
# set DBUS_SYSTEM_BUS_ADDRESS & DBUS_SESSION_BUS_ADDRESS
# (dbus is not actually needed, it makes chrome log fewer warnings but isn't worth making our docker images bigger)
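The crontab relocation above means scheduled jobs created inside the container now persist in the data dir across container rebuilds; a hedged sketch of the workflow (the feed URL is a placeholder):

```bash
# create a daily archiving job; the crontab entry is written through the symlinked /var/spool/cron/crontabs
docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml'

# the generated crontab files now live on the host alongside the rest of the data
ls ./data/crontabs/
```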

docker-compose.yml

@ -8,35 +8,26 @@
# Documentation:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
---
version: "3.9"
services:
archivebox:
#image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
image: archivebox:test
# image: archivebox/archivebox:dev
command: server --quick-init 0.0.0.0:8000
image: archivebox/archivebox:latest
ports:
- 8000:8000
volumes:
- ./data:/data
- /Volumes/OPT/browsertrix:/browsertrix:z
# - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
# - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
# build: . # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
environment:
- ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name
# - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
# - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content
# - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
# - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo
# - ADMIN_PASSWORD=SomeSecretPassword
- ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name
- PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
- PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content
- PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
- SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
- SEARCH_BACKEND_HOST_NAME=sonic
- SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# - PUID=911 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=911
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
# - SEARCH_BACKEND_HOST_NAME=sonic
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files
# - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out
# - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
@ -44,38 +35,82 @@ services:
# ...
# add further configuration options from archivebox/config.py as needed (to apply them only to this container)
# or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers)
# For ad-blocking during archiving, uncomment this section and pihole service section below
# networks:
# - dns
# dns:
# - 172.20.0.53
browsertrix:
image: webrecorder/browsertrix-crawler:latest
command: /bin/docker_ipc_listener.py
expose:
- 2222
volumes:
- /Volumes/OPT/browsertrix:/crawls:z
- ./bin/docker_ipc_listener.py:/bin/docker_ipc_listener.py
######## Optional Addons: tweak examples below as needed for your specific use case ########
### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
# $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
# After starting, backfill any existing Snapshots into the full-text index:
### Enable ability to run regularly scheduled archiving tasks by uncommenting this container
# $ docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml'
# then restart the scheduler container to apply the changes to the schedule
# $ docker compose restart archivebox_scheduler
archivebox_scheduler:
image: archivebox/archivebox:latest
command: schedule --foreground
environment:
- TIMEOUT=120 # increase if you see timeouts often during archiving / on slow networks
- ONLY_NEW=True # set to False to retry previously failed URLs when re-adding instead of skipping them
# - PUID=502 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=20
volumes:
- ./data:/data
# cpus: 2 # uncomment / edit these values to limit container resource consumption
# mem_limit: 2048m
# shm_size: 1024m
### Runs the Sonic full-text search backend, config file is auto-downloaded into sonic.cfg:
# After starting, backfill any existing Snapshots into the full-text index:
# $ docker-compose run archivebox update --index-only
# sonic:
# image: valeriansaliou/sonic:latest
# expose:
# - 1491
# environment:
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# volumes:
# - ./sonic.cfg:/etc/sonic.cfg:ro
# - ./data/sonic:/var/lib/sonic/store
sonic:
image: valeriansaliou/sonic:latest
build:
# custom build just auto-downloads archivebox's default sonic.cfg as a convenience
# not needed if you have already have /etc/sonic.cfg
dockerfile_inline: |
FROM quay.io/curl/curl:latest AS setup
RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg
FROM valeriansaliou/sonic:latest
COPY --from=setup /tmp/sonic.cfg /etc/sonic.cfg
expose:
- 1491
environment:
- SEARCH_BACKEND_PASSWORD=SomeSecretPassword
volumes:
- ./etc/sonic.cfg:/etc/sonic.cfg
- ./data/sonic:/var/lib/sonic/store
### Example: Watch the ArchiveBox browser in realtime as it archives things,
# or remote control it to set up logins and credentials for sites you want to archive.
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
novnc:
image: theasp/novnc:latest
environment:
- DISPLAY_WIDTH=1920
- DISPLAY_HEIGHT=1080
- RUN_XTERM=no
ports:
# to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html
- "8080:8080"
### Example: Put Nginx in front of the ArchiveBox server for SSL termination
# nginx:
# image: nginx:alpine
# ports:
# - 443:443
# - 80:80
# volumes:
# - ./etc/nginx.conf:/etc/nginx/nginx.conf
# - ./data:/var/www
### Example: To run pihole in order to block ad/tracker requests during archiving,
@ -99,57 +134,6 @@ services:
# - ./etc/dnsmasq:/etc/dnsmasq.d
### Example: Enable ability to run regularly scheduled archiving tasks by uncommenting this container
# $ docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml'
# then restart the scheduler container to apply the changes to the schedule
# $ docker compose restart archivebox_scheduler
# archivebox_scheduler:
# image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
# command: schedule --foreground
# environment:
# - MEDIA_MAX_SIZE=750m # increase this number to allow archiving larger audio/video files
# # - TIMEOUT=60 # increase if you see timeouts often during archiving / on slow networks
# # - ONLY_NEW=True # set to False to retry previously failed URLs when re-adding instead of skipping them
# # - CHECK_SSL_VALIDITY=True # set to False to allow saving URLs w/ broken SSL certs
# # - SAVE_ARCHIVE_DOT_ORG=True # set to False to disable submitting URLs to Archive.org when archiving
# # - PUID=502 # set to your host user's UID & GID if you encounter permissions issues
# # - PGID=20
# volumes:
# - ./data:/data
# - ./etc/crontabs:/var/spool/cron/crontabs
# # cpus: 2 # uncomment / edit these values to limit container resource consumption
# # mem_limit: 2048m
# # shm_size: 1024m
### Example: Put Nginx in front of the ArchiveBox server for SSL termination
# nginx:
# image: nginx:alpine
# ports:
# - 443:443
# - 80:80
# volumes:
# - ./etc/nginx.conf:/etc/nginx/nginx.conf
# - ./data:/var/www
### Example: Watch the ArchiveBox browser in realtime as it archives things,
# or remote control it to set up logins and credentials for sites you want to archive.
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
novnc:
image: theasp/novnc:latest
environment:
- DISPLAY_WIDTH=1920
- DISPLAY_HEIGHT=1080
- RUN_XTERM=no
ports:
# to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html
- "8080:8080"
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
# wireguard:
@ -166,6 +150,13 @@ services:
# - ./wireguard.conf:/config/wg0.conf:ro
### Example: Run browsertrix in parallel with ArchiveBox
# browsertrix:
# image: webrecorder/browsertrix-crawler:latest
# volumes:
# - ./browsertrix:/crawls:z
### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox
# pywb:
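Taken together, a hedged sketch of a typical first run with this compose file (`init --setup` and `update --index-only` are the usual first-run steps, but adapt as needed):

```bash
docker compose up -d                                 # start archivebox plus any add-on services left uncommented
docker compose run archivebox init --setup           # create the first admin user interactively
docker compose run archivebox update --index-only    # backfill existing Snapshots into the Sonic full-text index
```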

package.json

@ -9,6 +9,6 @@
{
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
}

pyproject.toml

@ -15,6 +15,15 @@ dependencies = [
"requests>=2.24.0",
"croniter>=0.3.34",
"dateparser>=1.0.0",
<<<<<<< HEAD
=======
"django-extensions>=3.2.3",
"django>=4.2.0,<5.0",
"setuptools>=69.0.3",
"feedparser>=6.0.11",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
>>>>>>> dev
"python-crontab>=2.5.1",
"django>=3.1.3,<3.2",
"django-extensions>=3.0.3",
@ -22,9 +31,13 @@ dependencies = [
"w3lib>=1.22.0",
"yt-dlp>=2024.3.10",
# don't add playwright because packages without sdists cause trouble on many build systems that refuse to install wheel-only packages
<<<<<<< HEAD
# "playwright>=1.39.0; platform_machine != 'armv7l'",
"mypy-extensions>=0.4.3",
# "django-stubs-ext>=4.2.7",
=======
"playwright>=1.39.0; platform_machine != 'armv7l'",
>>>>>>> dev
]
classifiers = [
@ -65,11 +78,11 @@ classifiers = [
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
# apt install sonic
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev python3-ldap
"setuptools>=69.0.3",
"python-ldap>=3.4.3",
"django-auth-ldap>=4.1.0",
]
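As an aside, a hedged sketch of how end users opt into these optional dependency groups when installing from PyPI:

```bash
# install ArchiveBox together with the optional sonic + ldap extras defined above
pip install 'archivebox[sonic,ldap]'
```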
@ -84,7 +97,6 @@ ldap = [
[tool.pdm.dev-dependencies]
dev = [
# building
"setuptools>=69.0.3",
"wheel",
"pdm",
"homebrew-pypi-poet>=0.10.0",