release: archivebox 0.9.31rc30

This commit is contained in:
Nick Sweeting
2026-05-24 02:05:13 -07:00
parent 035132ded5
commit cadd3f517d
9 changed files with 108 additions and 60 deletions
+38 -28
View File
@@ -267,7 +267,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
&& apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends build-essential gcc python3-dev \
&& uv sync \
--no-cache \
--no-dev \
--inexact \
--all-extras \
@@ -281,24 +280,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
&& rm -rf /var/lib/apt/lists/*
# installs the pip packages that archivebox depends on, defined in pyproject.toml dependencies
# Install ArchiveBox Python package from the checked-out source.
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
&& pip install \
--no-deps \
"$CODE_DIR" \
&& ( \
pip show archivebox \
&& which archivebox \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt \
&& find /venv "$CODE_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
&& find /venv "$CODE_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete
# installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces
####################################################
# Setup ArchiveBox runtime config
ENV TMP_DIR=/tmp/archivebox \
PIP_VENV_PYTHON=/usr/bin/python3.12 \
@@ -318,18 +299,47 @@ RUN openssl rand -hex 16 > /etc/machine-id \
&& chown "$DEFAULT_PUID:$DEFAULT_PGID" "$PLAYWRIGHT_BROWSERS_PATH" \
&& echo -e "\nTMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nPLAYWRIGHT_BROWSERS_PATH=$PLAYWRIGHT_BROWSERS_PATH\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt
# Pre-bake plugin-managed runtime dependencies using the same installer paths
# that users run later via `archivebox init --install` / `archivebox install`.
# The build-time install runs as root so providers can satisfy OS-level deps;
# ownership is then returned to the runtime archivebox user.
RUN echo "[+] Initializing image collection and installing plugin runtime dependencies into $LIB_DIR..." \
&& PUID=0 PGID=0 archivebox init --install \
&& find /venv "$CODE_DIR" "$LIB_DIR" "$DATA_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
&& find /venv "$CODE_DIR" "$LIB_DIR" "$DATA_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
# Pre-bake plugin-managed runtime dependencies using the same abx-dl installer
# path that users run later. This happens before the ArchiveBox source is copied
# in, so that source-only edits do not invalidate the heavy browser/plugin
# dependency layer.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
--mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
--mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
--mount=type=cache,target=/root/.cache/puppeteer,sharing=locked,id=puppeteer-$TARGETARCH$TARGETVARIANT \
--mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing plugin runtime dependencies into $LIB_DIR..." \
&& PUID=0 PGID=0 abx-dl plugins --install \
&& find "$LIB_DIR" "$DATA_DIR"/personas -type d -name __pycache__ -prune -exec rm -rf {} + \
&& find "$LIB_DIR" "$DATA_DIR"/personas -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
&& rm -rf /root/.cache /var/cache/apt/* /var/lib/apt/lists/* \
&& (chown "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR" "$DATA_DIR"/logs "$DATA_DIR"/sources "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas "$DATA_DIR"/index.sqlite3 "$DATA_DIR"/ArchiveBox.conf 2>/dev/null || true) \
&& (chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR"/personas 2>/dev/null || true) \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$LIB_DIR"
# Install ArchiveBox Python package from the checked-out source.
WORKDIR "$CODE_DIR"
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
&& pip install \
--no-deps \
"$CODE_DIR" \
&& ( \
pip show archivebox \
&& which archivebox \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt \
&& find /venv "$CODE_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
&& find /venv "$CODE_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete
# installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces
# Initialize an empty image collection without rerunning dependency installs.
WORKDIR "$DATA_DIR"
RUN echo "[+] Initializing image collection..." \
&& PUID=0 PGID=0 archivebox init \
&& find "$DATA_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
&& find "$DATA_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
&& (chown "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR" "$DATA_DIR"/logs "$DATA_DIR"/sources "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas "$DATA_DIR"/index.sqlite3 "$DATA_DIR"/ArchiveBox.conf 2>/dev/null || true)
# Print the version info as a nice summary at the end of the Docker build
RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \
&& echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
+2
View File
@@ -58,6 +58,7 @@ class StatusChoices(str, Enum):
class AddCommandSchema(Schema):
urls: list[str]
snapshot_ids: list[str] | None = None
tag: str = ""
depth: int = 0
parser: str = "auto"
@@ -119,6 +120,7 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
crawl, snapshots = add(
urls=args.urls,
snapshot_ids=args.snapshot_ids,
tag=args.tag,
depth=args.depth,
update=args.update,
+15 -1
View File
@@ -4,6 +4,7 @@ __package__ = "archivebox.cli"
__command__ = "archivebox add"
import sys
import json
from pathlib import Path
from typing import TYPE_CHECKING
@@ -47,6 +48,7 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
@enforce_types
def add(
urls: str | list[str],
snapshot_ids: list[str] | None = None,
depth: int | str = 0,
max_urls: int = 0,
max_size: int | str = 0,
@@ -102,10 +104,22 @@ def add(
if update is None:
update = not config.ONLY_NEW
if isinstance(urls, str):
url_list = [line.strip() for line in urls.splitlines() if line.strip()]
else:
url_list = [str(url).strip() for url in urls if str(url).strip()]
if snapshot_ids and len(snapshot_ids) != len(url_list):
raise ValueError("snapshot_ids length must match urls length")
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
if snapshot_ids:
sources_file.write_text(
"\n".join(json.dumps({"url": url, "id": snapshot_ids[index], "tags": tag, "depth": 0}) for index, url in enumerate(url_list)),
)
else:
sources_file.write_text("\n".join(url_list))
# 2. Create a new Crawl with inline URLs
cli_args = [*sys.argv]
+4 -4
View File
@@ -60,13 +60,13 @@ class ArchiveBoxAdmin(admin.AdminSite):
except (IndexError, TypeError, ValueError):
continue
if count >= 1_000_000_000:
count_label = f"~{count / 1_000_000_000:.1f}B"
count_label = f"{count / 1_000_000_000:.1f}B"
elif count >= 1_000_000:
count_label = f"~{count / 1_000_000:.1f}M"
count_label = f"{count / 1_000_000:.1f}M"
elif count >= 1_000:
count_label = f"~{count / 1_000:.1f}K"
count_label = f"{count / 1_000:.1f}K"
else:
count_label = f"~{count:,}"
count_label = f"{count:,}"
count_label = count_label.replace(".0", "")
for model in models:
model["object_count"] = count
+11 -2
View File
@@ -8,7 +8,8 @@ from django.utils.html import escape, format_html, format_html_join
from django.utils import timezone
from django.utils.safestring import mark_safe
from django.contrib import admin, messages
from django.db.models import Count, Q
from django.db.models import Count, IntegerField, OuterRef, Q, Subquery, Value
from django.db.models.functions import Coalesce
from django_object_actions import action
@@ -576,7 +577,15 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
def get_queryset(self, request):
"""Optimize queries with select_related and annotations."""
qs = super().get_queryset(request)
return qs.select_related("schedule", "created_by").annotate(num_snapshots_cached=Count("snapshot_set"))
snapshot_count = (
Snapshot.objects.filter(crawl_id=OuterRef("pk")).order_by().values("crawl_id").annotate(count=Count("pk")).values("count")
)
return qs.select_related("schedule", "created_by").annotate(
num_snapshots_cached=Coalesce(
Subquery(snapshot_count, output_field=IntegerField()),
Value(0),
),
)
def get_fieldsets(self, request, obj=None):
return self.fieldsets if obj else self.add_fieldsets
+27 -16
View File
@@ -635,12 +635,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Parse JSONL or plain URL
try:
entry = json.loads(line)
snapshot_id = entry.get("id") or entry.get("snapshot_id")
url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip()))
depth = entry.get("depth", 0)
title = entry.get("title")
timestamp = entry.get("timestamp")
tags = entry.get("tags", "")
except json.JSONDecodeError:
snapshot_id = None
url = sanitize_extracted_url(fix_url_from_markdown(line.strip()))
depth = 0
title = None
@@ -660,25 +662,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if not self.has_remaining_snapshot_capacity():
break
# Create the snapshot if it doesn't exist yet
snapshot, created = Snapshot.objects.get_or_create(
url=url,
crawl=self,
defaults={
"depth": depth,
"title": title,
"timestamp": timestamp or str(timezone.now().timestamp()),
"status": Snapshot.INITIAL_STATE,
"retry_at": timezone.now(),
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
},
)
defaults = {
"depth": depth,
"title": title,
"timestamp": timestamp or str(timezone.now().timestamp()),
"status": Snapshot.INITIAL_STATE,
"retry_at": timezone.now(),
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
}
if snapshot_id:
snapshot, created = Snapshot.objects.update_or_create(
id=snapshot_id,
defaults={
**defaults,
"url": url,
"crawl": self,
},
)
else:
snapshot, created = Snapshot.objects.get_or_create(
url=url,
crawl=self,
defaults=defaults,
)
if created:
created_snapshots.append(snapshot)
# Save tags if present
if tags:
snapshot.save_tags(tags.split(","))
if tags:
snapshot.save_tags(tags.split(","))
# Ensure crawl -> snapshot symlink exists for both new and existing snapshots
try:
@@ -9,14 +9,16 @@
<span class="abx-admin-card__icon" aria-hidden="true">
<svg viewBox="0 0 24 24" role="presentation" focusable="false">
{% if model.object_name == "Crawl" %}
<path d="M4 7h6m4 0h6M7 7v5a3 3 0 0 0 3 3h4m3-3 3 3-3 3" />
<path d="M10 4h4v6h-4z" />
<path d="M7 5h10a2 2 0 0 1 2 2v10" />
<path d="M5 8h10a2 2 0 0 1 2 2v10H5z" />
<path d="M8 12h6M8 16h4" />
{% elif model.object_name == "CrawlSchedule" %}
<path d="M7 3v4M17 3v4M4 9h16M6 5h12a2 2 0 0 1 2 2v11a2 2 0 0 1-2 2H6a2 2 0 0 1-2-2V7a2 2 0 0 1 2-2z" />
<path d="M9 14h3v3" />
{% elif model.object_name == "Snapshot" %}
<path d="M5 4h14a2 2 0 0 1 2 2v12a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V6a2 2 0 0 1 2-2z" />
<path d="m7 15 3-3 2 2 4-5 3 4v3H7z" />
<path d="M6 3h9l4 4v14H6z" />
<path d="M15 3v5h5" />
<path d="M10 14h4M12 12v4" />
{% elif model.object_name == "ArchiveResult" %}
<path d="M6 3h8l4 4v14H6z" />
<path d="M14 3v5h5M9 14l2 2 4-5" />
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.9.31rc26",
"version": "0.9.31rc30",
"repository": "github:ArchiveBox/ArchiveBox",
"license": "MIT",
"dependencies": {
+4 -4
View File
@@ -1,6 +1,6 @@
[project]
name = "archivebox"
version = "0.9.31rc27"
version = "0.9.31rc30"
requires-python = ">=3.13"
description = "Self-hosted internet archiving solution."
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
@@ -79,9 +79,9 @@ dependencies = [
### Extractor dependencies (optional binaries detected at runtime via shutil.which)
### Binary/Package Management
"abxbus>=2.5.4", # EventBus API
"abxpkg>=1.10.17", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
"abx-plugins>=1.10.82", # shared ArchiveBox plugin package with Chrome/Puppeteer dependency wiring
"abx-dl>=1.10.82", # shared ArchiveBox downloader package with blocking install preflight
"abxpkg>=1.10.20", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
"abx-plugins>=1.10.85", # shared ArchiveBox plugin package with Chrome/Puppeteer dependency wiring
"abx-dl>=1.10.85", # shared ArchiveBox downloader package with blocking install preflight
### UUID7 backport for Python <3.14
"uuid7>=0.1.0; python_version < '3.14'", # provides the uuid_extensions module on Python 3.13
]