mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-06-21 19:10:45 -04:00
release: archivebox 0.9.31rc30
This commit is contained in:
+38
-28
@@ -267,7 +267,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y --no-install-recommends build-essential gcc python3-dev \
|
||||
&& uv sync \
|
||||
--no-cache \
|
||||
--no-dev \
|
||||
--inexact \
|
||||
--all-extras \
|
||||
@@ -281,24 +280,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
# installs the pip packages that archivebox depends on, defined in pyproject.toml dependencies
|
||||
|
||||
# Install ArchiveBox Python package from the checked-out source.
|
||||
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
|
||||
&& pip install \
|
||||
--no-deps \
|
||||
"$CODE_DIR" \
|
||||
&& ( \
|
||||
pip show archivebox \
|
||||
&& which archivebox \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt \
|
||||
&& find /venv "$CODE_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
|
||||
&& find /venv "$CODE_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete
|
||||
# installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces
|
||||
|
||||
####################################################
|
||||
|
||||
# Setup ArchiveBox runtime config
|
||||
ENV TMP_DIR=/tmp/archivebox \
|
||||
PIP_VENV_PYTHON=/usr/bin/python3.12 \
|
||||
@@ -318,18 +299,47 @@ RUN openssl rand -hex 16 > /etc/machine-id \
|
||||
&& chown "$DEFAULT_PUID:$DEFAULT_PGID" "$PLAYWRIGHT_BROWSERS_PATH" \
|
||||
&& echo -e "\nTMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nPLAYWRIGHT_BROWSERS_PATH=$PLAYWRIGHT_BROWSERS_PATH\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt
|
||||
|
||||
# Pre-bake plugin-managed runtime dependencies using the same installer paths
|
||||
# users run later via archivebox init --install / archivebox install. Build-time
|
||||
# runs as root so providers can satisfy OS-level deps, then ownership is
|
||||
# returned to the runtime archivebox user.
|
||||
RUN echo "[+] Initializing image collection and installing plugin runtime dependencies into $LIB_DIR..." \
|
||||
&& PUID=0 PGID=0 archivebox init --install \
|
||||
&& find /venv "$CODE_DIR" "$LIB_DIR" "$DATA_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
|
||||
&& find /venv "$CODE_DIR" "$LIB_DIR" "$DATA_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
|
||||
# Pre-bake plugin-managed runtime dependencies using the same abx-dl installer
|
||||
# path users run later, before copying ArchiveBox source so source-only edits do
|
||||
# not invalidate the heavy browser/plugin dependency layer.
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
|
||||
--mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
--mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
|
||||
--mount=type=cache,target=/root/.cache/puppeteer,sharing=locked,id=puppeteer-$TARGETARCH$TARGETVARIANT \
|
||||
--mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] Installing plugin runtime dependencies into $LIB_DIR..." \
|
||||
&& PUID=0 PGID=0 abx-dl plugins --install \
|
||||
&& find "$LIB_DIR" "$DATA_DIR"/personas -type d -name __pycache__ -prune -exec rm -rf {} + \
|
||||
&& find "$LIB_DIR" "$DATA_DIR"/personas -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
|
||||
&& rm -rf /root/.cache /var/cache/apt/* /var/lib/apt/lists/* \
|
||||
&& (chown "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR" "$DATA_DIR"/logs "$DATA_DIR"/sources "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas "$DATA_DIR"/index.sqlite3 "$DATA_DIR"/ArchiveBox.conf 2>/dev/null || true) \
|
||||
&& (chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR"/personas 2>/dev/null || true) \
|
||||
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$LIB_DIR"
|
||||
|
||||
# Install ArchiveBox Python package from the checked-out source.
|
||||
WORKDIR "$CODE_DIR"
|
||||
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
|
||||
&& pip install \
|
||||
--no-deps \
|
||||
"$CODE_DIR" \
|
||||
&& ( \
|
||||
pip show archivebox \
|
||||
&& which archivebox \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt \
|
||||
&& find /venv "$CODE_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
|
||||
&& find /venv "$CODE_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete
|
||||
# installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces
|
||||
|
||||
# Initialize an empty image collection without rerunning dependency installs.
|
||||
WORKDIR "$DATA_DIR"
|
||||
RUN echo "[+] Initializing image collection..." \
|
||||
&& PUID=0 PGID=0 archivebox init \
|
||||
&& find "$DATA_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \
|
||||
&& find "$DATA_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \
|
||||
&& (chown "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR" "$DATA_DIR"/logs "$DATA_DIR"/sources "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas "$DATA_DIR"/index.sqlite3 "$DATA_DIR"/ArchiveBox.conf 2>/dev/null || true)
|
||||
|
||||
# Print version for nice docker finish summary
|
||||
RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \
|
||||
&& echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
|
||||
|
||||
@@ -58,6 +58,7 @@ class StatusChoices(str, Enum):
|
||||
|
||||
class AddCommandSchema(Schema):
|
||||
urls: list[str]
|
||||
snapshot_ids: list[str] | None = None
|
||||
tag: str = ""
|
||||
depth: int = 0
|
||||
parser: str = "auto"
|
||||
@@ -119,6 +120,7 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
|
||||
|
||||
crawl, snapshots = add(
|
||||
urls=args.urls,
|
||||
snapshot_ids=args.snapshot_ids,
|
||||
tag=args.tag,
|
||||
depth=args.depth,
|
||||
update=args.update,
|
||||
|
||||
@@ -4,6 +4,7 @@ __package__ = "archivebox.cli"
|
||||
__command__ = "archivebox add"
|
||||
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -47,6 +48,7 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
|
||||
@enforce_types
|
||||
def add(
|
||||
urls: str | list[str],
|
||||
snapshot_ids: list[str] | None = None,
|
||||
depth: int | str = 0,
|
||||
max_urls: int = 0,
|
||||
max_size: int | str = 0,
|
||||
@@ -102,10 +104,22 @@ def add(
|
||||
if update is None:
|
||||
update = not config.ONLY_NEW
|
||||
|
||||
if isinstance(urls, str):
|
||||
url_list = [line.strip() for line in urls.splitlines() if line.strip()]
|
||||
else:
|
||||
url_list = [str(url).strip() for url in urls if str(url).strip()]
|
||||
if snapshot_ids and len(snapshot_ids) != len(url_list):
|
||||
raise ValueError("snapshot_ids length must match urls length")
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
|
||||
if snapshot_ids:
|
||||
sources_file.write_text(
|
||||
"\n".join(json.dumps({"url": url, "id": snapshot_ids[index], "tags": tag, "depth": 0}) for index, url in enumerate(url_list)),
|
||||
)
|
||||
else:
|
||||
sources_file.write_text("\n".join(url_list))
|
||||
|
||||
# 2. Create a new Crawl with inline URLs
|
||||
cli_args = [*sys.argv]
|
||||
|
||||
@@ -60,13 +60,13 @@ class ArchiveBoxAdmin(admin.AdminSite):
|
||||
except (IndexError, TypeError, ValueError):
|
||||
continue
|
||||
if count >= 1_000_000_000:
|
||||
count_label = f"~{count / 1_000_000_000:.1f}B"
|
||||
count_label = f"{count / 1_000_000_000:.1f}B"
|
||||
elif count >= 1_000_000:
|
||||
count_label = f"~{count / 1_000_000:.1f}M"
|
||||
count_label = f"{count / 1_000_000:.1f}M"
|
||||
elif count >= 1_000:
|
||||
count_label = f"~{count / 1_000:.1f}K"
|
||||
count_label = f"{count / 1_000:.1f}K"
|
||||
else:
|
||||
count_label = f"~{count:,}"
|
||||
count_label = f"{count:,}"
|
||||
count_label = count_label.replace(".0", "")
|
||||
for model in models:
|
||||
model["object_count"] = count
|
||||
|
||||
@@ -8,7 +8,8 @@ from django.utils.html import escape, format_html, format_html_join
|
||||
from django.utils import timezone
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.contrib import admin, messages
|
||||
from django.db.models import Count, Q
|
||||
from django.db.models import Count, IntegerField, OuterRef, Q, Subquery, Value
|
||||
from django.db.models.functions import Coalesce
|
||||
|
||||
|
||||
from django_object_actions import action
|
||||
@@ -576,7 +577,15 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
def get_queryset(self, request):
|
||||
"""Optimize queries with select_related and annotations."""
|
||||
qs = super().get_queryset(request)
|
||||
return qs.select_related("schedule", "created_by").annotate(num_snapshots_cached=Count("snapshot_set"))
|
||||
snapshot_count = (
|
||||
Snapshot.objects.filter(crawl_id=OuterRef("pk")).order_by().values("crawl_id").annotate(count=Count("pk")).values("count")
|
||||
)
|
||||
return qs.select_related("schedule", "created_by").annotate(
|
||||
num_snapshots_cached=Coalesce(
|
||||
Subquery(snapshot_count, output_field=IntegerField()),
|
||||
Value(0),
|
||||
),
|
||||
)
|
||||
|
||||
def get_fieldsets(self, request, obj=None):
|
||||
return self.fieldsets if obj else self.add_fieldsets
|
||||
|
||||
+27
-16
@@ -635,12 +635,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
# Parse JSONL or plain URL
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
snapshot_id = entry.get("id") or entry.get("snapshot_id")
|
||||
url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip()))
|
||||
depth = entry.get("depth", 0)
|
||||
title = entry.get("title")
|
||||
timestamp = entry.get("timestamp")
|
||||
tags = entry.get("tags", "")
|
||||
except json.JSONDecodeError:
|
||||
snapshot_id = None
|
||||
url = sanitize_extracted_url(fix_url_from_markdown(line.strip()))
|
||||
depth = 0
|
||||
title = None
|
||||
@@ -660,25 +662,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if not self.has_remaining_snapshot_capacity():
|
||||
break
|
||||
|
||||
# Create snapshot if doesn't exist
|
||||
snapshot, created = Snapshot.objects.get_or_create(
|
||||
url=url,
|
||||
crawl=self,
|
||||
defaults={
|
||||
"depth": depth,
|
||||
"title": title,
|
||||
"timestamp": timestamp or str(timezone.now().timestamp()),
|
||||
"status": Snapshot.INITIAL_STATE,
|
||||
"retry_at": timezone.now(),
|
||||
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
|
||||
},
|
||||
)
|
||||
defaults = {
|
||||
"depth": depth,
|
||||
"title": title,
|
||||
"timestamp": timestamp or str(timezone.now().timestamp()),
|
||||
"status": Snapshot.INITIAL_STATE,
|
||||
"retry_at": timezone.now(),
|
||||
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
|
||||
}
|
||||
if snapshot_id:
|
||||
snapshot, created = Snapshot.objects.update_or_create(
|
||||
id=snapshot_id,
|
||||
defaults={
|
||||
**defaults,
|
||||
"url": url,
|
||||
"crawl": self,
|
||||
},
|
||||
)
|
||||
else:
|
||||
snapshot, created = Snapshot.objects.get_or_create(
|
||||
url=url,
|
||||
crawl=self,
|
||||
defaults=defaults,
|
||||
)
|
||||
|
||||
if created:
|
||||
created_snapshots.append(snapshot)
|
||||
# Save tags if present
|
||||
if tags:
|
||||
snapshot.save_tags(tags.split(","))
|
||||
if tags:
|
||||
snapshot.save_tags(tags.split(","))
|
||||
|
||||
# Ensure crawl -> snapshot symlink exists for both new and existing snapshots
|
||||
try:
|
||||
|
||||
@@ -9,14 +9,16 @@
|
||||
<span class="abx-admin-card__icon" aria-hidden="true">
|
||||
<svg viewBox="0 0 24 24" role="presentation" focusable="false">
|
||||
{% if model.object_name == "Crawl" %}
|
||||
<path d="M4 7h6m4 0h6M7 7v5a3 3 0 0 0 3 3h4m3-3 3 3-3 3" />
|
||||
<path d="M10 4h4v6h-4z" />
|
||||
<path d="M7 5h10a2 2 0 0 1 2 2v10" />
|
||||
<path d="M5 8h10a2 2 0 0 1 2 2v10H5z" />
|
||||
<path d="M8 12h6M8 16h4" />
|
||||
{% elif model.object_name == "CrawlSchedule" %}
|
||||
<path d="M7 3v4M17 3v4M4 9h16M6 5h12a2 2 0 0 1 2 2v11a2 2 0 0 1-2 2H6a2 2 0 0 1-2-2V7a2 2 0 0 1 2-2z" />
|
||||
<path d="M9 14h3v3" />
|
||||
{% elif model.object_name == "Snapshot" %}
|
||||
<path d="M5 4h14a2 2 0 0 1 2 2v12a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V6a2 2 0 0 1 2-2z" />
|
||||
<path d="m7 15 3-3 2 2 4-5 3 4v3H7z" />
|
||||
<path d="M6 3h9l4 4v14H6z" />
|
||||
<path d="M15 3v5h5" />
|
||||
<path d="M10 14h4M12 12v4" />
|
||||
{% elif model.object_name == "ArchiveResult" %}
|
||||
<path d="M6 3h8l4 4v14H6z" />
|
||||
<path d="M14 3v5h5M9 14l2 2 4-5" />
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "archivebox",
|
||||
"version": "0.9.31rc26",
|
||||
"version": "0.9.31rc30",
|
||||
"repository": "github:ArchiveBox/ArchiveBox",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
|
||||
+4
-4
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "archivebox"
|
||||
version = "0.9.31rc27"
|
||||
version = "0.9.31rc30"
|
||||
requires-python = ">=3.13"
|
||||
description = "Self-hosted internet archiving solution."
|
||||
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
|
||||
@@ -79,9 +79,9 @@ dependencies = [
|
||||
### Extractor dependencies (optional binaries detected at runtime via shutil.which)
|
||||
### Binary/Package Management
|
||||
"abxbus>=2.5.4", # EventBus API
|
||||
"abxpkg>=1.10.17", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
|
||||
"abx-plugins>=1.10.82", # shared ArchiveBox plugin package with Chrome/Puppeteer dependency wiring
|
||||
"abx-dl>=1.10.82", # shared ArchiveBox downloader package with blocking install preflight
|
||||
"abxpkg>=1.10.20", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
|
||||
"abx-plugins>=1.10.85", # shared ArchiveBox plugin package with Chrome/Puppeteer dependency wiring
|
||||
"abx-dl>=1.10.85", # shared ArchiveBox downloader package with blocking install preflight
|
||||
### UUID7 backport for Python <3.14
|
||||
"uuid7>=0.1.0; python_version < '3.14'", # provides the uuid_extensions module on Python 3.13
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user