diff --git a/Dockerfile b/Dockerfile index f16afed9..c72854fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -267,7 +267,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends build-essential gcc python3-dev \ && uv sync \ - --no-cache \ --no-dev \ --inexact \ --all-extras \ @@ -281,24 +280,6 @@ RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \ && rm -rf /var/lib/apt/lists/* # installs the pip packages that archivebox depends on, defined in pyproject.toml dependencies -# Install ArchiveBox Python package from the checked-out source. -COPY --chown=root:root --chmod=755 "." "$CODE_DIR/" -RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ - echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \ - && pip install \ - --no-deps \ - "$CODE_DIR" \ - && ( \ - pip show archivebox \ - && which archivebox \ - && echo -e '\n\n' \ - ) | tee -a /VERSION.txt \ - && find /venv "$CODE_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \ - && find /venv "$CODE_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete - # installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces - -#################################################### - # Setup ArchiveBox runtime config ENV TMP_DIR=/tmp/archivebox \ PIP_VENV_PYTHON=/usr/bin/python3.12 \ @@ -318,18 +299,47 @@ RUN openssl rand -hex 16 > /etc/machine-id \ && chown "$DEFAULT_PUID:$DEFAULT_PGID" "$PLAYWRIGHT_BROWSERS_PATH" \ && echo -e "\nTMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nPLAYWRIGHT_BROWSERS_PATH=$PLAYWRIGHT_BROWSERS_PATH\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt -# Pre-bake plugin-managed runtime dependencies using the same installer paths -# users run later via archivebox init --install / archivebox install. Build-time -# runs as root so providers can satisfy OS-level deps, then ownership is -# returned to the runtime archivebox user. -RUN echo "[+] Initializing image collection and installing plugin runtime dependencies into $LIB_DIR..." \ - && PUID=0 PGID=0 archivebox init --install \ - && find /venv "$CODE_DIR" "$LIB_DIR" "$DATA_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \ - && find /venv "$CODE_DIR" "$LIB_DIR" "$DATA_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \ +# Pre-bake plugin-managed runtime dependencies using the same abx-dl installer +# path users run later, before copying ArchiveBox source so source-only edits do +# not invalidate the heavy browser/plugin dependency layer. +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/puppeteer,sharing=locked,id=puppeteer-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \ + echo "[+] Installing plugin runtime dependencies into $LIB_DIR..." \ + && PUID=0 PGID=0 abx-dl plugins --install \ + && find "$LIB_DIR" "$DATA_DIR"/personas -type d -name __pycache__ -prune -exec rm -rf {} + \ + && find "$LIB_DIR" "$DATA_DIR"/personas -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \ && rm -rf /root/.cache /var/cache/apt/* /var/lib/apt/lists/* \ - && (chown "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR" "$DATA_DIR"/logs "$DATA_DIR"/sources "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas "$DATA_DIR"/index.sqlite3 "$DATA_DIR"/ArchiveBox.conf 2>/dev/null || true) \ + && (chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR"/personas 2>/dev/null || true) \ && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$LIB_DIR" +# Install ArchiveBox Python package from the checked-out source. +WORKDIR "$CODE_DIR" +COPY --chown=root:root --chmod=755 "." "$CODE_DIR/" +RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \ + && pip install \ + --no-deps \ + "$CODE_DIR" \ + && ( \ + pip show archivebox \ + && which archivebox \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt \ + && find /venv "$CODE_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \ + && find /venv "$CODE_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete + # installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces + +# Initialize an empty image collection without rerunning dependency installs. +WORKDIR "$DATA_DIR" +RUN echo "[+] Initializing image collection..." \ + && PUID=0 PGID=0 archivebox init \ + && find "$DATA_DIR" -type d -name __pycache__ -prune -exec rm -rf {} + \ + && find "$DATA_DIR" -type f \( -name '*.pyc' -o -name '*.pyo' \) -delete \ + && (chown "$DEFAULT_PUID:$DEFAULT_PGID" "$DATA_DIR" "$DATA_DIR"/logs "$DATA_DIR"/sources "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas "$DATA_DIR"/index.sqlite3 "$DATA_DIR"/ArchiveBox.conf 2>/dev/null || true) + # Print version for nice docker finish summary RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \ && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \ diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 17ac4985..737a41e3 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -58,6 +58,7 @@ class StatusChoices(str, Enum): class AddCommandSchema(Schema): urls: list[str] + snapshot_ids: list[str] | None = None tag: str = "" depth: int = 0 parser: str = "auto" @@ -119,6 +120,7 @@ def cli_add(request: HttpRequest, args: AddCommandSchema): crawl, snapshots = add( urls=args.urls, + snapshot_ids=args.snapshot_ids, tag=args.tag, depth=args.depth, update=args.update, diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index f4d6c638..3c04eb45 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -4,6 +4,7 @@ __package__ = "archivebox.cli" __command__ = "archivebox add" import sys +import json from pathlib import Path from typing import TYPE_CHECKING @@ -47,6 +48,7 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]: @enforce_types def add( urls: str | list[str], + snapshot_ids: list[str] | None = None, depth: int | str = 0, max_urls: int = 0, max_size: int | str = 0, @@ -102,10 +104,22 @@ def add( if update is None: update = not config.ONLY_NEW + if isinstance(urls, str): + url_list = [line.strip() for line in urls.splitlines() if line.strip()] + else: + url_list = [str(url).strip() for url in urls if str(url).strip()] + if snapshot_ids and len(snapshot_ids) != len(url_list): + raise ValueError("snapshot_ids length must match urls length") + # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt" sources_file.parent.mkdir(parents=True, exist_ok=True) - sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls)) + if snapshot_ids: + sources_file.write_text( + "\n".join(json.dumps({"url": url, "id": snapshot_ids[index], "tags": tag, "depth": 0}) for index, url in enumerate(url_list)), + ) + else: + sources_file.write_text("\n".join(url_list)) # 2. Create a new Crawl with inline URLs cli_args = [*sys.argv] diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py index 0d7cd2b3..ff5bfe97 100644 --- a/archivebox/core/admin_site.py +++ b/archivebox/core/admin_site.py @@ -60,13 +60,13 @@ class ArchiveBoxAdmin(admin.AdminSite): except (IndexError, TypeError, ValueError): continue if count >= 1_000_000_000: - count_label = f"~{count / 1_000_000_000:.1f}B" + count_label = f"{count / 1_000_000_000:.1f}B" elif count >= 1_000_000: - count_label = f"~{count / 1_000_000:.1f}M" + count_label = f"{count / 1_000_000:.1f}M" elif count >= 1_000: - count_label = f"~{count / 1_000:.1f}K" + count_label = f"{count / 1_000:.1f}K" else: - count_label = f"~{count:,}" + count_label = f"{count:,}" count_label = count_label.replace(".0", "") for model in models: model["object_count"] = count diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index 31c535fd..5a982fc2 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -8,7 +8,8 @@ from django.utils.html import escape, format_html, format_html_join from django.utils import timezone from django.utils.safestring import mark_safe from django.contrib import admin, messages -from django.db.models import Count, Q +from django.db.models import Count, IntegerField, OuterRef, Q, Subquery, Value +from django.db.models.functions import Coalesce from django_object_actions import action @@ -576,7 +577,15 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): def get_queryset(self, request): """Optimize queries with select_related and annotations.""" qs = super().get_queryset(request) - return qs.select_related("schedule", "created_by").annotate(num_snapshots_cached=Count("snapshot_set")) + snapshot_count = ( + Snapshot.objects.filter(crawl_id=OuterRef("pk")).order_by().values("crawl_id").annotate(count=Count("pk")).values("count") + ) + return qs.select_related("schedule", "created_by").annotate( + num_snapshots_cached=Coalesce( + Subquery(snapshot_count, output_field=IntegerField()), + Value(0), + ), + ) def get_fieldsets(self, request, obj=None): return self.fieldsets if obj else self.add_fieldsets diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 26596dfe..a37f7286 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -635,12 +635,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Parse JSONL or plain URL try: entry = json.loads(line) + snapshot_id = entry.get("id") or entry.get("snapshot_id") url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip())) depth = entry.get("depth", 0) title = entry.get("title") timestamp = entry.get("timestamp") tags = entry.get("tags", "") except json.JSONDecodeError: + snapshot_id = None url = sanitize_extracted_url(fix_url_from_markdown(line.strip())) depth = 0 title = None @@ -660,25 +662,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if not self.has_remaining_snapshot_capacity(): break - # Create snapshot if doesn't exist - snapshot, created = Snapshot.objects.get_or_create( - url=url, - crawl=self, - defaults={ - "depth": depth, - "title": title, - "timestamp": timestamp or str(timezone.now().timestamp()), - "status": Snapshot.INITIAL_STATE, - "retry_at": timezone.now(), - # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl - }, - ) + defaults = { + "depth": depth, + "title": title, + "timestamp": timestamp or str(timezone.now().timestamp()), + "status": Snapshot.INITIAL_STATE, + "retry_at": timezone.now(), + # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl + } + if snapshot_id: + snapshot, created = Snapshot.objects.update_or_create( + id=snapshot_id, + defaults={ + **defaults, + "url": url, + "crawl": self, + }, + ) + else: + snapshot, created = Snapshot.objects.get_or_create( + url=url, + crawl=self, + defaults=defaults, + ) if created: created_snapshots.append(snapshot) - # Save tags if present - if tags: - snapshot.save_tags(tags.split(",")) + if tags: + snapshot.save_tags(tags.split(",")) # Ensure crawl -> snapshot symlink exists for both new and existing snapshots try: diff --git a/archivebox/templates/admin/index_model_card.html b/archivebox/templates/admin/index_model_card.html index 1b4e96ae..606a48fa 100644 --- a/archivebox/templates/admin/index_model_card.html +++ b/archivebox/templates/admin/index_model_card.html @@ -9,14 +9,16 @@