Files
ArchiveBox/Dockerfile
2026-06-14 17:29:38 -07:00

266 lines
12 KiB
Docker

# syntax=docker/dockerfile:1.7
# Multistage ArchiveBox Dockerfile that consumes the abx-dl runtime image.
# abx-dl owns Python, Node, Chromium, and downloader plugin runtimes.
# ArchiveBox owns sonic, supervisor, Django, and the app runtime.
# Build abx-dl first, then point this file at it:
# docker buildx build ../abx-dl -f ../abx-dl/Dockerfile \
# --build-context abxbus=../abxbus \
# --build-context abxpkg=../abxpkg \
# --build-context abx-plugins=../abx-plugins \
# -t archivebox/abx-dl:dev
# docker buildx build . -f Dockerfile \
# --build-arg ABX_DL_IMAGE=archivebox/abx-dl:dev \
# -t archivebox:multistage
# Image reference used as the base of the archivebox-runtime-base stage.
# Declared before the first FROM so it can be expanded in a FROM line;
# override it with --build-arg ABX_DL_IMAGE=<image>.
ARG ABX_DL_IMAGE=archivebox/abx-dl:latest
# Named stage for the pinned sonic image; the final stage copies
# /usr/local/bin/sonic out of it.
FROM archivebox/sonic:1.4.9 AS sonic
# Common base stage: both the builder stage and the final stage start FROM it.
FROM ${ABX_DL_IMAGE} AS archivebox-runtime-base
# Platform build args. TARGETPLATFORM, TARGETARCH and TARGETVARIANT are echoed
# into /VERSION.txt, and TARGETARCH/TARGETVARIANT are used in cache-mount ids.
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
ARG TARGETVARIANT
# Locale, apt, Python, pip and npm behaviour flags.
ENV TZ=UTC \
    LANGUAGE=en_US:en \
    LC_ALL=C.UTF-8 \
    LANG=C.UTF-8 \
    DEBIAN_FRONTEND=noninteractive \
    APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
    PYTHONIOENCODING=UTF-8 \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_COMPILE=1 \
    PIP_ONLY_BINARY=aiohttp \
    npm_config_loglevel=error

# Language runtime versions; PYTHON_VERSION is passed to `uv venv --python`.
ENV PYTHON_VERSION=3.13 \
    NODE_VERSION=24

# Service account name and the numeric uid/gid it is normalised to.
ENV ARCHIVEBOX_USER=archivebox \
    DEFAULT_ARCHIVEBOX_UID=911 \
    DEFAULT_ARCHIVEBOX_GID=911 \
    IN_DOCKER=True

# Filesystem layout and Chrome defaults.
ENV CODE_DIR=/app \
    DATA_DIR=/data \
    CONFIG_DIR=/opt/archivebox \
    ABXPKG_LIB_DIR=/opt/archivebox/lib \
    PLAYWRIGHT_BROWSERS_PATH=/opt/archivebox/lib/playwright/cache \
    PERSONAS_DIR=/data/personas \
    CHROME_HEADLESS=true \
    CHROME_SANDBOX=false \
    CHROME_ISOLATION=crawl \
    CHROME_ARGS_EXTRA='["--disable-gpu","--disable-features=Translate,OptimizationGuideModelDownloading,MediaRouter"]'

# Scratch directory, venv interpreter path, and placeholder Google API values.
ENV TMP_DIR=/tmp/archivebox \
    PIP_VENV_PYTHON=/venv/bin/python3 \
    GOOGLE_API_KEY=no \
    GOOGLE_DEFAULT_CLIENT_ID=no \
    GOOGLE_DEFAULT_CLIENT_SECRET=no

# Home/XDG locations plus abxpkg install settings and the default timeout.
ENV HOME=/home/archivebox \
    XDG_CONFIG_HOME=/home/archivebox/.config \
    XDG_CACHE_HOME=/opt/archivebox/lib/cache \
    ABXPKG_INSTALL_TIMEOUT=600 \
    ABXPKG_POSTINSTALL_SCRIPTS=True \
    ABXPKG_MIN_RELEASE_AGE=0 \
    TIMEOUT=600

# uv settings; /venv is the project environment and is put first on PATH,
# followed by /opt/node/bin and then the PATH inherited from the base image.
ENV UV_COMPILE_BYTECODE=false \
    UV_PYTHON_PREFERENCE=managed \
    UV_PYTHON_INSTALL_DIR=/opt/uv/python \
    UV_LINK_MODE=copy \
    UV_PROJECT_ENVIRONMENT=/venv \
    VIRTUAL_ENV=/venv \
    PATH="/venv/bin:/opt/node/bin:$PATH"
# All following shell-form instructions run under bash with pipefail, errexit,
# errtrace and nounset enabled.
SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "nounset", "-c"]

WORKDIR "$CODE_DIR"

# Keep a copy of the base image's /VERSION.txt as /ABX-DL-VERSION.txt, then
# print the build-start banner (platform, start time, kernel, os-release and
# the output of `abxpkg load` for node and uv) and append it to /VERSION.txt.
RUN cp /VERSION.txt /ABX-DL-VERSION.txt \
    && ( \
        echo "[i] Docker build for ArchiveBox multistage starting..." \
        && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) (${TARGETARCH} ${TARGETVARIANT})" \
        && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
        && uname -a \
        && sed -n '1,7p' /etc/os-release \
        && abxpkg load --binproviders=env node \
        && abxpkg load --binproviders=env uv \
    ) | tee -a /VERSION.txt
# Builder stage: installs the compiler toolchain and builds /venv. The final
# stage copies /opt/uv/python, /venv, /app and /VERSION.txt out of it.
FROM archivebox-runtime-base AS archivebox-builder
WORKDIR "$CODE_DIR"

# Install ArchiveBox's third-party dependencies into /venv from a Docker-only
# copy of pyproject.toml (bind-mounted read-only from the build context).
# The heredoc body is executed by the SHELL configured in the base stage, so
# errexit/pipefail/nounset apply to every line below.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
    --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
    --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
    <<'EOF'
echo "[+] UV Installing ArchiveBox dependencies from pyproject.toml..."

# Keep downloaded .debs (so the apt cache mount is useful) and skip
# recommended/suggested packages.
echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache
echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-install-recommends
echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-install-suggests
rm -f /etc/apt/apt.conf.d/docker-clean

# Toolchain and headers needed to compile Python packages from source.
apt-get update -qq
apt-get install -qq -y --no-install-recommends \
    build-essential gcc libldap2-dev libsasl2-dev libssl-dev

# Fresh virtualenv at /venv with the configured Python version.
/usr/bin/uv venv --clear /venv --python "${PYTHON_VERSION}"
/usr/bin/uv pip install setuptools pip wheel

mkdir -p /tmp/archivebox-uv-project
/venv/bin/python - <<'PY'
from pathlib import Path
import json
import re
import urllib.request

source = Path("/app/pyproject.toml")
target = Path("/tmp/archivebox-uv-project/pyproject.toml")

text = source.read_text()
# Restrict uv's resolution environments to linux for the image build.
text = text.replace(
    'environments = ["sys_platform == \'darwin\'", "sys_platform == \'linux\'"]',
    'environments = ["sys_platform == \'linux\'"]',
)

# Docker builds need the just-published internal abx wheels immediately, but
# PyPI simple can lag the version JSON endpoints by tens of minutes. Generate a
# Docker-only dependency view from the version JSON so the published package
# metadata stays normal while image builds remain resumable after a release.
for package in ("abxpkg", "abx-plugins", "abx-dl"):
    match = re.search(rf'"{re.escape(package)}>=(?P<version>[^"]+)"', text)
    if not match:
        # Package is not declared with a ">=" requirement; leave it untouched.
        continue
    version = match.group("version")
    with urllib.request.urlopen(f"https://pypi.org/pypi/{package}/{version}/json", timeout=20) as response:
        data = json.load(response)
    # Pick the first wheel listed for that exact release. Fail with a readable
    # message instead of a bare StopIteration traceback when none is published.
    wheel_url = next(
        (url["url"] for url in data["urls"] if url["filename"].endswith(".whl")),
        None,
    )
    if wheel_url is None:
        raise SystemExit(f"[X] No wheel file listed on PyPI for {package}=={version}")
    # Rewrite the ">=" requirement into a direct reference to that wheel.
    text = re.sub(
        rf'"{re.escape(package)}>=[^"]+"',
        f'"{package} @ {wheel_url}"',
        text,
        count=1,
    )

target.write_text(text)
PY

# Install dependencies only (not the project itself) into /venv.
/usr/bin/uv sync \
    --project /tmp/archivebox-uv-project \
    --refresh \
    --no-dev \
    --inexact \
    --no-install-project \
    --no-install-workspace \
    --no-sources

# Best-effort: strip shared objects to shrink the venv; failures are ignored.
(find /venv/lib/python3.*/site-packages -type f -name '*.so' -exec strip --strip-unneeded {} + 2>/dev/null || true)
rm -f /venv/bin/uv /venv/bin/uvx

# Remove the toolchain and apt lists again.
apt-get purge -y build-essential gcc libldap2-dev libsasl2-dev libssl-dev
apt-get autoremove -y
rm -rf /var/lib/apt/lists/*
EOF
# Copy the build context into $CODE_DIR, owned by root with mode 755.
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"

# Install the ArchiveBox project itself (without dependencies) into /venv.
# COMMIT_HASH is read straight from the copied .git directory: either a
# detached 40-hex HEAD, or the ref HEAD points at (loose ref file first, then
# packed-refs). It is recorded in /VERSION.txt only when it is a 40-hex value.
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
    echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
    && COMMIT_HASH="$( \
        if [[ -f "$CODE_DIR/.git/HEAD" ]]; then \
            HEAD_REF="$(cat "$CODE_DIR/.git/HEAD")"; \
            if [[ "$HEAD_REF" =~ ^[0-9a-fA-F]{40}$ ]]; then \
                echo "$HEAD_REF"; \
            elif [[ "$HEAD_REF" == ref:\ * ]]; then \
                REF_PATH="${HEAD_REF#ref: }"; \
                cat "$CODE_DIR/.git/$REF_PATH" 2>/dev/null || awk -v ref="$REF_PATH" '$2 == ref {print $1}' "$CODE_DIR/.git/packed-refs" 2>/dev/null || true; \
            fi; \
        fi)" \
    && if [[ "$COMMIT_HASH" =~ ^[0-9a-fA-F]{40}$ ]]; then echo "COMMIT_HASH=$COMMIT_HASH" | tee -a /VERSION.txt; fi \
    && /usr/bin/uv pip install --no-deps "$CODE_DIR" \
    && rm -f /venv/bin/uv /venv/bin/uvx \
    && /usr/bin/uv pip show archivebox | tee -a /VERSION.txt
# Final image: starts again from the shared base and receives only the
# artifacts copied in below.
FROM archivebox-runtime-base

LABEL name="archivebox" \
      maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
      description="All-in-one self-hosted internet archiving solution" \
      homepage="https://github.com/ArchiveBox/ArchiveBox" \
      documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \
      org.opencontainers.image.title="ArchiveBox" \
      org.opencontainers.image.vendor="ArchiveBox" \
      org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \
      org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \
      com.docker.image.source.entrypoint="Dockerfile"

# sonic binary from the sonic stage, plus its config from the build context.
COPY --from=sonic /usr/local/bin/sonic /usr/local/bin/sonic
COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg

# Python installation, virtualenv, application code and version manifest
# produced by the builder stage.
COPY --from=archivebox-builder /opt/uv/python /opt/uv/python
COPY --from=archivebox-builder /venv /venv
COPY --from=archivebox-builder /app /app
COPY --from=archivebox-builder /VERSION.txt /VERSION.txt
# Expose the venv entry points on the default PATH, create/normalise the
# $ARCHIVEBOX_USER account (uid/gid forced to the DEFAULT_ARCHIVEBOX_* values),
# create the directories it owns, and generate /etc/machine-id.
#
# Each "check || create" pair is wrapped in { ...; }. In shell, && and || have
# equal precedence and associate left to right, so in an ungrouped chain a
# failure of ANY earlier step would fall through to the next || branch and be
# masked instead of failing the build.
RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_ARCHIVEBOX_UID}..." \
    && printf 'export PATH="/venv/bin:/opt/node/bin:$PATH"\n' > /etc/profile.d/archivebox-path.sh \
    && ln -sf /venv/bin/archivebox /usr/local/bin/archivebox \
    && ln -sf /venv/bin/daphne /usr/local/bin/daphne \
    && ln -sf /venv/bin/supervisord /usr/local/bin/supervisord \
    && ln -sf /venv/bin/supervisorctl /usr/local/bin/supervisorctl \
    && { getent group "$ARCHIVEBOX_USER" >/dev/null || groupadd --system "$ARCHIVEBOX_USER"; } \
    && { id -u "$ARCHIVEBOX_USER" >/dev/null 2>&1 || useradd --system --create-home --gid "$ARCHIVEBOX_USER" --groups audio,video "$ARCHIVEBOX_USER"; } \
    && usermod --append --groups audio,video "$ARCHIVEBOX_USER" \
    && { [[ "$(id -u "$ARCHIVEBOX_USER")" == "$DEFAULT_ARCHIVEBOX_UID" ]] || usermod -u "$DEFAULT_ARCHIVEBOX_UID" "$ARCHIVEBOX_USER"; } \
    && { [[ "$(id -g "$ARCHIVEBOX_USER")" == "$DEFAULT_ARCHIVEBOX_GID" ]] || groupmod -g "$DEFAULT_ARCHIVEBOX_GID" "$ARCHIVEBOX_USER"; } \
    && abxpkg load --binproviders=env sonic | tee -a /VERSION.txt \
    && install -d -o "$DEFAULT_ARCHIVEBOX_UID" -g "$DEFAULT_ARCHIVEBOX_GID" "$DATA_DIR" "$TMP_DIR" "$CONFIG_DIR" "$ABXPKG_LIB_DIR" "$XDG_CACHE_HOME" "$PLAYWRIGHT_BROWSERS_PATH" \
    && install -d -o "$DEFAULT_ARCHIVEBOX_UID" -g "$DEFAULT_ARCHIVEBOX_GID" "/home/$ARCHIVEBOX_USER" \
    && chown "$DEFAULT_ARCHIVEBOX_UID:$DEFAULT_ARCHIVEBOX_GID" "$DATA_DIR" "$TMP_DIR" \
    && chown -R "$DEFAULT_ARCHIVEBOX_UID:$DEFAULT_ARCHIVEBOX_GID" "$ABXPKG_LIB_DIR" \
    && openssl rand -hex 16 > /etc/machine-id \
    && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER ARCHIVEBOX_UID=$(id -u "$ARCHIVEBOX_USER") ARCHIVEBOX_GID=$(id -g "$ARCHIVEBOX_USER")" | tee -a /VERSION.txt \
    && echo -e "TMP_DIR=$TMP_DIR\nABXPKG_LIB_DIR=$ABXPKG_LIB_DIR\nPLAYWRIGHT_BROWSERS_PATH=$PLAYWRIGHT_BROWSERS_PATH\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt
WORKDIR "$DATA_DIR"

# Empty $DATA_DIR, run `archivebox init` with HOME pointed at $TMP_DIR, hand
# the listed top-level paths to the archivebox uid/gid (non-recursive; chown
# errors for paths that do not exist are deliberately ignored), then empty
# $TMP_DIR again.
RUN echo "[+] Initializing image collection..." \
    && find "$DATA_DIR" -mindepth 1 -maxdepth 1 -exec rm -rf {} + \
    && HOME="$TMP_DIR" archivebox init \
    && (chown "$DEFAULT_ARCHIVEBOX_UID:$DEFAULT_ARCHIVEBOX_GID" \
            "$DATA_DIR" "$DATA_DIR"/.archivebox_id "$DATA_DIR"/ArchiveBox.conf "$DATA_DIR"/index.sqlite3 \
            "$DATA_DIR"/logs "$DATA_DIR"/logs/* "$DATA_DIR"/sources \
            "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas \
            "$DATA_DIR"/tmp "$DATA_DIR"/tmp/* \
            "$CONFIG_DIR" "$CONFIG_DIR"/config.env "$CONFIG_DIR"/derived.env \
            "$TMP_DIR" "$ABXPKG_LIB_DIR" "$XDG_CACHE_HOME" "$PLAYWRIGHT_BROWSERS_PATH" \
            2>/dev/null || true) \
    && find "$TMP_DIR" -mindepth 1 -maxdepth 1 -exec rm -rf {} +

# Make the helper scripts executable and the shared directories group-writable.
RUN chmod +x "$CODE_DIR"/bin/*.sh \
    && chmod g+w "$TMP_DIR" "$ABXPKG_LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH"
# Sanity-check the runtime image, then install plugin dependencies as the
# unprivileged user:
#   1. fail the build if gcc, g++ or make is resolvable via `abxpkg load`;
#   2. print ownership/mode of the config and lib dirs and verify that
#      $ARCHIVEBOX_USER can write to $CONFIG_DIR and $ABXPKG_LIB_DIR;
#   3. generate `export <enabled_key>=True` lines for the plugins reported by
#      discover_plugins(runtime="archivebox"), record them in /VERSION.txt and
#      source them into this shell;
#   4. run `archivebox install` and `archivebox version` as $ARCHIVEBOX_USER.
#
# The forbidden-tool check uses `if ...; then exit 1; fi` in the main shell.
# An `exit` inside a ( ... ) subshell only ends that subshell, and because the
# loop sits on the left of `&&` errexit is suppressed inside it, so only the
# last iteration's status would otherwise decide whether the build continues.
RUN --mount=type=cache,target=/tmp/abxpkg-cache,sharing=locked,mode=1777 \
    for forbidden_bin in gcc g++ make; do \
        if abxpkg load --binproviders=env "$forbidden_bin" >/dev/null 2>&1; then \
            echo "Unexpected build tool in runtime: $forbidden_bin" >&2; \
            exit 1; \
        fi; \
    done \
    && stat -c "%U:%G %a %n" "$CONFIG_DIR" "$ABXPKG_LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH" \
    && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups test -w "$CONFIG_DIR" \
    && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups test -w "$ABXPKG_LIB_DIR" \
    && python3 -c 'from abx_dl.models import discover_plugins; [print(f"export {plugin.enabled_key}=True") for plugin in discover_plugins(runtime="archivebox").values() if plugin.enabled_key in plugin.config.properties]' > /tmp/archivebox-enable-plugins.env \
    && sort /tmp/archivebox-enable-plugins.env | tee -a /VERSION.txt \
    && source /tmp/archivebox-enable-plugins.env \
    && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups env HOME=/tmp/abxpkg-cache ABXPKG_NO_CACHE=True ABXPKG_TMP_CACHE_DIR=/tmp/abxpkg-cache archivebox install \
    && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups env HOME=/tmp/abxpkg-cache archivebox version 2>&1 | tee -a /VERSION.txt \
    && rm -rf /root/.cache /var/cache/apt/* /var/lib/apt/lists/*
# Append the build-finished banner (platform and end time) to /VERSION.txt.
RUN ( \
        echo -e "\n\n[√] Finished ArchiveBox multistage Docker build successfully." \
        && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) (${TARGETARCH} ${TARGETVARIANT})" \
        && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \
    ) | tee -a /VERSION.txt

# Runtime contract: data volume (declared after the build has populated it),
# listening port, health probe, init wrapper and default command.
WORKDIR "$DATA_DIR"
VOLUME "$DATA_DIR"
EXPOSE 8000

# Healthy when /health/ answers within curl's 5s limit and the body contains "OK".
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
    CMD curl --fail --silent --show-error --max-time 5 --connect-timeout 2 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK'

ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--init", "0.0.0.0:8000"]