# File: ArchiveBox/docker-compose.yml (YAML, 273 lines, 15 KiB; last modified 2026-06-10 09:13:02 -07:00)

# Usage:
# mkdir -p ~/archivebox/data && cd ~/archivebox
# curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
# docker compose run archivebox init
# docker compose up -d && open 'http://admin.archivebox.localhost:8000'
# docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
# docker compose run -T archivebox add < ~/Downloads/bookmarks.txt
# docker compose run archivebox help
# Documentation:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
services:
# Main ArchiveBox container: publishes port 8000 and bind-mounts ./data at /data.
archivebox:
  image: ${ARCHIVEBOX_IMAGE:-archivebox/archivebox:dev}
  ports:
    - "8000:8000"  # HOST:CONTAINER, quoted so YAML always reads the mapping as a string
  volumes:
    - ./data:/data
  environment:
    # - ADMIN_USERNAME=admin            # creates an admin user on first run with the given user/pass combo
    # - ADMIN_PASSWORD=SomeSecretPassword
    - BASE_URL=${BASE_URL:-http://archivebox.localhost:8000}  # public URL used to build admin/web/api/snapshot links
    - SERVER_SECURITY_MODE=${SERVER_SECURITY_MODE:-safe-subdomains-fullreplay}  # safe-onedomain-nojsreplay if you can't do wildcard DNS *.your.domain
    - PUBLIC_ADD_VIEW=False  # set to True to allow anonymous users to submit new URLs to archive
    # For all other options, it's better to use data/ArchiveBox.conf or the new Personas config feature in the admin UI...
    # - TIMEOUT=60
    # - CHECK_SSL_VALIDITY=False
    # - USER_AGENT="..."
    # ...
    # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration
  shm_size: "1gb"  # Chrome runs more efficiently when using a reasonably sized shared memory pool
####################################################################################################################
######## Optional Addons: tweak examples below as needed for your specific use case ########
### `archivebox server` now runs the orchestrator itself, so scheduled crawls and queued UI/API jobs
# are processed by the main container without needing a separate scheduler sidecar. To add a new job:
# $ docker compose run archivebox schedule --add --every=day --depth=1 'https://example.com/some/rss/feed.xml'
# the running server orchestrator will pick it up automatically at the next due time.
# https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving
### ArchiveBox now starts and uses Sonic automatically when SEARCH_BACKEND_ENGINE=sonic.
# If Sonic is ever started after not running for a while, update its full-text index by running:
# $ docker compose run archivebox update --index-only
# https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search
### This optional container runs xvfb+noVNC so you can watch the ArchiveBox browser as it archives things,
# or remote control it to set up a chrome profile w/ login credentials for sites you want to archive.
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#docker-vnc-setup
# novnc:
#   image: theasp/novnc:latest
#   profiles:
#     - novnc
#   environment:
#     - DISPLAY_WIDTH=1920
#     - DISPLAY_HEIGHT=1080
#     - RUN_XTERM=no
#   ports:
#     # to view/control ArchiveBox's browser, visit: http://127.0.0.1:8080/vnc.html
#     # restricted to access from localhost by default because it has no authentication
#     - 127.0.0.1:8080:8080
### TLS / HTTPS ingress (opt-in, everything below is driven by env vars only).
#
# ArchiveBox serves the admin/web/api control plane AND every archived
# snapshot on its own subdomain for security isolation, so a public deployment
# needs wildcard DNS + TLS for *.your.domain. Pick ONE of the two ingress options
# below by activating its profile (e.g. put COMPOSE_PROFILES=https or =tunnel in a
# .env file next to this one, then `docker compose up -d`). Both want:
# BASE_URL=https://archive.example.com
# SERVER_SECURITY_MODE=safe-subdomains-fullreplay
### Option A — Cloudflare Tunnel (no public IP / behind NAT, e.g. home/NAS).
# Cloudflare's edge terminates TLS and resolves *.your.domain to a SINGLE tunnel;
# every snapshot/control subdomain rides one connection to archivebox:8000, which
# routes by Host header — so the tunnel itself needs no wildcard cert or per-host
# config. ZERO manual setup: the one-shot tunnel-init below uses your
# CLOUDFLARE_API_KEY (give it Account:Cloudflare Tunnel:Edit + Zone:DNS:Edit
# + Zone:Read) to create/reuse the tunnel, point *.your.domain and your.domain at
# it, and write its connector token — then cloudflared just runs it.
# One-shot provisioner for the `tunnel` profile: creates/reuses the Cloudflare Tunnel,
# points DOMAIN and *.DOMAIN at it, and writes the connector token to TUNNEL_TOKEN_OUT.
tunnel-init:
  image: python:3-alpine  # tiny stdlib-only provisioner; runs as root so it can chown the token
  profiles: ["tunnel"]
  restart: "no"
  environment:
    - BASE_URL=${BASE_URL:-https://archive.example.com}
    - CLOUDFLARE_API_KEY=${CLOUDFLARE_API_KEY:-}  # a Cloudflare API *Token* (used as a Bearer token), NOT the legacy global API key
    - CLOUDFLARE_ACCOUNT_ID=${CLOUDFLARE_ACCOUNT_ID:-}  # optional; first account used if unset
    - TUNNEL_SERVICE=http://archivebox:8000
    - TUNNEL_TOKEN_OUT=/shared/token
  volumes:
    - ./data/proxy/tunnel:/shared
  entrypoint:
    - python3
    - -c
    - |
      import os, sys, json, base64, secrets, urllib.request, urllib.error
      API = "https://api.cloudflare.com/client/v4"
      TOKEN = os.environ["CLOUDFLARE_API_KEY"]
      # bare hostname of BASE_URL: scheme, path and port are stripped
      DOMAIN = os.environ.get("BASE_URL", "").split("://")[-1].split("/")[0].split(":")[0]
      SERVICE = os.environ.get("TUNNEL_SERVICE", "http://archivebox:8000")
      OUT = os.environ.get("TUNNEL_TOKEN_OUT", "/shared/token")
      H = {"Authorization": "Bearer " + TOKEN, "Content-Type": "application/json"}
      def call(method, path, data=None):
          """Send one Cloudflare API request and return the decoded JSON reply (HTTP error replies included)."""
          body = json.dumps(data).encode() if data is not None else None
          req = urllib.request.Request(API + path, data=body, headers=H, method=method)
          try:
              with urllib.request.urlopen(req, timeout=30) as r: res = json.load(r)
          except urllib.error.HTTPError as e:
              raw = e.read()
              try: res = json.loads(raw)
              except ValueError: sys.exit(f"[tunnel-init] {method} {path}: HTTP {e.code} with non-JSON body: {raw[:200]!r}")
          # report API-level failures instead of swallowing them; the reply is still returned unchanged
          if isinstance(res, dict) and res.get("success") is False:
              print(f"[tunnel-init] warning: {method} {path} failed: {res.get('errors')}", file=sys.stderr)
          return res
      assert DOMAIN and TOKEN, "set BASE_URL (https://archive.example.com) + CLOUDFLARE_API_KEY"
      acct = os.environ.get("CLOUDFLARE_ACCOUNT_ID", "").strip() or call("GET", "/accounts")["result"][0]["id"]
      labels = DOMAIN.split("."); zone = None  # DOMAIN may be a subdomain; find its registrable zone
      for i in range(len(labels) - 1):
          res = call("GET", f"/zones?name={'.'.join(labels[i:])}")["result"]
          if res: zone = res[0]["id"]; break
      assert zone, f"no Cloudflare zone found for {DOMAIN}"
      # reuse the tunnel with this name if it already exists, otherwise create it
      NAME = "archivebox-" + DOMAIN.replace(".", "-")
      ts = call("GET", f"/accounts/{acct}/cfd_tunnel?name={NAME}&is_deleted=false")["result"]
      tid = ts[0]["id"] if ts else call("POST", f"/accounts/{acct}/cfd_tunnel", {"name": NAME,
          "tunnel_secret": base64.b64encode(secrets.token_bytes(32)).decode(), "config_src": "cloudflare"})["result"]["id"]
      target = f"{tid}.cfargotunnel.com"
      # ingress: wildcard + apex go to SERVICE, anything else gets a 404
      call("PUT", f"/accounts/{acct}/cfd_tunnel/{tid}/configurations", {"config": {"ingress": [
          {"hostname": f"*.{DOMAIN}", "service": SERVICE}, {"hostname": DOMAIN, "service": SERVICE},
          {"service": "http_status:404"}]}})
      for name in (DOMAIN, f"*.{DOMAIN}"):
          recs = call("GET", f"/zones/{zone}/dns_records?name={name}")["result"]
          cname = [r for r in recs if r["type"] == "CNAME"]
          # drop A/AAAA records and surplus CNAMEs so one CNAME per name points at the tunnel
          for r in [r for r in recs if r["type"] in ("A", "AAAA")] + cname[1:]:
              call("DELETE", f"/zones/{zone}/dns_records/{r['id']}")
          desired = {"type": "CNAME", "name": name, "content": target, "proxied": True, "ttl": 1}
          if cname:
              call("PUT", f"/zones/{zone}/dns_records/{cname[0]['id']}", desired)
          else:
              call("POST", f"/zones/{zone}/dns_records", desired)
      tok = call("GET", f"/accounts/{acct}/cfd_tunnel/{tid}/token")["result"]
      os.makedirs(os.path.dirname(OUT) or ".", exist_ok=True)
      # create the file as 0600 from the start so the token is never briefly world-readable
      fd = os.open(OUT, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
      with os.fdopen(fd, "w") as f: f.write(tok)
      os.chmod(OUT, 0o600)  # private: also tightens a pre-existing file; never world-readable on the host bind-mount
      try: os.chown(OUT, 65532, 65532)  # best-effort: own it by the cloudflared (uid 65532) connector that reads it
      except OSError as e: print(f"[tunnel-init] warning: could not chown {OUT} to uid 65532 ({e}); ensure the cloudflared container can read it")
      print(f"[tunnel-init] {NAME} ({tid}): *.{DOMAIN} + {DOMAIN} -> {SERVICE}; connector token -> {OUT}")
# Cloudflare Tunnel connector for the `tunnel` profile: runs the tunnel using the
# token file that tunnel-init writes into the shared ./data/proxy/tunnel directory.
cloudflared:
  image: cloudflare/cloudflared
  profiles: ["tunnel"]
  restart: unless-stopped
  depends_on:
    archivebox:
      condition: service_started
    tunnel-init:
      condition: service_completed_successfully  # wait until the provisioner has exited successfully
  command: tunnel --no-autoupdate --protocol http2 run --token-file /shared/token
  volumes:
    - ./data/proxy/tunnel:/shared:ro  # mounted read-only in this container
### Option B — Traefik reverse proxy + automatic wildcard TLS (you have a public IP).
# ONE container terminates TLS for the apex + every snapshot subdomain and proxies
# to archivebox:8000. Traefik is also an ACME client (it embeds go-acme/lego), so it
# fetches a single *.your.domain WILDCARD cert via DNS-01 and auto-renews it — no
# separate cert sidecar. All config is generated inline; no extra files.
#
# WILDCARD DNS — you must do this ONE manual step first (no proxy can do it for you):
# point a wildcard record at this server's public IP, e.g. at your DNS host add
# A *.archive.example.com -> <this server's public IP>
# A archive.example.com -> <this server's public IP>
# (AAAA too if you have IPv6). That's what makes snap-*.archive.example.com reach
# this box. Traefik then only needs the DNS *API* to solve the ACME DNS-01 challenge:
#
# set ARCHIVEBOX_ACME_DNS to your provider and put its credentials in a .env next to
# this file (passed straight through to Traefik/lego) — any of ~100 providers:
# cloudflare -> ARCHIVEBOX_ACME_DNS=cloudflare + CLOUDFLARE_DNS_API_TOKEN=...
# route53 -> ARCHIVEBOX_ACME_DNS=route53 + AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_REGION
# digitalocean -> ARCHIVEBOX_ACME_DNS=digitalocean + DO_AUTH_TOKEN
# ... full list + exact var names: https://doc.traefik.io/traefik/https/acme/#providers
# Leave ARCHIVEBOX_ACME_DNS unset to skip ACME — Traefik then serves its built-in
# self-signed cert (browser warning), handy for local/testing.
# Traefik reverse proxy for the `https` profile: listens on 80/443, redirects HTTP to
# HTTPS, and forwards every Host to archivebox:8000. The static config is built as CLI
# flags and the dynamic config is generated inline by the entrypoint script below.
traefik:
  image: traefik:v3
  profiles: ["https"]
  restart: unless-stopped
  depends_on: [archivebox]
  ports:
    - "80:80"
    - "443:443"
  environment:
    - BASE_URL=${BASE_URL:-https://archive.example.com}
    - ARCHIVEBOX_ACME_EMAIL=${ARCHIVEBOX_ACME_EMAIL:-admin@example.com}
    - ARCHIVEBOX_ACME_DNS=${ARCHIVEBOX_ACME_DNS:-}
  env_file:
    - path: .env  # passes your DNS provider's creds (CLOUDFLARE_DNS_API_TOKEN, AWS_*, DO_AUTH_TOKEN, ...) to Traefik
      required: false
  volumes:
    - ./data/proxy/traefik:/certs  # Traefik stores acme.json (the wildcard cert) here
  entrypoint:
    - sh
    - -c
    - |
      set -eu
      # NOTE: every "$$" below is Compose escaping for a literal "$" seen by the shell
      DOMAIN=$$(printf '%s' "$$BASE_URL" | sed -E 's#^[a-z]+://##; s#[:/].*##')
      # catch-all router -> archivebox (Host-routed); domain-free, so no docker socket needed
      mkdir -p /etc/traefik
      cat > /etc/traefik/dynamic.yml <<'EOF'
      http:
        routers:
          archivebox:
            rule: "HostRegexp(`^.+$$`)"
            service: archivebox
        services:
          archivebox:
            loadBalancer:
              servers:
                - url: "http://archivebox:8000"
      EOF
      set -- --entrypoints.web.address=:80 --entrypoints.websecure.address=:443 \
        --entrypoints.web.http.redirections.entrypoint.to=websecure \
        --entrypoints.web.http.redirections.entrypoint.scheme=https \
        --providers.file.filename=/etc/traefik/dynamic.yml
      if [ -n "$${ARCHIVEBOX_ACME_DNS:-}" ]; then
        echo "[traefik] wildcard cert for *.$$DOMAIN via $$ARCHIVEBOX_ACME_DNS DNS-01"
        # whole arguments are quoted so the shell never globs "[0]" or "*"
        set -- "$$@" "--entrypoints.websecure.http.tls.certresolver=le" \
          "--entrypoints.websecure.http.tls.domains[0].main=$$DOMAIN" \
          "--entrypoints.websecure.http.tls.domains[0].sans=*.$$DOMAIN" \
          "--certificatesresolvers.le.acme.email=$$ARCHIVEBOX_ACME_EMAIL" \
          "--certificatesresolvers.le.acme.storage=/certs/acme.json" \
          "--certificatesresolvers.le.acme.dnschallenge=true" \
          "--certificatesresolvers.le.acme.dnschallenge.provider=$$ARCHIVEBOX_ACME_DNS"
      else
        echo "[traefik] no ARCHIVEBOX_ACME_DNS set -> serving Traefik's default self-signed cert (set a DNS provider for real wildcard TLS)"
        # TLS must still be enabled on websecure, otherwise the catch-all router only
        # matches plain HTTP and HTTPS requests on :443 never reach archivebox
        set -- "$$@" --entrypoints.websecure.http.tls=true
      fi
      exec traefik "$$@"
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks.
# You can also use any other VPN that works at the docker/IP level, e.g. Tailscale, OpenVPN, etc.
# wireguard:
#   image: linuxserver/wireguard:latest
#   network_mode: 'service:archivebox'
#   cap_add:
#     - NET_ADMIN
#     - SYS_MODULE
#   sysctls:
#     - net.ipv4.conf.all.rp_filter=2
#     - net.ipv4.conf.all.src_valid_mark=1
#   volumes:
#     - /lib/modules:/lib/modules
#     - ./wireguard.conf:/config/wg0.conf:ro
### Example: Run ChangeDetection.io to watch for changes to websites, then trigger ArchiveBox to archive them
# Documentation: https://github.com/dgtlmoon/changedetection.io
# More info: https://github.com/dgtlmoon/changedetection.io/blob/master/docker-compose.yml
# changedetection:
#   image: ghcr.io/dgtlmoon/changedetection.io
#   volumes:
#     - ./data-changedetection:/datastore
# HOW TO: Set up cloud storage for your ./data/archive (e.g. Amazon S3, Backblaze B2, Google Drive, OneDrive, SFTP, etc.)
# https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage
#
# Follow the steps here to set up the Docker RClone Plugin https://rclone.org/docker/
# $ docker plugin install rclone/docker-volume-rclone:amd64 --grant-all-permissions --alias rclone
# $ nano /var/lib/docker-plugins/rclone/config/rclone.conf
# [examplegdrive]
# type = drive
# scope = drive
# drive_id = 1234567...
# root_folder_id = 0Abcd...
# token = {"access_token":...}
# volumes:
#   archive:
#     driver: rclone
#     driver_opts:
#       remote: 'examplegdrive:archivebox'
#       allow_other: 'true'
#       vfs_cache_mode: full
#       poll_interval: 0