# ArchiveBox/docker-compose.yml

# Usage:
#     mkdir -p ~/archivebox/data && cd ~/archivebox
#     curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
#     docker compose run archivebox init
#     docker compose up -d && open 'http://admin.archivebox.localhost:8000'

#     docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
#     docker compose run -T archivebox add < ~/Downloads/bookmarks.txt
#     docker compose run archivebox help
# Documentation:
#     https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose

services:
    # Main ArchiveBox container: publishes port 8000 and keeps all state in the ./data bind mount.
    archivebox:
        image: "${ARCHIVEBOX_IMAGE:-archivebox/archivebox:dev}"
        # Chrome runs more efficiently when using a reasonably sized shared memory pool
        shm_size: '1gb'
        volumes:
            - "./data:/data"
        ports:
            - "8000:8000"
        environment:
            # Uncomment both lines to create an admin user on first run with the given user/pass combo:
            # - ADMIN_USERNAME=admin
            # - ADMIN_PASSWORD=SomeSecretPassword

            # public URL used to build admin/web/api/snapshot links
            - "BASE_URL=${BASE_URL:-http://archivebox.localhost:8000}"
            # use safe-onedomain-nojsreplay if you can't do wildcard DNS *.your.domain
            - "SERVER_SECURITY_MODE=${SERVER_SECURITY_MODE:-safe-subdomains-fullreplay}"
            # set to True to allow anonymous users to submit new URLs to archive
            - "PUBLIC_ADD_VIEW=False"

            # For all other options, it's better to use data/ArchiveBox.conf or the new Personas config feature in the admin UI...
            # - TIMEOUT=60
            # - CHECK_SSL_VALIDITY=False
            # - USER_AGENT="..."
            # ...
            # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration


####################################################################################################################
    ######## Optional Addons: tweak examples below as needed for your specific use case ########

    ### `archivebox server` now runs the orchestrator itself, so scheduled crawls and queued UI/API jobs
    # are processed by the main container without needing a separate scheduler sidecar. To add a new job:
    #   $ docker compose run archivebox schedule --add --every=day --depth=1 'https://example.com/some/rss/feed.xml'
    # The running server's orchestrator picks the new job up automatically at its next due time.
    # https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving


    ### ArchiveBox now starts and uses Sonic automatically when SEARCH_BACKEND_ENGINE=sonic.
    # If Sonic is ever started after not running for a while, update its full-text index by running:
    #   $ docker compose run archivebox update --index-only
    # https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search


    ### This optional container runs xvfb+noVNC so you can watch the ArchiveBox browser as it archives things,
    # or remote control it to set up a chrome profile w/ login credentials for sites you want to archive.
    # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
    # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#docker-vnc-setup

    # novnc:
    #     image: theasp/novnc:latest
    #     profiles:
    #         - novnc
    #     environment:
    #         - DISPLAY_WIDTH=1920
    #         - DISPLAY_HEIGHT=1080
    #         - RUN_XTERM=no
    #     ports:
    #         # to view/control ArchiveBox's browser, visit: http://127.0.0.1:8080/vnc.html
    #         # restricted to access from localhost by default because it has no authentication
    #         - 127.0.0.1:8080:8080


    ### TLS / HTTPS ingress (opt-in, everything below is driven by env vars only).
    #
    # ArchiveBox serves the admin/web/api control plane AND every archived
    # snapshot on its own subdomain for security isolation, so a public deployment
    # needs wildcard DNS + TLS for *.your.domain. Pick ONE of the two ingress options
    # below by activating its profile (e.g. put COMPOSE_PROFILES=https or =tunnel in a
    # .env file next to this one, then `docker compose up -d`). Both want:
    #     BASE_URL=https://archive.example.com
    #     SERVER_SECURITY_MODE=safe-subdomains-fullreplay

    ### Option A — Cloudflare Tunnel (no public IP / behind NAT, e.g. home/NAS).
    # Cloudflare's edge terminates TLS and resolves *.your.domain to a SINGLE tunnel;
    # every snapshot/control subdomain rides one connection to archivebox:8000, which
    # routes by Host header — so the tunnel itself needs no wildcard cert or per-host
    # config. ZERO manual setup: the one-shot tunnel-init below uses your
    # CLOUDFLARE_API_KEY (give it Account:Cloudflare Tunnel:Edit + Zone:DNS:Edit
    # + Zone:Read) to create/reuse the tunnel, point *.your.domain and your.domain at
    # it, and write its connector token — then cloudflared just runs it.
    # One-shot provisioner (profile "tunnel"): creates or reuses a Cloudflare Tunnel named after
    # BASE_URL's host, routes the apex + wildcard hostnames to TUNNEL_SERVICE, points both DNS
    # names at the tunnel, and writes the connector token to TUNNEL_TOKEN_OUT for `cloudflared`.
    # It exits non-zero on ANY Cloudflare API failure so that `cloudflared` (which waits for
    # service_completed_successfully) never starts against a half-configured tunnel.
    tunnel-init:
        image: python:3-alpine   # tiny stdlib-only provisioner; runs as root so it can chown the token
        profiles: ["tunnel"]
        restart: "no"
        environment:
            - BASE_URL=${BASE_URL:-https://archive.example.com}
            - CLOUDFLARE_API_KEY=${CLOUDFLARE_API_KEY:-}   # a Cloudflare API *Token* (used as a Bearer token), NOT the legacy global API key
            - CLOUDFLARE_ACCOUNT_ID=${CLOUDFLARE_ACCOUNT_ID:-}   # optional; first account used if unset
            - TUNNEL_SERVICE=http://archivebox:8000
            - TUNNEL_TOKEN_OUT=/shared/token
        volumes:
            - ./data/proxy/tunnel:/shared
        entrypoint:
            - python3
            - -c
            - |
              import os, sys, json, base64, secrets, urllib.request, urllib.error
              API = "https://api.cloudflare.com/client/v4"
              TOKEN = os.environ.get("CLOUDFLARE_API_KEY", "")
              # host part of BASE_URL: strip scheme, path, and port
              DOMAIN = os.environ.get("BASE_URL", "").split("://")[-1].split("/")[0].split(":")[0]
              SERVICE = os.environ.get("TUNNEL_SERVICE", "http://archivebox:8000")
              OUT = os.environ.get("TUNNEL_TOKEN_OUT", "/shared/token")
              H = {"Authorization": "Bearer " + TOKEN, "Content-Type": "application/json"}
              def die(msg):
                  # a string passed to sys.exit is printed to stderr and the exit status is 1
                  sys.exit(f"[tunnel-init] error: {msg}")
              def call(method, path, data=None):
                  """Send one Cloudflare API request and return the decoded JSON envelope.

                  Aborts the whole script (via die) on a network error, an HTTP error status,
                  a non-JSON body, or an envelope with "success": false, so that failed writes
                  are never mistaken for success. Response bodies are never echoed, because
                  one endpoint returns the connector token.
                  """
                  body = json.dumps(data).encode() if data is not None else None
                  req = urllib.request.Request(API + path, data=body, headers=H, method=method)
                  status = None
                  try:
                      with urllib.request.urlopen(req, timeout=30) as r: raw = r.read()
                  except urllib.error.HTTPError as e: raw, status = e.read(), e.code
                  except urllib.error.URLError as e: die(f"{method} {path}: {e.reason}")
                  try: resp = json.loads(raw)
                  except ValueError: resp = None
                  if status is not None or not isinstance(resp, dict) or resp.get("success") is False:
                      errs = resp.get("errors") if isinstance(resp, dict) else None
                      detail = errs or (f"HTTP {status}" if status is not None else "unparseable response")
                      die(f"{method} {path} failed: {detail}")
                  return resp
              if not (DOMAIN and TOKEN):
                  die("set BASE_URL (https://archive.example.com) + CLOUDFLARE_API_KEY")
              acct = os.environ.get("CLOUDFLARE_ACCOUNT_ID", "").strip()
              if not acct:
                  accts = call("GET", "/accounts")["result"] or []
                  if not accts: die("the API token cannot list any account; set CLOUDFLARE_ACCOUNT_ID")
                  acct = accts[0]["id"]
              labels = DOMAIN.split("."); zone = None   # DOMAIN may be a subdomain; find its registrable zone
              for i in range(len(labels) - 1):   # try the longest suffix first, never the bare TLD
                  res = call("GET", f"/zones?name={'.'.join(labels[i:])}")["result"]
                  if res: zone = res[0]["id"]; break
              if not zone: die(f"no Cloudflare zone found for {DOMAIN}")
              NAME = "archivebox-" + DOMAIN.replace(".", "-")
              # reuse the tunnel from a previous run if one with this name still exists
              ts = call("GET", f"/accounts/{acct}/cfd_tunnel?name={NAME}&is_deleted=false")["result"]
              tid = ts[0]["id"] if ts else call("POST", f"/accounts/{acct}/cfd_tunnel", {"name": NAME,
                      "tunnel_secret": base64.b64encode(secrets.token_bytes(32)).decode(), "config_src": "cloudflare"})["result"]["id"]
              target = f"{tid}.cfargotunnel.com"
              # wildcard + apex go to SERVICE; anything else gets a 404 from the tunnel itself
              call("PUT", f"/accounts/{acct}/cfd_tunnel/{tid}/configurations", {"config": {"ingress": [
                  {"hostname": f"*.{DOMAIN}", "service": SERVICE}, {"hostname": DOMAIN, "service": SERVICE},
                  {"service": "http_status:404"}]}})
              for name in (DOMAIN, f"*.{DOMAIN}"):
                  recs = call("GET", f"/zones/{zone}/dns_records?name={name}")["result"] or []
                  cname = [r for r in recs if r["type"] == "CNAME"]
                  # drop A/AAAA records and surplus CNAMEs at this name, keeping at most one CNAME to update
                  for r in [r for r in recs if r["type"] in ("A", "AAAA")] + cname[1:]:
                      call("DELETE", f"/zones/{zone}/dns_records/{r['id']}")
                  desired = {"type": "CNAME", "name": name, "content": target, "proxied": True, "ttl": 1}
                  if cname: call("PUT", f"/zones/{zone}/dns_records/{cname[0]['id']}", desired)
                  else: call("POST", f"/zones/{zone}/dns_records", desired)
              tok = call("GET", f"/accounts/{acct}/cfd_tunnel/{tid}/token")["result"]
              os.makedirs(os.path.dirname(OUT) or ".", exist_ok=True)
              # private: create the file as 0600 so it is never world-readable on the host bind-mount,
              # and tighten a pre-existing file BEFORE the secret is written into it
              fd = os.open(OUT, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
              with os.fdopen(fd, "w") as f:
                  os.fchmod(fd, 0o600)
                  f.write(tok)
              try: os.chown(OUT, 65532, 65532)  # best-effort: own it by the cloudflared (uid 65532) connector that reads it
              except OSError as e: print(f"[tunnel-init] warning: could not chown {OUT} to uid 65532 ({e}); ensure the cloudflared container can read it")
              print(f"[tunnel-init] {NAME} ({tid}): *.{DOMAIN} + {DOMAIN} -> {SERVICE}; connector token -> {OUT}")

    # Long-running Cloudflare Tunnel connector (profile "tunnel"). It starts only after
    # tunnel-init has exited successfully, and reads the connector token that tunnel-init
    # wrote into the shared bind mount (mounted read-only here).
    cloudflared:
        image: "cloudflare/cloudflared"
        profiles:
            - "tunnel"
        restart: "unless-stopped"
        volumes:
            - "./data/proxy/tunnel:/shared:ro"
        command:
            - "tunnel"
            - "--no-autoupdate"
            - "--protocol"
            - "http2"
            - "run"
            - "--token-file"
            - "/shared/token"
        depends_on:
            tunnel-init:
                condition: service_completed_successfully
            archivebox:
                condition: service_started

    ### Option B — Traefik reverse proxy + automatic wildcard TLS (you have a public IP).
    # ONE container terminates TLS for the apex + every snapshot subdomain and proxies
    # to archivebox:8000. Traefik is also an ACME client (it embeds go-acme/lego), so it
    # fetches a single *.your.domain WILDCARD cert via DNS-01 and auto-renews it — no
    # separate cert sidecar. All config is generated inline; no extra files.
    #
    # WILDCARD DNS — you must do this ONE manual step first (no proxy can do it for you):
    #   point a wildcard record at this server's public IP, e.g. at your DNS host add
    #       A   *.archive.example.com   ->  <this server's public IP>
    #       A   archive.example.com     ->  <this server's public IP>
    #   (AAAA too if you have IPv6). That's what makes snap-*.archive.example.com reach
    #   this box. Traefik then only needs the DNS *API* to solve the ACME DNS-01 challenge:
    #
    #   set ARCHIVEBOX_ACME_DNS to your provider and put its credentials in a .env next to
    #   this file (passed straight through to Traefik/lego) — any of ~100 providers:
    #     cloudflare   -> ARCHIVEBOX_ACME_DNS=cloudflare   + CLOUDFLARE_DNS_API_TOKEN=...
    #     route53      -> ARCHIVEBOX_ACME_DNS=route53      + AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_REGION
    #     digitalocean -> ARCHIVEBOX_ACME_DNS=digitalocean + DO_AUTH_TOKEN
    #     ... full list + exact var names: https://doc.traefik.io/traefik/https/acme/#providers
    #   Leave ARCHIVEBOX_ACME_DNS unset to skip ACME — Traefik then serves its built-in
    #   self-signed cert (browser warning), handy for local/testing.
    # Reverse proxy (profile "https"): listens on 80/443, redirects HTTP to HTTPS, and forwards
    # every Host to archivebox:8000. With ARCHIVEBOX_ACME_DNS set it requests one wildcard
    # certificate via DNS-01; without it, TLS is still enabled on :443 using Traefik's
    # default self-signed certificate.
    traefik:
        image: traefik:v3
        profiles: ["https"]
        restart: unless-stopped
        depends_on: [archivebox]
        ports:
            - "80:80"
            - "443:443"
        environment:
            - BASE_URL=${BASE_URL:-https://archive.example.com}
            # NOTE(review): set a real address before enabling ACME; CAs are likely to reject an example.com contact — confirm
            - ARCHIVEBOX_ACME_EMAIL=${ARCHIVEBOX_ACME_EMAIL:-admin@example.com}
            - ARCHIVEBOX_ACME_DNS=${ARCHIVEBOX_ACME_DNS:-}
        env_file:
            - path: .env       # passes your DNS provider's creds (CLOUDFLARE_DNS_API_TOKEN, AWS_*, DO_AUTH_TOKEN, ...) to Traefik
              required: false
        volumes:
            - ./data/proxy/traefik:/certs   # Traefik stores acme.json (the wildcard cert) here
        # NOTE: every shell `$` below is written `$$` so Compose passes it through instead of interpolating it.
        entrypoint:
            - sh
            - -c
            - |
              set -eu
              # host part of BASE_URL: strip the scheme, then everything from the first ':' or '/'
              DOMAIN=$$(printf '%s' "$$BASE_URL" | sed -E 's#^[a-z]+://##; s#[:/].*##')
              # make sure the config dir exists before writing into it (a missing dir would abort under set -e)
              mkdir -p /etc/traefik
              # catch-all router -> archivebox (Host-routed); domain-free, so no docker socket needed
              printf 'http:\n  routers:\n    archivebox:\n      rule: "HostRegexp(`^.+$$`)"\n      service: archivebox\n  services:\n    archivebox:\n      loadBalancer:\n        servers:\n          - url: "http://archivebox:8000"\n' > /etc/traefik/dynamic.yml
              # base flags: two entrypoints, permanent web -> websecure redirect, file provider for the router above
              set -- --entrypoints.web.address=:80 --entrypoints.websecure.address=:443 \
                --entrypoints.web.http.redirections.entrypoint.to=websecure \
                --entrypoints.web.http.redirections.entrypoint.scheme=https \
                --providers.file.filename=/etc/traefik/dynamic.yml
              if [ -n "$${ARCHIVEBOX_ACME_DNS:-}" ]; then
                echo "[traefik] wildcard cert for *.$$DOMAIN via $$ARCHIVEBOX_ACME_DNS DNS-01"
                # setting a certresolver on the entrypoint also turns TLS on for every router attached to it
                set -- "$$@" --entrypoints.websecure.http.tls.certresolver=le \
                  --entrypoints.websecure.http.tls.domains[0].main="$$DOMAIN" \
                  --entrypoints.websecure.http.tls.domains[0].sans="*.$$DOMAIN" \
                  --certificatesresolvers.le.acme.email="$$ARCHIVEBOX_ACME_EMAIL" \
                  --certificatesresolvers.le.acme.storage=/certs/acme.json \
                  --certificatesresolvers.le.acme.dnschallenge=true \
                  --certificatesresolvers.le.acme.dnschallenge.provider="$$ARCHIVEBOX_ACME_DNS"
              else
                echo "[traefik] no ARCHIVEBOX_ACME_DNS set -> serving Traefik's default self-signed cert (set a DNS provider for real wildcard TLS)"
                # the catch-all router has no tls section of its own, so TLS must be enabled on the
                # entrypoint; otherwise the router only matches plain HTTP and HTTPS on :443 gets no route
                set -- "$$@" --entrypoints.websecure.http.tls=true
              fi
              exec traefik "$$@"

    ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks.
    # You can also use any other VPN that works at the docker/IP level, e.g. Tailscale, OpenVPN, etc.

    # wireguard:
    #   image: linuxserver/wireguard:latest
    #   network_mode: 'service:archivebox'
    #   cap_add:
    #     - NET_ADMIN
    #     - SYS_MODULE
    #   sysctls:
    #     - net.ipv4.conf.all.rp_filter=2
    #     - net.ipv4.conf.all.src_valid_mark=1
    #   volumes:
    #     - /lib/modules:/lib/modules
    #     - ./wireguard.conf:/config/wg0.conf:ro

    ### Example: Run ChangeDetection.io to watch for changes to websites, then trigger ArchiveBox to archive them
    # Documentation: https://github.com/dgtlmoon/changedetection.io
    # More info: https://github.com/dgtlmoon/changedetection.io/blob/master/docker-compose.yml

    # changedetection:
    #     image: ghcr.io/dgtlmoon/changedetection.io
    #     volumes:
    #         - ./data-changedetection:/datastore


# HOW TO: Set up cloud storage for your ./data/archive (e.g. Amazon S3, Backblaze B2, Google Drive, OneDrive, SFTP, etc.)
#   https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage
#
#   Follow the steps here to set up the Docker RClone Plugin https://rclone.org/docker/
#     $ docker plugin install rclone/docker-volume-rclone:amd64 --grant-all-permissions --alias rclone
#     $ nano /var/lib/docker-plugins/rclone/config/rclone.conf
#     [examplegdrive]
#     type = drive
#     scope = drive
#     drive_id = 1234567...
#     root_folder_id = 0Abcd...
#     token = {"access_token":...}

# volumes:
#     archive:
#         driver: rclone
#         driver_opts:
#             remote: 'examplegdrive:archivebox'
#             allow_other: 'true'
#             vfs_cache_mode: full
#             poll_interval: 0