Fix runtime hook config regressions

This commit is contained in:
Nick Sweeting
2026-06-21 06:09:04 -07:00
parent ef6f616b37
commit 3fa4fd5d18
5 changed files with 56 additions and 8 deletions
+2 -1
View File
@@ -4,6 +4,7 @@ import math
import json
import mimetypes
import re
from html import unescape
from collections import defaultdict
from pathlib import Path, PurePosixPath
from uuid import UUID
@@ -832,7 +833,7 @@ def _snapshots_rss_response(
crawl_user = snapshot.crawl.created_by if snapshot.crawl_id else None
description = f"Original URL: {snapshot.url}\nArchived snapshot: {archived_url}"
feed.add_item(
title=snapshot.title or snapshot.url,
title=unescape(snapshot.title or snapshot.url),
link=archived_url or web_base_url,
description=description,
unique_id=str(snapshot.id),
+1 -1
View File
@@ -664,7 +664,7 @@ class ArchiveBoxBaseConfig(
def _scoped_config(self, *, include_execution: bool) -> dict[str, Any]:
    """Return the subset of this config that belongs to the requested scope.

    Args:
        include_execution: When true, select the keys returned by
            ``_crawl_runtime_keys()``; otherwise select the keys returned by
            ``_crawl_frozen_keys()``.

    Returns:
        A dict mapping each selected key to its JSON-mode dumped value.
        Keys that are absent from the dump, or whose dumped value is
        ``None``, are left out.
    """
    config_cls = type(self)
    keys = config_cls._crawl_runtime_keys() if include_execution else config_cls._crawl_frozen_keys()
    payload = self.model_dump(mode="json")
    # Single lookup per key; ``.get(key)`` already defaults to None, so one
    # filter covers both "missing" and "explicitly None".
    return {key: value for key in keys if (value := payload.get(key)) is not None}
def for_crawl(self) -> dict[str, Any]:
"""Config scoped to crawl execution, without runtime object overlays."""
+13 -2
View File
@@ -279,15 +279,25 @@ def run_hook(
records = process.get_records() # Get parsed JSONL output
"""
from archivebox.machine.models import Process, Machine, NetworkInterface
from archivebox.config.common import ArchiveBoxConfig, _archivebox_config_input_names, get_config, normalize_runtime_config
from archivebox.config.common import (
ArchiveBoxConfig,
_archivebox_config_input_names,
get_config,
normalize_runtime_config,
_plugin_enabled_config_keys,
)
config_scope = {key.removeprefix("config_"): kwargs.pop(key) for key in list(kwargs) if key.startswith("config_")}
config_overrides = _config_to_overrides(config)
resolved_config = get_config(overrides=config_overrides, **config_scope)
hook_config = normalize_runtime_config(
resolved_config.for_crawl_runtime(),
resolved_config.for_crawl_runtime(runtime_overrides=config_overrides),
json_safe=False,
)
plugin_enabled_keys = set(_plugin_enabled_config_keys().values())
if plugin_enabled_keys.intersection(hook_config):
for enabled_key in plugin_enabled_keys:
hook_config.setdefault(enabled_key, False)
# Auto-detect timeout from plugin config if not explicitly provided
if timeout is None:
@@ -368,6 +378,7 @@ def run_hook(
archivebox_config_input_names = _archivebox_config_input_names()
for key in archivebox_config_input_names:
env.pop(key, None)
env.pop("PLUGINS", None)
env["DATA_DIR"] = str(CONSTANTS.DATA_DIR)
env["LIBRARY_VERSION"] = VERSION
env.setdefault("MACHINE_ID", os.environ.get("MACHINE_ID", CONSTANTS.MACHINE_ID))
+26 -3
View File
@@ -31,6 +31,26 @@ def current_network_interface_with_machine():
return NetworkInterface.objects.select_related("machine").get(id=current_iface.id)
def normalize_process_env(env: dict) -> dict:
    """Return a filtered copy of a hook process environment.

    The input mapping is never mutated. The returned copy differs from it in
    three ways:

    * the ``PLUGINS`` entry is removed;
    * any key flagged by ``is_sensitive_config_key`` is removed, as is any
      ArchiveBox config input name that is not one of
      ``ArchiveBoxConfig._crawl_runtime_keys()``;
    * if ``PLUGINS`` named at least one plugin, every plugin "enabled" key
      from ``_plugin_enabled_config_keys()`` that is not already present is
      filled in with the string ``"True"`` or ``"False"``.

    Args:
        env: Environment mapping of the process; ``None`` or an empty
            mapping is treated as ``{}``.

    Returns:
        A new dict with the adjustments described above.
    """
    from archivebox.config.common import (
        ArchiveBoxConfig,
        _archivebox_config_input_names,
        _plugin_enabled_config_keys,
        _plugins_with_required_plugins,
        is_sensitive_config_key,
    )

    normalized = dict(env or {})

    # ``or ""`` matters: str(None) is "None", which would otherwise be parsed
    # as a selected plugin called "none" and flip every plugin to "False".
    raw_plugins = normalized.pop("PLUGINS", "")
    selected_plugins = {name.strip().lower() for name in str(raw_plugins or "").split(",") if name.strip()}

    allowed_config_keys = ArchiveBoxConfig._crawl_runtime_keys()
    config_input_names = _archivebox_config_input_names()
    # Iterate over a snapshot of the keys because entries are removed in the loop.
    for key in list(normalized):
        if is_sensitive_config_key(key) or (key in config_input_names and key not in allowed_config_keys):
            normalized.pop(key, None)

    if selected_plugins:
        # Presumably expands the selection with the plugins it depends on;
        # confirm against _plugins_with_required_plugins.
        selected_plugins = _plugins_with_required_plugins(selected_plugins)
        for plugin_name, enabled_key in _plugin_enabled_config_keys().items():
            # setdefault: a value already present in the env is kept as-is.
            normalized.setdefault(enabled_key, "True" if plugin_name in selected_plugins else "False")

    return normalized
class ProcessService(BaseService):
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [
ProcessStartedEvent,
@@ -74,6 +94,7 @@ class ProcessService(BaseService):
started_at=started_at,
)
process = await process_query.order_by("-modified_at").afirst()
process_env = normalize_process_env(event.env)
if process is None:
process = await Process.objects.acreate(
machine=iface.machine,
@@ -82,7 +103,7 @@ class ProcessService(BaseService):
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
env=process_env,
timeout=event.timeout,
pid=event.pid or None,
url=event.url or None,
@@ -97,7 +118,7 @@ class ProcessService(BaseService):
process.pwd = event.output_dir
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.env = process_env
process.timeout = event.timeout
process.pid = event.pid or None
process.url = event.url or process.url
@@ -172,6 +193,7 @@ class ProcessService(BaseService):
started_at=started_at,
)
process = await process_query.order_by("-modified_at").afirst()
process_env = normalize_process_env(event.env)
if process is None:
await Process.objects.acreate(
machine=iface.machine,
@@ -180,7 +202,7 @@ class ProcessService(BaseService):
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
env=process_env,
timeout=event.timeout,
pid=event.pid or None,
url=event.url or None,
@@ -197,6 +219,7 @@ class ProcessService(BaseService):
"machine_id": iface.machine_id,
"iface_id": iface.id,
"pwd": event.output_dir,
"env": process_env,
"pid": event.pid or process.pid,
"url": event.url or process.url,
"process_type": process_type or process.process_type,
+14 -1
View File
@@ -55,7 +55,12 @@ from abxbus import BaseEvent
from abxbus.event_bus import EventBus, get_current_event, in_handler_context
from abxbus.event_handler import EventHandlerAbortedError, EventHandlerCancelledError
from archivebox.config.common import ArchiveBoxBaseConfig, normalize_runtime_config
from archivebox.config.common import (
ArchiveBoxBaseConfig,
normalize_runtime_config,
_plugin_enabled_config_keys,
_plugins_with_required_plugins,
)
from archivebox.misc.db import run_db_analyze_batch
from archivebox.core.shutdown_util import foreground_shutdown_signals, raise_if_shutdown_requested
from archivebox.search.sonic_daemon import register_sonic_daemon_event_handler
@@ -800,6 +805,11 @@ class CrawlRunner:
},
)
normalized_config = normalize_runtime_config(config)
configured_plugins = [name.strip().lower() for name in str(normalized_config.get("PLUGINS") or "").split(",") if name.strip()]
if configured_plugins:
selected_plugin_names = _plugins_with_required_plugins(set(configured_plugins))
for plugin_name, enabled_key in _plugin_enabled_config_keys().items():
normalized_config.setdefault(enabled_key, plugin_name in selected_plugin_names)
return {
"id": str(snapshot.id),
"url": snapshot.url,
@@ -1437,6 +1447,9 @@ def queued_plugins_for_snapshot(snapshot_id: str) -> list[str] | None:
def config_overrides_for_queued_plugins(selected_plugins: list[str], **overrides: Any) -> dict[str, Any]:
config_overrides = dict(overrides)
config_overrides["PLUGINS"] = ",".join(selected_plugins)
selected_plugin_names = _plugins_with_required_plugins({plugin_name.lower() for plugin_name in selected_plugins})
for plugin_name, enabled_key in _plugin_enabled_config_keys().items():
config_overrides[enabled_key] = plugin_name in selected_plugin_names
for plugin_name in selected_plugins:
if plugin_name.startswith("search_backend_"):
config_overrides[f"{plugin_name.upper()}_ENABLED"] = True