mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-06-21 19:10:45 -04:00
Fix runtime hook config regressions
This commit is contained in:
@@ -4,6 +4,7 @@ import math
|
||||
import json
|
||||
import mimetypes
|
||||
import re
|
||||
from html import unescape
|
||||
from collections import defaultdict
|
||||
from pathlib import Path, PurePosixPath
|
||||
from uuid import UUID
|
||||
@@ -832,7 +833,7 @@ def _snapshots_rss_response(
|
||||
crawl_user = snapshot.crawl.created_by if snapshot.crawl_id else None
|
||||
description = f"Original URL: {snapshot.url}\nArchived snapshot: {archived_url}"
|
||||
feed.add_item(
|
||||
title=snapshot.title or snapshot.url,
|
||||
title=unescape(snapshot.title or snapshot.url),
|
||||
link=archived_url or web_base_url,
|
||||
description=description,
|
||||
unique_id=str(snapshot.id),
|
||||
|
||||
@@ -664,7 +664,7 @@ class ArchiveBoxBaseConfig(
|
||||
def _scoped_config(self, *, include_execution: bool) -> dict[str, Any]:
|
||||
keys = type(self)._crawl_runtime_keys() if include_execution else type(self)._crawl_frozen_keys()
|
||||
payload = self.model_dump(mode="json")
|
||||
return {key: payload[key] for key in keys if payload.get(key) is not None}
|
||||
return {key: payload[key] for key in keys if payload.get(key, None) is not None}
|
||||
|
||||
def for_crawl(self) -> dict[str, Any]:
|
||||
"""Config scoped to crawl execution, without runtime object overlays."""
|
||||
|
||||
@@ -279,15 +279,25 @@ def run_hook(
|
||||
records = process.get_records() # Get parsed JSONL output
|
||||
"""
|
||||
from archivebox.machine.models import Process, Machine, NetworkInterface
|
||||
from archivebox.config.common import ArchiveBoxConfig, _archivebox_config_input_names, get_config, normalize_runtime_config
|
||||
from archivebox.config.common import (
|
||||
ArchiveBoxConfig,
|
||||
_archivebox_config_input_names,
|
||||
get_config,
|
||||
normalize_runtime_config,
|
||||
_plugin_enabled_config_keys,
|
||||
)
|
||||
|
||||
config_scope = {key.removeprefix("config_"): kwargs.pop(key) for key in list(kwargs) if key.startswith("config_")}
|
||||
config_overrides = _config_to_overrides(config)
|
||||
resolved_config = get_config(overrides=config_overrides, **config_scope)
|
||||
hook_config = normalize_runtime_config(
|
||||
resolved_config.for_crawl_runtime(),
|
||||
resolved_config.for_crawl_runtime(runtime_overrides=config_overrides),
|
||||
json_safe=False,
|
||||
)
|
||||
plugin_enabled_keys = set(_plugin_enabled_config_keys().values())
|
||||
if plugin_enabled_keys.intersection(hook_config):
|
||||
for enabled_key in plugin_enabled_keys:
|
||||
hook_config.setdefault(enabled_key, False)
|
||||
|
||||
# Auto-detect timeout from plugin config if not explicitly provided
|
||||
if timeout is None:
|
||||
@@ -368,6 +378,7 @@ def run_hook(
|
||||
archivebox_config_input_names = _archivebox_config_input_names()
|
||||
for key in archivebox_config_input_names:
|
||||
env.pop(key, None)
|
||||
env.pop("PLUGINS", None)
|
||||
env["DATA_DIR"] = str(CONSTANTS.DATA_DIR)
|
||||
env["LIBRARY_VERSION"] = VERSION
|
||||
env.setdefault("MACHINE_ID", os.environ.get("MACHINE_ID", CONSTANTS.MACHINE_ID))
|
||||
|
||||
@@ -31,6 +31,26 @@ def current_network_interface_with_machine():
|
||||
return NetworkInterface.objects.select_related("machine").get(id=current_iface.id)
|
||||
|
||||
|
||||
def normalize_process_env(env: dict) -> dict:
    """Return a sanitized copy of a hook process environment.

    The input mapping is never mutated; a shallow copy is normalized and
    returned. Normalization does three things:

    1. Removes the ``PLUGINS`` entry, remembering the comma-separated plugin
       names it listed (trimmed and lower-cased).
    2. Removes every key that ``is_sensitive_config_key`` flags, and every
       ArchiveBox config input name that is not one of
       ``ArchiveBoxConfig._crawl_runtime_keys()``.
    3. If any plugins were listed, expands the selection with
       ``_plugins_with_required_plugins`` and fills in each plugin's enabled
       key with the string ``"True"`` or ``"False"``. Keys already present
       in the env are left untouched (``setdefault``).

    Args:
        env: Environment mapping for the process; ``None`` or an empty
            mapping is treated as ``{}``.

    Returns:
        A new dict containing the normalized environment.
    """
    # Imported lazily inside the function, as in the original code
    # (presumably to avoid an import cycle with the config package).
    from archivebox.config.common import (
        ArchiveBoxConfig,
        _archivebox_config_input_names,
        _plugin_enabled_config_keys,
        _plugins_with_required_plugins,
        is_sensitive_config_key,
    )

    normalized = dict(env or {})

    # `or ""` guards against a None/falsy PLUGINS value: str(None) would
    # otherwise produce the bogus plugin name "none" and cause every
    # plugin enabled key to default to "False".
    raw_plugins = normalized.pop("PLUGINS", "") or ""
    selected_plugins = {name.strip().lower() for name in str(raw_plugins).split(",") if name.strip()}

    allowed_config_keys = ArchiveBoxConfig._crawl_runtime_keys()
    config_input_names = _archivebox_config_input_names()
    # Iterate over a snapshot of the keys because entries are removed in the loop.
    for key in list(normalized):
        if is_sensitive_config_key(key) or (key in config_input_names and key not in allowed_config_keys):
            normalized.pop(key, None)

    if selected_plugins:
        selected_plugins = _plugins_with_required_plugins(selected_plugins)
        for plugin_name, enabled_key in _plugin_enabled_config_keys().items():
            # Values are strings because this is an environment mapping.
            normalized.setdefault(enabled_key, "True" if plugin_name in selected_plugins else "False")
    return normalized
|
||||
|
||||
|
||||
class ProcessService(BaseService):
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [
|
||||
ProcessStartedEvent,
|
||||
@@ -74,6 +94,7 @@ class ProcessService(BaseService):
|
||||
started_at=started_at,
|
||||
)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
process_env = normalize_process_env(event.env)
|
||||
if process is None:
|
||||
process = await Process.objects.acreate(
|
||||
machine=iface.machine,
|
||||
@@ -82,7 +103,7 @@ class ProcessService(BaseService):
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
env=process_env,
|
||||
timeout=event.timeout,
|
||||
pid=event.pid or None,
|
||||
url=event.url or None,
|
||||
@@ -97,7 +118,7 @@ class ProcessService(BaseService):
|
||||
|
||||
process.pwd = event.output_dir
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.env = process_env
|
||||
process.timeout = event.timeout
|
||||
process.pid = event.pid or None
|
||||
process.url = event.url or process.url
|
||||
@@ -172,6 +193,7 @@ class ProcessService(BaseService):
|
||||
started_at=started_at,
|
||||
)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
process_env = normalize_process_env(event.env)
|
||||
if process is None:
|
||||
await Process.objects.acreate(
|
||||
machine=iface.machine,
|
||||
@@ -180,7 +202,7 @@ class ProcessService(BaseService):
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
env=process_env,
|
||||
timeout=event.timeout,
|
||||
pid=event.pid or None,
|
||||
url=event.url or None,
|
||||
@@ -197,6 +219,7 @@ class ProcessService(BaseService):
|
||||
"machine_id": iface.machine_id,
|
||||
"iface_id": iface.id,
|
||||
"pwd": event.output_dir,
|
||||
"env": process_env,
|
||||
"pid": event.pid or process.pid,
|
||||
"url": event.url or process.url,
|
||||
"process_type": process_type or process.process_type,
|
||||
|
||||
@@ -55,7 +55,12 @@ from abxbus import BaseEvent
|
||||
from abxbus.event_bus import EventBus, get_current_event, in_handler_context
|
||||
from abxbus.event_handler import EventHandlerAbortedError, EventHandlerCancelledError
|
||||
|
||||
from archivebox.config.common import ArchiveBoxBaseConfig, normalize_runtime_config
|
||||
from archivebox.config.common import (
|
||||
ArchiveBoxBaseConfig,
|
||||
normalize_runtime_config,
|
||||
_plugin_enabled_config_keys,
|
||||
_plugins_with_required_plugins,
|
||||
)
|
||||
from archivebox.misc.db import run_db_analyze_batch
|
||||
from archivebox.core.shutdown_util import foreground_shutdown_signals, raise_if_shutdown_requested
|
||||
from archivebox.search.sonic_daemon import register_sonic_daemon_event_handler
|
||||
@@ -800,6 +805,11 @@ class CrawlRunner:
|
||||
},
|
||||
)
|
||||
normalized_config = normalize_runtime_config(config)
|
||||
configured_plugins = [name.strip().lower() for name in str(normalized_config.get("PLUGINS") or "").split(",") if name.strip()]
|
||||
if configured_plugins:
|
||||
selected_plugin_names = _plugins_with_required_plugins(set(configured_plugins))
|
||||
for plugin_name, enabled_key in _plugin_enabled_config_keys().items():
|
||||
normalized_config.setdefault(enabled_key, plugin_name in selected_plugin_names)
|
||||
return {
|
||||
"id": str(snapshot.id),
|
||||
"url": snapshot.url,
|
||||
@@ -1437,6 +1447,9 @@ def queued_plugins_for_snapshot(snapshot_id: str) -> list[str] | None:
|
||||
def config_overrides_for_queued_plugins(selected_plugins: list[str], **overrides: Any) -> dict[str, Any]:
|
||||
config_overrides = dict(overrides)
|
||||
config_overrides["PLUGINS"] = ",".join(selected_plugins)
|
||||
selected_plugin_names = _plugins_with_required_plugins({plugin_name.lower() for plugin_name in selected_plugins})
|
||||
for plugin_name, enabled_key in _plugin_enabled_config_keys().items():
|
||||
config_overrides[enabled_key] = plugin_name in selected_plugin_names
|
||||
for plugin_name in selected_plugins:
|
||||
if plugin_name.startswith("search_backend_"):
|
||||
config_overrides[f"{plugin_name.upper()}_ENABLED"] = True
|
||||
|
||||
Reference in New Issue
Block a user