diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index a90ed6ec..a32982cd 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -4,6 +4,7 @@ import math import json import mimetypes import re +from html import unescape from collections import defaultdict from pathlib import Path, PurePosixPath from uuid import UUID @@ -832,7 +833,7 @@ def _snapshots_rss_response( crawl_user = snapshot.crawl.created_by if snapshot.crawl_id else None description = f"Original URL: {snapshot.url}\nArchived snapshot: {archived_url}" feed.add_item( - title=snapshot.title or snapshot.url, + title=unescape(snapshot.title or snapshot.url), link=archived_url or web_base_url, description=description, unique_id=str(snapshot.id), diff --git a/archivebox/config/common.py b/archivebox/config/common.py index fdf46741..329ddab8 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -664,7 +664,7 @@ class ArchiveBoxBaseConfig( def _scoped_config(self, *, include_execution: bool) -> dict[str, Any]: keys = type(self)._crawl_runtime_keys() if include_execution else type(self)._crawl_frozen_keys() payload = self.model_dump(mode="json") - return {key: payload[key] for key in keys if payload.get(key) is not None} + return {key: payload[key] for key in keys if payload.get(key, None) is not None} def for_crawl(self) -> dict[str, Any]: """Config scoped to crawl execution, without runtime object overlays.""" diff --git a/archivebox/plugins/hooks.py b/archivebox/plugins/hooks.py index 03773a34..1a1bde2b 100644 --- a/archivebox/plugins/hooks.py +++ b/archivebox/plugins/hooks.py @@ -279,15 +279,25 @@ def run_hook( records = process.get_records() # Get parsed JSONL output """ from archivebox.machine.models import Process, Machine, NetworkInterface - from archivebox.config.common import ArchiveBoxConfig, _archivebox_config_input_names, get_config, normalize_runtime_config + from archivebox.config.common import ( + ArchiveBoxConfig, + _archivebox_config_input_names, + get_config, + normalize_runtime_config, + _plugin_enabled_config_keys, + ) config_scope = {key.removeprefix("config_"): kwargs.pop(key) for key in list(kwargs) if key.startswith("config_")} config_overrides = _config_to_overrides(config) resolved_config = get_config(overrides=config_overrides, **config_scope) hook_config = normalize_runtime_config( - resolved_config.for_crawl_runtime(), + resolved_config.for_crawl_runtime(runtime_overrides=config_overrides), json_safe=False, ) + plugin_enabled_keys = set(_plugin_enabled_config_keys().values()) + if plugin_enabled_keys.intersection(hook_config): + for enabled_key in plugin_enabled_keys: + hook_config.setdefault(enabled_key, False) # Auto-detect timeout from plugin config if not explicitly provided if timeout is None: @@ -368,6 +378,7 @@ def run_hook( archivebox_config_input_names = _archivebox_config_input_names() for key in archivebox_config_input_names: env.pop(key, None) + env.pop("PLUGINS", None) env["DATA_DIR"] = str(CONSTANTS.DATA_DIR) env["LIBRARY_VERSION"] = VERSION env.setdefault("MACHINE_ID", os.environ.get("MACHINE_ID", CONSTANTS.MACHINE_ID)) diff --git a/archivebox/services/process_service.py b/archivebox/services/process_service.py index 3cd9e26c..b2228c42 100644 --- a/archivebox/services/process_service.py +++ b/archivebox/services/process_service.py @@ -31,6 +31,26 @@ def current_network_interface_with_machine(): return NetworkInterface.objects.select_related("machine").get(id=current_iface.id) +def normalize_process_env(env: dict) -> dict: + normalized = dict(env or {}) + raw_plugins = normalized.pop("PLUGINS", "") + selected_plugins = {name.strip().lower() for name in str(raw_plugins).split(",") if name.strip()} + from archivebox.config.common import ArchiveBoxConfig, _archivebox_config_input_names, is_sensitive_config_key + + allowed_config_keys = ArchiveBoxConfig._crawl_runtime_keys() + config_input_names = _archivebox_config_input_names() + for key in list(normalized): + if is_sensitive_config_key(key) or (key in config_input_names and key not in allowed_config_keys): + normalized.pop(key, None) + if selected_plugins: + from archivebox.config.common import _plugin_enabled_config_keys, _plugins_with_required_plugins + + selected_plugins = _plugins_with_required_plugins(selected_plugins) + for plugin_name, enabled_key in _plugin_enabled_config_keys().items(): + normalized.setdefault(enabled_key, "True" if plugin_name in selected_plugins else "False") + return normalized + + class ProcessService(BaseService): LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ ProcessStartedEvent, @@ -74,6 +94,7 @@ class ProcessService(BaseService): started_at=started_at, ) process = await process_query.order_by("-modified_at").afirst() + process_env = normalize_process_env(event.env) if process is None: process = await Process.objects.acreate( machine=iface.machine, @@ -82,7 +103,7 @@ class ProcessService(BaseService): worker_type=worker_type, pwd=event.output_dir, cmd=[event.hook_path, *event.hook_args], - env=event.env, + env=process_env, timeout=event.timeout, pid=event.pid or None, url=event.url or None, @@ -97,7 +118,7 @@ class ProcessService(BaseService): process.pwd = event.output_dir process.cmd = [event.hook_path, *event.hook_args] - process.env = event.env + process.env = process_env process.timeout = event.timeout process.pid = event.pid or None process.url = event.url or process.url @@ -172,6 +193,7 @@ class ProcessService(BaseService): started_at=started_at, ) process = await process_query.order_by("-modified_at").afirst() + process_env = normalize_process_env(event.env) if process is None: await Process.objects.acreate( machine=iface.machine, @@ -180,7 +202,7 @@ class ProcessService(BaseService): worker_type=worker_type, pwd=event.output_dir, cmd=[event.hook_path, *event.hook_args], - env=event.env, + env=process_env, timeout=event.timeout, pid=event.pid or None, url=event.url or None, @@ -197,6 +219,7 @@ class ProcessService(BaseService): "machine_id": iface.machine_id, "iface_id": iface.id, "pwd": event.output_dir, + "env": process_env, "pid": event.pid or process.pid, "url": event.url or process.url, "process_type": process_type or process.process_type, diff --git a/archivebox/services/runner.py b/archivebox/services/runner.py index acd1a40b..b021c5d8 100644 --- a/archivebox/services/runner.py +++ b/archivebox/services/runner.py @@ -55,7 +55,12 @@ from abxbus import BaseEvent from abxbus.event_bus import EventBus, get_current_event, in_handler_context from abxbus.event_handler import EventHandlerAbortedError, EventHandlerCancelledError -from archivebox.config.common import ArchiveBoxBaseConfig, normalize_runtime_config +from archivebox.config.common import ( + ArchiveBoxBaseConfig, + normalize_runtime_config, + _plugin_enabled_config_keys, + _plugins_with_required_plugins, +) from archivebox.misc.db import run_db_analyze_batch from archivebox.core.shutdown_util import foreground_shutdown_signals, raise_if_shutdown_requested from archivebox.search.sonic_daemon import register_sonic_daemon_event_handler @@ -800,6 +805,11 @@ class CrawlRunner: }, ) normalized_config = normalize_runtime_config(config) + configured_plugins = [name.strip().lower() for name in str(normalized_config.get("PLUGINS") or "").split(",") if name.strip()] + if configured_plugins: + selected_plugin_names = _plugins_with_required_plugins(set(configured_plugins)) + for plugin_name, enabled_key in _plugin_enabled_config_keys().items(): + normalized_config.setdefault(enabled_key, plugin_name in selected_plugin_names) return { "id": str(snapshot.id), "url": snapshot.url, @@ -1437,6 +1447,9 @@ def queued_plugins_for_snapshot(snapshot_id: str) -> list[str] | None: def config_overrides_for_queued_plugins(selected_plugins: list[str], **overrides: Any) -> dict[str, Any]: config_overrides = dict(overrides) config_overrides["PLUGINS"] = ",".join(selected_plugins) + selected_plugin_names = _plugins_with_required_plugins({plugin_name.lower() for plugin_name in selected_plugins}) + for plugin_name, enabled_key in _plugin_enabled_config_keys().items(): + config_overrides[enabled_key] = plugin_name in selected_plugin_names for plugin_name in selected_plugins: if plugin_name.startswith("search_backend_"): config_overrides[f"{plugin_name.upper()}_ENABLED"] = True