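"""Filesystem scanner service.

Phase 1 discovers file metadata and synchronizes it into the database index,
using GNU `find -printf` when available and falling back to `os.walk`.
Phase 2 hashes file contents in the background with dynamic I/O throttling,
using a native `sha256sum`/`shasum` binary when available and falling back
to Python's hashlib.
"""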
import concurrent.futures
import hashlib
import os
import shutil
import subprocess
import threading
import time
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional, Tuple

import psutil
from loguru import logger
from sqlalchemy.orm import Session
from sqlalchemy.orm.exc import ObjectDeletedError

from app.db import models
from app.db.database import SessionLocal

# Fast file discovery via `find -printf` (GNU find or compatible).
# Detected once at import time; falls back to os.walk if unavailable.
_FAST_FIND_BINARY: Optional[str] = None

# Fast hashing via `sha256sum` or `shasum`.
# Detected once at import time; falls back to Python hashlib if unavailable.
_FAST_HASH_BINARY: Optional[str] = None


def _detect_fast_find() -> Optional[str]:
    """Check if a `find` binary with `-printf` support is available.

    Tries `gfind` (GNU find via Homebrew on macOS) first, then `find`.
    Returns the binary path if `-printf` works, otherwise ``None``.
    """
    for candidate in ("gfind", "find"):
        binary = shutil.which(candidate)
        if binary is None:
            continue
        try:
            result = subprocess.run(
                [binary, "/tmp", "-maxdepth", "0", "-printf", "%f\n"],
                capture_output=True,
                timeout=5,
            )
            if result.returncode == 0 and result.stdout.strip() == b"tmp":
                return binary
        except Exception:
            continue
    return None


def _detect_fast_hash() -> Optional[str]:
    """Check if a SHA-256 binary is available for batch hashing.

    Tries `sha256sum` (GNU coreutils, Linux/Homebrew) then `shasum` (macOS).
    Returns the binary path if it works, otherwise ``None``.
    """
    # Try sha256sum first (Linux, Homebrew gnu-coreutils)
    binary = shutil.which("sha256sum")
    if binary:
        try:
            result = subprocess.run(
                [binary, "/dev/null"],
                capture_output=True,
                timeout=5,
            )
            if (
                result.returncode == 0
                and b"e3b0c44298fc1c149afbf4c8996fb924" in result.stdout
            ):
                return binary
        except Exception:
            pass

    # Try shasum (macOS default)
    binary = shutil.which("shasum")
    if binary:
        try:
            result = subprocess.run(
                [binary, "-a", "256", "/dev/null"],
                capture_output=True,
                timeout=5,
            )
            if (
                result.returncode == 0
                and b"e3b0c44298fc1c149afbf4c8996fb924" in result.stdout
            ):
                return binary
        except Exception:
            pass

    return None


def _init_fast_features() -> Tuple[Optional[str], Optional[str]]:
    global _FAST_FIND_BINARY, _FAST_HASH_BINARY
    _FAST_FIND_BINARY = _detect_fast_find()
    _FAST_HASH_BINARY = _detect_fast_hash()

    if _FAST_FIND_BINARY:
        logger.info(f"Fast file discovery enabled: using {_FAST_FIND_BINARY} -printf")
    else:
        logger.info("Fast file discovery unavailable: falling back to os.walk")

    if _FAST_HASH_BINARY:
        logger.info(f"Fast hashing enabled: using {_FAST_HASH_BINARY}")
    else:
        logger.info("Fast hashing unavailable: falling back to Python hashlib")

    return _FAST_FIND_BINARY, _FAST_HASH_BINARY


_FAST_FIND_BINARY, _FAST_HASH_BINARY = _init_fast_features()


def _hash_file_batch_fast(
    file_paths: List[str], binary: str
) -> Dict[str, Optional[str]]:
    """Hash a batch of files using a native SHA-256 binary.

    Streams output line-by-line via subprocess.Popen for incremental progress.

    Args:
        file_paths: Paths to hash.
        binary: Path to sha256sum or shasum.

    Returns a mapping of file_path -> hex_digest (or None on failure).
    """
    results: Dict[str, Optional[str]] = {}

    if not file_paths:
        return results

    # Build command: shasum needs -a 256 prefix, sha256sum doesn't
    if binary.endswith("sha256sum"):
        cmd = [binary, "--"] + file_paths
    else:
        # shasum
        cmd = [binary, "-a", "256", "--"] + file_paths

    try:
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )

        # Stream output line-by-line for incremental progress
        if proc.stdout is None:
            return results
        for line in iter(proc.stdout.readline, b""):
            line = line.strip()
            if not line:
                continue
            # Format: "<hash>  <path>" or "<hash> *<path>"
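            # e.g. b"<64-hex-chars>  /path/to/file"  (text mode)
            #  or  b"<64-hex-chars> */path/to/file"  (binary mode)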
            parts = line.split(b"  ", 1)
            if len(parts) != 2:
                # Try single space with binary marker: "<hash> *<path>"
                parts = line.split(b" *", 1)
                if len(parts) != 2:
                    continue

            file_hash = parts[0].decode("ascii", errors="replace").lower()
            raw_path = parts[1].decode("utf-8", errors="replace")

            # sha256sum may escape backslashes in filenames; handle common case
            clean_path = raw_path.replace("\\\\", "\\")

            results[clean_path] = file_hash

        proc.stdout.close()
        proc.wait()

    except Exception as e:
        logger.error(f"Native hash batch failed: {e}")

    return results


def _discover_files_fast(
    root_base: str,
    job_id: Optional[int],
    batch_size: int,
    current_timestamp: datetime,
    resolve_tracking: Callable[[str], bool],
    sync_metadata_batch: Callable[..., None],
    metrics_lock,
    metrics: Dict[str, Any],
    db_session: Session,
) -> Tuple[int, int]:
    """Walk a tree using `find -printf` for fast metadata extraction.

    Streams output line-by-line via subprocess.Popen so progress updates
    appear as files are discovered instead of waiting for find to finish.

    Returns (files_found, files_batched) counts.
    """
    total_files_found = 0
    files_batched = 0
    pending_metadata: List[Dict[str, Any]] = []

    # -printf format: path\tsize\tmtime (tab-separated; split from right for safety)
    find_binary = _FAST_FIND_BINARY
    if find_binary is None:
        logger.warning(
            "Fast file discovery requested but no compatible `find` binary found"
        )
        return 0, 0
    cmd = [
        find_binary,
        root_base,
        "-type",
        "f",
        "-printf",
        "%p\t%s\t%T@\n",
    ]
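    # Each output line looks like b"/path/to/file\t4096\t1714689156.1234567890\n";
    # %T@ prints mtime as fractional seconds since the epoch.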

    try:
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        if proc.stdout is None:
            logger.error(
                f"Fast file discovery failed: could not open stdout for {root_base}"
            )
            return 0, 0
    except Exception as e:
        logger.error(f"Fast file discovery failed for {root_base}: {e}")
        return 0, 0

    # Stream output line by line (tab-separated: path\tsize\tmtime)
    for line in iter(proc.stdout.readline, b""):
        if job_id is not None and JobManager.is_cancelled(job_id):
            break

        if not line.strip():
            continue

        # Split from right: mtime and size are always numeric
        parts = line.split(b"\t")
        if len(parts) < 3:
            continue

        # First n-2 parts may be path components (tabs in filenames are rare)
        full_file_path = b"\t".join(parts[:-2]).decode("utf-8", errors="replace")
        try:
            file_size = int(parts[-2])
            file_mtime = float(parts[-1])
        except (ValueError, IndexError):
            continue

        total_files_found += 1
        with metrics_lock:
            metrics["total_files_found"] = total_files_found
            metrics["current_path"] = os.path.dirname(full_file_path)

        is_ignored = resolve_tracking(full_file_path)
        pending_metadata.append(
            {
                "path": full_file_path,
                "size": file_size,
                "mtime": file_mtime,
                "ignored": is_ignored,
            }
        )

        if len(pending_metadata) >= batch_size:
            sync_metadata_batch(db_session, pending_metadata, current_timestamp)
            db_session.commit()
            files_batched += len(pending_metadata)
            pending_metadata = []
            if job_id is not None:
                JobManager.update_job(
                    job_id,
                    10.0,
                    f"Discovered {total_files_found} items...",
                )

    proc.stdout.close()
    proc.wait()

    # Flush remaining batch
    if pending_metadata:
        sync_metadata_batch(db_session, pending_metadata, current_timestamp)
        db_session.commit()
        files_batched += len(pending_metadata)

    return total_files_found, files_batched


class JobManager:
    """Manages operational job states and persistence with high resilience for background threads."""

    @staticmethod
    def create_job(db_session: Session, job_type: str) -> models.Job:
        """Creates a new job record in the database."""
        job_record = models.Job(job_type=job_type, status="PENDING")
        db_session.add(job_record)
        db_session.commit()
        db_session.refresh(job_record)
        return job_record

    @staticmethod
    def start_job(job_id: int):
        """Marks a job as running and sets the start timestamp."""
        with SessionLocal() as db_session:
            try:
                job_record = db_session.get(models.Job, job_id)
                if job_record:
                    job_record.status = "RUNNING"
                    job_record.started_at = datetime.now(timezone.utc)
                    db_session.commit()
            except Exception as e:
                db_session.rollback()
                logger.debug(f"JobManager.start_job failed for {job_id}: {e}")

    @staticmethod
    def update_job(job_id: int, progress: float, current_task: str):
        """Updates the progress and current task description for a job."""
        with SessionLocal() as db_session:
            try:
                job_record = db_session.get(models.Job, job_id)
                if job_record:
                    job_record.progress = progress
                    job_record.current_task = current_task
                    db_session.commit()
            except Exception as e:
                db_session.rollback()
                logger.debug(f"JobManager.update_job failed for {job_id}: {e}")

    @staticmethod
    def complete_job(job_id: int):
        """Marks a job as successfully completed if it is still active."""
        with SessionLocal() as db_session:
            try:
                job_record = db_session.get(models.Job, job_id)
                if job_record and job_record.status in ("PENDING", "RUNNING"):
                    job_record.status = "COMPLETED"
                    job_record.progress = 100.0
                    job_record.completed_at = datetime.now(timezone.utc)
                    db_session.commit()
            except Exception as e:
                db_session.rollback()
                logger.debug(f"JobManager.complete_job failed for {job_id}: {e}")

    @staticmethod
    def fail_job(job_id: int, error_message: str):
        """Marks a job as failed if it is still active."""
        with SessionLocal() as db_session:
            try:
                job_record = db_session.get(models.Job, job_id)
                if job_record and job_record.status in ("PENDING", "RUNNING"):
                    job_record.status = "FAILED"
                    job_record.error_message = error_message
                    job_record.completed_at = datetime.now(timezone.utc)
                    db_session.commit()
            except Exception as e:
                db_session.rollback()
                logger.debug(f"JobManager.fail_job failed for {job_id}: {e}")

    @staticmethod
    def add_job_log(job_id: int, message: str):
        """Appends a log entry to a job's log history."""
        with SessionLocal() as db_session:
            try:
                log_entry = models.JobLog(job_id=job_id, message=message)
                db_session.add(log_entry)
                db_session.commit()
            except Exception as e:
                db_session.rollback()
                logger.debug(f"JobManager.add_job_log failed for {job_id}: {e}")

    @staticmethod
    def cancel_job(job_id: int):
        """Submits a cancellation request for a pending or running job."""
        with SessionLocal() as db_session:
            try:
                job_record = db_session.get(models.Job, job_id)
                if job_record and job_record.status in ("PENDING", "RUNNING"):
                    job_record.status = "FAILED"
                    job_record.is_cancelled = True
                    job_record.error_message = "Cancelled by user"
                    job_record.completed_at = datetime.now(timezone.utc)
                    db_session.commit()
            except Exception as e:
                db_session.rollback()
                logger.debug(f"JobManager.cancel_job failed for {job_id}: {e}")

    @staticmethod
    def is_cancelled(job_id: int) -> bool:
        """Checks if a job has been cancelled by the user."""
        with SessionLocal() as db_session:
            try:
                job_record = db_session.get(models.Job, job_id)
                return bool(job_record and job_record.is_cancelled)
            except Exception:
                return False


class ScannerService:
    """Handles recursive filesystem discovery and content indexing."""

    def __init__(self):
        self.is_running: bool = False
        self.is_hashing: bool = False
        self.last_run_time: Optional[datetime] = None

        # Thread-safe Metrics
        self.files_processed: int = 0
        self.files_hashed: int = 0
        self.files_new: int = 0
        self.files_modified: int = 0
        self.files_missing: int = 0
        self.total_files_found: int = 0
        self.bytes_hashed: int = 0
        self.start_time: float = 0.0
        self.is_throttled: bool = False
        self.current_path: str = ""
        self._metrics_lock = threading.Lock()
        self._current_iowait: float = 0.0

        # Background Monitors
        self._throttle_thread = threading.Thread(
            target=self._monitor_iowait, daemon=True
        )
        self._throttle_thread.start()

    def _monitor_iowait(self):
        """Polls system I/O pressure to enable dynamic back-off."""
        # Runs for the life of the process as a daemon thread; returning when
        # the flags are clear would end monitoring permanently, because the
        # thread is started exactly once in __init__ (while both flags are
        # still False).
        while True:
            if self.is_running or self.is_hashing:
                try:
                    cpu_times = psutil.cpu_times_percent(interval=0.1)
                    iowait_value = getattr(cpu_times, "iowait", 0.0)
                    with self._metrics_lock:
                        self.is_throttled = iowait_value > 5.0
                        self._current_iowait = iowait_value
                except Exception:
                    pass
                # Use short sleeps in a loop so we notice when the flags change
                for _ in range(20):
                    if not (self.is_running or self.is_hashing):
                        break
                    time.sleep(0.1)
            else:
                time.sleep(0.5)

    def _set_process_priority(self, level: str):
        """Adjusts the CPU and I/O priority of the current process."""
        try:
            p = psutil.Process(os.getpid())
            if level == "background":
                if hasattr(p, "ionice"):
                    p.ionice(
                        psutil.IOPRIO_CLASS_IDLE  # ty: ignore[unresolved-attribute]
                    )
                p.nice(19)
            else:
                if hasattr(p, "ionice"):
                    p.ionice(psutil.IOPRIO_CLASS_BE)  # ty: ignore[unresolved-attribute]
                p.nice(0)
        except Exception:
            pass

    def compute_sha256(
        self, file_path: str, job_id: Optional[int] = None
    ) -> Optional[str]:
        """Calculates the SHA-256 hash of a file with dynamic throttling."""
        try:
            sha256_hash = hashlib.sha256()
            with open(file_path, "rb") as f:
                for byte_block in iter(lambda: f.read(1024 * 1024), b""):
                    if job_id is not None and JobManager.is_cancelled(job_id):
                        return None

                    # Dynamic Throttling: If system I/O is high, sleep between blocks
                    if self.is_throttled:
                        time.sleep(0.1)

                    sha256_hash.update(byte_block)
                    with self._metrics_lock:
                        self.bytes_hashed += len(byte_block)
            return sha256_hash.hexdigest()
        except OSError:
            return None

    def _format_throughput(self) -> str:
        """Calculates and formats current hashing speed."""
        elapsed_seconds = time.time() - self.start_time
        if elapsed_seconds <= 0:
            return "0 B/s"
        bytes_per_second = self.bytes_hashed / elapsed_seconds
        for unit in ["B/s", "KB/s", "MB/s", "GB/s"]:
            if bytes_per_second < 1024:
                return f"{bytes_per_second:.1f} {unit}"
            bytes_per_second /= 1024
        return f"{bytes_per_second:.1f} TB/s"

    def scan_sources(self, db_session: Session, job_id: Optional[int] = None):
        """Executes Phase 1: Metadata discovery and index synchronization."""
        if self.is_running:
            return
        self.is_running = True

        if job_id is not None:
            JobManager.start_job(job_id)
            JobManager.update_job(job_id, 0.0, "Starting system scan...")
            JobManager.add_job_log(job_id, "Starting system scan")

        self._set_process_priority("normal")
        with self._metrics_lock:
            self.files_processed = 0
            self.files_new = 0
            self.files_modified = 0
            self.files_missing = 0
            self.total_files_found = 0

        try:
            from app.api.common import get_exclusion_spec, get_source_roots

            exclusion_spec = get_exclusion_spec(db_session)
            source_roots = get_source_roots(db_session)
            tracking_rules = db_session.query(models.TrackedSource).all()
            tracking_map = {rule.path: rule.action for rule in tracking_rules}

            def resolve_tracking(absolute_path: str) -> bool:
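                # Example: given rules {"/data": "include", "/data/tmp": "exclude"},
                # "/data/tmp/x" matches both prefixes and the longer rule path
                # wins, so the file is treated as excluded.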
                # 1. User Tracking Policy (Explicit overrides)
                applicable_rules = []
                for rule_path, action in tracking_map.items():
                    if absolute_path == rule_path or absolute_path.startswith(
                        rule_path + "/"
                    ):
                        applicable_rules.append((len(rule_path), action))

                if applicable_rules:
                    # Most specific rule wins
                    applicable_rules.sort(key=lambda x: x[0], reverse=True)
                    return applicable_rules[0][1] == "exclude"

                # 2. Global Exclusions (Default automatic behavior)
                if exclusion_spec and exclusion_spec.match_file(absolute_path):
                    return True

                return False

            current_timestamp = datetime.now(timezone.utc)
            BATCH_SIZE = 1000
            pending_metadata: List[Dict[str, Any]] = []

            # Initialize Phase 2 in background
            hashing_thread = threading.Thread(target=self.run_hashing)
            hashing_thread.daemon = True
            hashing_thread.start()
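            # Phase 2 now runs concurrently: run_hashing polls the DB for
            # unhashed rows, picking them up as each batch below is committed.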

            for root_base in source_roots:
                if job_id is not None and JobManager.is_cancelled(job_id):
                    break
                if not os.path.exists(root_base):
                    continue

                if _FAST_FIND_BINARY:
                    # Fast path: GNU find -printf (metadata extracted in C)
                    metrics = {
                        "total_files_found": 0,
                        "current_path": root_base,
                    }
                    found, _ = _discover_files_fast(
                        root_base,
                        job_id,
                        BATCH_SIZE,
                        current_timestamp,
                        resolve_tracking,
                        self._sync_metadata_batch,
                        self._metrics_lock,
                        metrics,
                        db_session,
                    )
                    with self._metrics_lock:
                        self.total_files_found += found
                else:
                    # Compatibility path: Python os.walk + os.stat
                    for current_dir, _sub_dirs, file_names in os.walk(root_base):
                        if job_id is not None and JobManager.is_cancelled(job_id):
                            break

                        for name in file_names:
                            full_file_path = os.path.join(current_dir, name)
                            with self._metrics_lock:
                                self.total_files_found += 1
                                self.current_path = current_dir

                            try:
                                file_stats = os.stat(full_file_path)
                                is_ignored = resolve_tracking(full_file_path)
                                pending_metadata.append(
                                    {
                                        "path": full_file_path,
                                        "size": file_stats.st_size,
                                        "mtime": file_stats.st_mtime,
                                        "ignored": is_ignored,
                                    }
                                )
                            except OSError:
                                continue

                            if len(pending_metadata) >= BATCH_SIZE:
                                self._sync_metadata_batch(
                                    db_session, pending_metadata, current_timestamp
                                )
                                db_session.commit()
                                pending_metadata = []
                                if job_id is not None:
                                    JobManager.update_job(
                                        job_id,
                                        10.0,
                                        f"Discovered {self.total_files_found} items...",
                                    )

            if pending_metadata:
                self._sync_metadata_batch(
                    db_session, pending_metadata, current_timestamp
                )
                db_session.commit()

            # Detect files present in DB but not found during this scan
            stale_query = (
                db_session.query(models.FilesystemState)
                .filter(
                    models.FilesystemState.last_seen_timestamp < current_timestamp,
                    models.FilesystemState.is_deleted.is_(False),
                    models.FilesystemState.is_ignored.is_(False),
                )
                .yield_per(1000)
            )
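            # yield_per(1000) streams rows from the DB in chunks so the whole
            # stale set never has to be materialized in memory at once.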
            if job_id is None or not JobManager.is_cancelled(job_id):
                missing_count = 0
                for record in stale_query:
                    if not os.path.exists(record.file_path):
                        record.is_deleted = True
                        missing_count += 1
                    else:
                        record.is_deleted = False
                        record.missing_acknowledged_at = None
                        record.last_seen_timestamp = current_timestamp
                db_session.commit()
                if missing_count:
                    with self._metrics_lock:
                        self.files_missing += missing_count
                    logger.info(
                        f"Scan detected {missing_count} files missing from disk (marked as deleted)"
                    )

            if job_id is not None and not JobManager.is_cancelled(job_id):
                JobManager.add_job_log(
                    job_id,
                    f"Scan complete: {self.files_new} new, {self.files_modified} modified, {self.files_missing} missing",
                )
                JobManager.complete_job(job_id)
            self.last_run_time = current_timestamp

        except Exception as scan_error:
            logger.exception(f"Metadata discovery failed: {scan_error}")
            db_session.rollback()
            if job_id is not None:
                JobManager.fail_job(job_id, str(scan_error))
        finally:
            self.is_running = False

    def _sync_metadata_batch(
        self, db_session: Session, batch: List[Dict[str, Any]], timestamp: datetime
    ):
        """Synchronizes a batch of metadata with the database index."""
        file_paths = [file_meta["path"] for file_meta in batch]

        # Batch Fetch Existing Metadata (Chunked for SQLite limits)
        existing_records = {}
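        # Older SQLite builds cap bound parameters per statement at 999
        # (SQLITE_MAX_VARIABLE_NUMBER); chunking at 500 leaves safe headroom.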
        SQLITE_VARIABLE_LIMIT = 500
        for i in range(0, len(file_paths), SQLITE_VARIABLE_LIMIT):
            chunk = file_paths[i : i + SQLITE_VARIABLE_LIMIT]
            chunk_records = (
                db_session.query(models.FilesystemState)
                .filter(models.FilesystemState.file_path.in_(chunk))
                .all()
            )
            for record in chunk_records:
                existing_records[record.file_path] = record

        for file_meta in batch:
            record = existing_records.get(file_meta["path"])
            if not record:
                with self._metrics_lock:
                    self.files_new += 1
                db_session.add(
                    models.FilesystemState(
                        file_path=file_meta["path"],
                        size=file_meta["size"],
                        mtime=file_meta["mtime"],
                        is_ignored=file_meta["ignored"],
                        last_seen_timestamp=timestamp,
                    )
                )
            else:
                metadata_changed = (
                    record.size != file_meta["size"]
                    or record.mtime != file_meta["mtime"]
                )
                if metadata_changed:
                    record.sha256_hash = None
                    with self._metrics_lock:
                        self.files_modified += 1

                record.size = file_meta["size"]
                record.mtime = file_meta["mtime"]
                record.is_ignored = file_meta["ignored"]
                record.last_seen_timestamp = timestamp

            with self._metrics_lock:
                self.files_processed += 1

    def stop(self):
        """Signals background threads to stop and clears operational flags."""
        with self._metrics_lock:
            self.is_running = False
            self.is_hashing = False
        logger.info("Scanner service shutdown signaled.")

    def run_hashing(self):
        """Executes Phase 2: Background Content Hashing."""
        if self.is_hashing:
            return
        with self._metrics_lock:
            self.is_hashing = True

        self._set_process_priority("background")

        try:
            with SessionLocal() as db_session:
                hashing_job = JobManager.create_job(db_session, "HASH")
                JobManager.start_job(hashing_job.id)

                self.start_time = time.time()
                self.bytes_hashed = 0
                self.files_hashed = 0

                # Count total work pending for progress reporting
                total_pending = (
                    db_session.query(models.FilesystemState)
                    .filter(
                        models.FilesystemState.sha256_hash.is_(None),
                        models.FilesystemState.is_ignored.is_(False),
                        models.FilesystemState.is_deleted.is_(False),
                    )
                    .count()
                )

                # Fast hash batch size: more files per batch reduces subprocess overhead
                HASH_BATCH_SIZE = 100
                # How many files to pull from DB per iteration
                FETCH_LIMIT = HASH_BATCH_SIZE * 4
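                # Fetching four sub-batches' worth per query keeps the fast
                # path's thread pool supplied with several units of work per
                # DB round trip.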

                while self.is_hashing:
                    # Find unindexed work (exclude deleted files - they cannot be hashed)
                    hashing_targets = (
                        db_session.query(models.FilesystemState)
                        .filter(
                            models.FilesystemState.sha256_hash.is_(None),
                            models.FilesystemState.is_ignored.is_(False),
                            models.FilesystemState.is_deleted.is_(False),
                        )
                        .limit(FETCH_LIMIT)
                        .all()
                    )

                    if not hashing_targets:
                        # If we are in 'Phase 1' (discovery), wait for more files to appear
                        if self.is_running:
                            time.sleep(1)
                            continue
                        # Otherwise, we are done
                        break

                    if JobManager.is_cancelled(hashing_job.id):
                        break

                    if _FAST_HASH_BINARY:
                        # Fast path: batch files to native sha256sum/shasum
                        # Group into sub-batches of HASH_BATCH_SIZE for parallel processing
                        file_paths = [t.file_path for t in hashing_targets]
                        path_to_record = {t.file_path: t for t in hashing_targets}

                        sub_batches = [
                            file_paths[i : i + HASH_BATCH_SIZE]
                            for i in range(0, len(file_paths), HASH_BATCH_SIZE)
                        ]

                        max_workers = min(os.cpu_count() or 4, len(sub_batches))
                        with concurrent.futures.ThreadPoolExecutor(
                            max_workers=max_workers
                        ) as hashing_executor:
                            future_to_batch = {
                                hashing_executor.submit(
                                    _hash_file_batch_fast,
                                    batch,
                                    _FAST_HASH_BINARY,
                                ): batch
                                for batch in sub_batches
                            }

                            for future in concurrent.futures.as_completed(
                                future_to_batch
                            ):
                                if not self.is_hashing:
                                    break

                                batch = future_to_batch[future]
                                try:
                                    batch_results = future.result()
                                except Exception:
                                    continue

                                # Apply hashes and detect missing files ONLY for this batch
                                for file_path in batch:
                                    target_record = path_to_record.get(file_path)
                                    if not target_record:
                                        continue

                                    if file_path in batch_results:
                                        target_record.sha256_hash = batch_results[
                                            file_path
                                        ]
                                        with self._metrics_lock:
                                            self.bytes_hashed += (
                                                target_record.size or 0
                                            )
                                            self.files_hashed += 1
                                        # Report progress incrementally as files complete
                                        if self.files_hashed % 5 == 0:
                                            progress = min(
                                                99.9,
                                                (
                                                    self.files_hashed
                                                    / max(total_pending, 1)
                                                )
                                                * 100,
                                            )
                                            JobManager.update_job(
                                                hashing_job.id,
                                                progress,
                                                f"Hashed {self.files_hashed} files ({self._format_throughput()})...",
                                            )
                                    elif not os.path.exists(file_path):
                                        target_record.is_deleted = True
                                        with self._metrics_lock:
                                            self.files_missing += 1

                                # Throttle between sub-batches if I/O pressure is high
                                with self._metrics_lock:
                                    should_throttle = self.is_throttled
                                if should_throttle:
                                    time.sleep(0.5)
                    else:
                        # Compatibility path: Python hashlib via thread pool
                        max_workers = os.cpu_count() or 4
                        with concurrent.futures.ThreadPoolExecutor(
                            max_workers=max_workers
                        ) as hashing_executor:
                            future_to_file = {
                                hashing_executor.submit(
                                    self.compute_sha256,
                                    target.file_path,
                                    hashing_job.id,
                                ): target
                                for target in hashing_targets
                            }

                            for future in concurrent.futures.as_completed(
                                future_to_file
                            ):
                                if not self.is_hashing:
                                    break

                                target_record = future_to_file[future]
                                try:
                                    computed_hash = future.result()
                                except Exception:
                                    continue

                                if computed_hash:
                                    target_record.sha256_hash = computed_hash
                                    with self._metrics_lock:
                                        self.files_hashed += 1
                                elif not os.path.exists(target_record.file_path):
                                    target_record.is_deleted = True
                                    with self._metrics_lock:
                                        self.files_missing += 1

                                if self.files_hashed % 5 == 0:
                                    progress = min(
                                        99.9,
                                        (self.files_hashed / max(total_pending, 1))
                                        * 100,
                                    )
                                    JobManager.update_job(
                                        hashing_job.id,
                                        progress,
                                        f"Hashed {self.files_hashed} files ({self._format_throughput()})...",
                                    )

                    # Commit batch
                    try:
                        db_session.commit()
                    except Exception:
                        db_session.rollback()
                        break

                if not JobManager.is_cancelled(hashing_job.id) and self.is_hashing:
                    JobManager.add_job_log(
                        hashing_job.id,
                        f"Hashing complete: {self.files_hashed} files indexed",
                    )
                    JobManager.complete_job(hashing_job.id)

        except ObjectDeletedError:
            # Exit gracefully - another process cancelled this job
            logger.debug(
                "Background hashing aborted: Job was deleted by another process"
            )
        except Exception as e:
            logger.error(f"Background hashing failed: {e}")
            # Try to report failure, but don't blow up if JobManager fails too
            try:
                if "hashing_job" in locals():
                    JobManager.fail_job(hashing_job.id, str(e))
            except Exception:
                pass
        finally:
            with self._metrics_lock:
                self.is_hashing = False


scanner_manager = ScannerService()