fast discovery was also slower than os.walk

Remove the `find -printf` fast path and its test shims: in practice the streaming subprocess approach lost to the plain os.walk + os.stat loop, so the former compatibility path is now the only path.
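For context, this is the shape of a timing harness for that comparison (a minimal sketch, not the benchmark actually run; it assumes a GNU find on PATH and a sample tree at the hypothetical /srv/data):

import os
import subprocess
import time

ROOT = "/srv/data"  # hypothetical sample tree; substitute a real directory

def walk_discover(root: str) -> int:
    count = 0
    for current_dir, _dirs, names in os.walk(root):
        for name in names:
            try:
                st = os.stat(os.path.join(current_dir, name))
                _ = (st.st_size, st.st_mtime)  # same metadata the scanner records
                count += 1
            except OSError:
                continue
    return count

def find_discover(root: str) -> int:
    # One find(1) pass emits path, size, and mtime per line (GNU -printf).
    out = subprocess.run(
        ["find", root, "-type", "f", "-printf", "%p\t%s\t%T@\n"],
        capture_output=True,
        check=True,
    ).stdout
    return sum(1 for line in out.splitlines() if line.strip())

if __name__ == "__main__":
    for discover in (walk_discover, find_discover):
        start = time.perf_counter()
        total = discover(ROOT)
        print(f"{discover.__name__}: {total} files in {time.perf_counter() - start:.2f}s")

Both variants collect the same per-file metadata (path, size, mtime), so the comparison is purely traversal-plus-stat overhead versus spawning find and parsing its output.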
Continuous Integration / e2e-tests (push) Successful in 5m18s
Continuous Integration / backend-tests (push) Successful in 38s
Continuous Integration / frontend-check (push) Successful in 15s

2026-05-05 19:36:51 -04:00
parent 4d4d9fa1e0
commit 9e51247564
2 changed files with 34 additions and 217 deletions
+34 -212
View File
@@ -1,12 +1,10 @@
 import concurrent.futures
 import hashlib
 import os
-import shutil
-import subprocess
 import threading
 import time
 from datetime import datetime, timezone
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 
 import psutil
 from loguru import logger
@@ -16,161 +14,6 @@ from sqlalchemy.orm.exc import ObjectDeletedError, StaleDataError
 from app.db import models
 from app.db.database import SessionLocal
 
-# Fast file discovery via `find -printf` (GNU find or compatible).
-# Detected once at import time; falls back to os.walk if unavailable.
-_FAST_FIND_BINARY: Optional[str] = None
-
-
-def _detect_fast_find() -> Optional[str]:
-    """Check if a `find` binary with `-printf` support is available.
-
-    Tries `gfind` (GNU find via Homebrew on macOS) first, then `find`.
-    Returns the binary path if `-printf` works, otherwise ``None``.
-    """
-    for candidate in ("gfind", "find"):
-        binary = shutil.which(candidate)
-        if binary is None:
-            continue
-        try:
-            result = subprocess.run(
-                [binary, "/tmp", "-maxdepth", "0", "-printf", "%f\n"],
-                capture_output=True,
-                timeout=5,
-            )
-            if result.returncode == 0 and result.stdout.strip() == b"tmp":
-                return binary
-        except Exception:
-            continue
-    return None
-
-
-def _init_fast_features() -> Optional[str]:
-    global _FAST_FIND_BINARY
-    _FAST_FIND_BINARY = _detect_fast_find()
-    if _FAST_FIND_BINARY:
-        logger.info(f"Fast file discovery enabled: using {_FAST_FIND_BINARY} -printf")
-    else:
-        logger.info("Fast file discovery unavailable: falling back to os.walk")
-    return _FAST_FIND_BINARY
-
-
-_FAST_FIND_BINARY = _init_fast_features()
-
-
-def _discover_files_fast(
-    root_base: str,
-    job_id: Optional[int],
-    batch_size: int,
-    current_timestamp,
-    resolve_tracking,
-    sync_metadata_batch,
-    metrics_lock,
-    metrics,
-    db_session: Session,
-) -> Tuple[int, int]:
-    """Walk a tree using `find -printf` for fast metadata extraction.
-
-    Streams output line-by-line via subprocess.Popen so progress updates
-    appear as files are discovered instead of waiting for find to finish.
-    Returns (files_found, files_batched) counts.
-    """
-    total_files_found = 0
-    files_batched = 0
-    pending_metadata: List[Dict[str, Any]] = []
-
-    # -printf format: path\tsize\tmtime (tab-separated; split from right for safety)
-    find_binary = _FAST_FIND_BINARY
-    if find_binary is None:
-        logger.warning(
-            "Fast file discovery requested but no compatible `find` binary found"
-        )
-        return 0, 0
-
-    cmd = [
-        find_binary,
-        root_base,
-        "-type",
-        "f",
-        "-printf",
-        "%p\t%s\t%T@\n",
-    ]
-    try:
-        proc = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.DEVNULL,
-        )
-        if proc.stdout is None:
-            logger.error(
-                f"Fast file discovery failed: could not open stdout for {root_base}"
-            )
-            return 0, 0
-    except Exception as e:
-        logger.error(f"Fast file discovery failed for {root_base}: {e}")
-        return 0, 0
-
-    # Stream output line by line (tab-separated: path\tsize\tmtime)
-    for line in iter(proc.stdout.readline, b""):
-        if job_id is not None and JobManager.is_cancelled(job_id):
-            break
-        if not line.strip():
-            continue
-        # Split from right: mtime and size are always numeric
-        parts = line.split(b"\t")
-        if len(parts) < 3:
-            continue
-        # First n-2 parts may be path components (tabs in filename are rare)
-        full_file_path = b"\t".join(parts[:-2]).decode("utf-8", errors="replace")
-        try:
-            file_size = int(parts[-2])
-            file_mtime = float(parts[-1])
-        except (ValueError, IndexError):
-            continue
-
-        total_files_found += 1
-        with metrics_lock:
-            metrics["total_files_found"] = total_files_found
-            metrics["current_path"] = os.path.dirname(full_file_path)
-
-        is_ignored = resolve_tracking(full_file_path)
-        pending_metadata.append(
-            {
-                "path": full_file_path,
-                "size": file_size,
-                "mtime": file_mtime,
-                "ignored": is_ignored,
-            }
-        )
-
-        if len(pending_metadata) >= batch_size:
-            sync_metadata_batch(db_session, pending_metadata, current_timestamp)
-            db_session.commit()
-            files_batched += len(pending_metadata)
-            pending_metadata = []
-            if job_id is not None:
-                JobManager.update_job(
-                    job_id,
-                    10.0,
-                    f"Discovered {total_files_found} items...",
-                )
-
-    proc.stdout.close()
-    proc.wait()
-
-    # Flush remaining batch
-    if pending_metadata:
-        sync_metadata_batch(db_session, pending_metadata, current_timestamp)
-        db_session.commit()
-        files_batched += len(pending_metadata)
-
-    return total_files_found, files_batched
-
-
 class JobManager:
     """Manages operational job states and persistence with high resilience for background threads."""
@@ -439,63 +282,42 @@ class ScannerService:
             if not os.path.exists(root_base):
                 continue
 
-            if _FAST_FIND_BINARY:
-                # Fast path: GNU find -printf (metadata extracted in C)
-                metrics = {
-                    "total_files_found": 0,
-                    "current_path": root_base,
-                }
-                found, _ = _discover_files_fast(
-                    root_base,
-                    job_id,
-                    BATCH_SIZE,
-                    current_timestamp,
-                    resolve_tracking,
-                    self._sync_metadata_batch,
-                    self._metrics_lock,
-                    metrics,
-                    db_session,
-                )
-                with self._metrics_lock:
-                    self.total_files_found += found
-            else:
-                # Compatibility path: Python os.walk + os.stat
-                for current_dir, _sub_dirs, file_names in os.walk(root_base):
-                    if job_id is not None and JobManager.is_cancelled(job_id):
-                        break
-                    for name in file_names:
-                        full_file_path = os.path.join(current_dir, name)
-                        with self._metrics_lock:
-                            self.total_files_found += 1
-                            self.current_path = current_dir
-                        try:
-                            file_stats = os.stat(full_file_path)
-                            is_ignored = resolve_tracking(full_file_path)
-                            pending_metadata.append(
-                                {
-                                    "path": full_file_path,
-                                    "size": file_stats.st_size,
-                                    "mtime": file_stats.st_mtime,
-                                    "ignored": is_ignored,
-                                }
-                            )
-                        except (OSError, FileNotFoundError):
-                            continue
-                        if len(pending_metadata) >= BATCH_SIZE:
-                            self._sync_metadata_batch(
-                                db_session, pending_metadata, current_timestamp
-                            )
-                            db_session.commit()
-                            pending_metadata = []
-                            if job_id is not None:
-                                JobManager.update_job(
-                                    job_id,
-                                    10.0,
-                                    f"Discovered {self.total_files_found} items...",
-                                )
+            for current_dir, _sub_dirs, file_names in os.walk(root_base):
+                if job_id is not None and JobManager.is_cancelled(job_id):
+                    break
+                for name in file_names:
+                    full_file_path = os.path.join(current_dir, name)
+                    with self._metrics_lock:
+                        self.total_files_found += 1
+                        self.current_path = current_dir
+                    try:
+                        file_stats = os.stat(full_file_path)
+                        is_ignored = resolve_tracking(full_file_path)
+                        pending_metadata.append(
+                            {
+                                "path": full_file_path,
+                                "size": file_stats.st_size,
+                                "mtime": file_stats.st_mtime,
+                                "ignored": is_ignored,
+                            }
+                        )
+                    except (OSError, FileNotFoundError):
+                        continue
+                    if len(pending_metadata) >= BATCH_SIZE:
+                        self._sync_metadata_batch(
+                            db_session, pending_metadata, current_timestamp
+                        )
+                        db_session.commit()
+                        pending_metadata = []
+                        if job_id is not None:
+                            JobManager.update_job(
+                                job_id,
+                                10.0,
+                                f"Discovered {self.total_files_found} items...",
+                            )
 
             if pending_metadata:
                 self._sync_metadata_batch(
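For the record, the tab-splitting trick the removed _discover_files_fast relied on, distilled into a standalone sketch (the sample line below is made up):

# Parse one line of `find -printf "%p\t%s\t%T@\n"` output. Size and mtime
# are always numeric, so splitting from the right leaves any tabs that
# happen to occur in the path itself intact.
def parse_find_line(line: bytes):
    parts = line.rstrip(b"\n").split(b"\t")
    if len(parts) < 3:
        return None  # malformed line
    path = b"\t".join(parts[:-2]).decode("utf-8", errors="replace")
    try:
        return path, int(parts[-2]), float(parts[-1])
    except ValueError:
        return None

# A path containing a tab still round-trips correctly:
assert parse_find_line(b"/data/a\tb.txt\t42\t1700000000.0\n") == (
    "/data/a\tb.txt", 42, 1700000000.0,
)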
-5
View File
@@ -114,9 +114,6 @@ def test_scan_sources_mocked(db_session, mocker):
     """Tests the discovery scan with mocked filesystem."""
     scanner = ScannerService()
 
-    # Disable fast find so the test uses the os.walk fallback path
-    mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
-
     # Mock settings
     mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
     mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
@@ -146,7 +143,6 @@ def test_missing_file_marked_deleted_at_end_of_scan(db_session, mocker):
     """Tests that files not seen during a scan are marked as deleted."""
     scanner = ScannerService()
-    mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
     mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
     mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
     mocker.patch("os.walk", return_value=[])
@@ -181,7 +177,6 @@ def test_existing_file_not_marked_deleted(db_session, mocker):
    """Tests that files found during scan retain is_deleted=False."""
     scanner = ScannerService()
-    mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
     mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
     mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
     mocker.patch("os.path.exists", return_value=True)