fast discovery was also slower than os.walk
Continuous Integration / e2e-tests (push) Successful in 5m18s
Continuous Integration / backend-tests (push) Successful in 38s
Continuous Integration / frontend-check (push) Successful in 15s

This commit is contained in:
2026-05-05 19:36:51 -04:00
parent 4d4d9fa1e0
commit 9e51247564
2 changed files with 34 additions and 217 deletions
+1 -179
View File
@@ -1,12 +1,10 @@
import concurrent.futures
import hashlib
import os
import shutil
import subprocess
import threading
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional
import psutil
from loguru import logger
@@ -16,161 +14,6 @@ from sqlalchemy.orm.exc import ObjectDeletedError, StaleDataError
from app.db import models
from app.db.database import SessionLocal
# Fast file discovery via `find -printf` (GNU find or compatible).
# Detected once at import time; falls back to os.walk if unavailable.
# Holds the resolved binary path once detection runs, or None when no
# compatible `find` is available (callers then take the os.walk path).
_FAST_FIND_BINARY: Optional[str] = None
def _detect_fast_find() -> Optional[str]:
"""Check if a `find` binary with `-printf` support is available.
Tries `gfind` (GNU find via Homebrew on macOS) first, then `find`.
Returns the binary path if `-printf` works, otherwise ``None``.
"""
for candidate in ("gfind", "find"):
binary = shutil.which(candidate)
if binary is None:
continue
try:
result = subprocess.run(
[binary, "/tmp", "-maxdepth", "0", "-printf", "%f\n"],
capture_output=True,
timeout=5,
)
if result.returncode == 0 and result.stdout.strip() == b"tmp":
return binary
except Exception:
continue
return None
def _init_fast_features() -> Optional[str]:
    """Run fast-find detection once, cache it in the module global, log the outcome."""
    global _FAST_FIND_BINARY
    detected = _detect_fast_find()
    _FAST_FIND_BINARY = detected
    if detected:
        logger.info(f"Fast file discovery enabled: using {detected} -printf")
    else:
        logger.info("Fast file discovery unavailable: falling back to os.walk")
    return detected
_FAST_FIND_BINARY = _init_fast_features()
def _discover_files_fast(
    root_base: str,
    job_id: Optional[int],
    batch_size: int,
    current_timestamp,
    resolve_tracking,
    sync_metadata_batch,
    metrics_lock,
    metrics,
    db_session: Session,
) -> Tuple[int, int]:
    """Walk a tree using `find -printf` for fast metadata extraction.

    Streams output line-by-line via subprocess.Popen so progress updates
    appear as files are discovered instead of waiting for find to finish.

    Args:
        root_base: Directory tree to scan.
        job_id: Job to check for cancellation / report progress to, or ``None``.
        batch_size: Number of pending records that triggers a DB flush.
        current_timestamp: Scan timestamp forwarded to ``sync_metadata_batch``.
        resolve_tracking: Callable mapping a file path to its ignored flag.
        sync_metadata_batch: Callable persisting a batch of metadata dicts.
        metrics_lock: Lock guarding ``metrics``.
        metrics: Shared dict updated with running counters for observers.
        db_session: Session used for batch persistence and commits.

    Returns:
        Tuple of (files_found, files_batched) counts.
    """
    total_files_found = 0
    files_batched = 0
    pending_metadata: List[Dict[str, Any]] = []

    find_binary = _FAST_FIND_BINARY
    if find_binary is None:
        logger.warning(
            "Fast file discovery requested but no compatible `find` binary found"
        )
        return 0, 0

    # -printf format: path\tsize\tmtime (tab-separated; split from right for safety)
    cmd = [
        find_binary,
        root_base,
        "-type",
        "f",
        "-printf",
        "%p\t%s\t%T@\n",
    ]
    try:
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        if proc.stdout is None:
            logger.error(
                f"Fast file discovery failed: could not open stdout for {root_base}"
            )
            return 0, 0
    except Exception as e:
        logger.error(f"Fast file discovery failed for {root_base}: {e}")
        return 0, 0

    # try/finally ensures the pipe is closed and the child reaped even if a
    # batch flush or callback raises mid-stream; without it an exception here
    # leaks the file descriptor and leaves a zombie `find` process.
    try:
        # Stream output line by line (tab-separated: path\tsize\tmtime)
        for line in iter(proc.stdout.readline, b""):
            if job_id is not None and JobManager.is_cancelled(job_id):
                break
            if not line.strip():
                continue
            # Split from right: mtime and size are always numeric
            parts = line.split(b"\t")
            if len(parts) < 3:
                continue
            # First n-2 parts may be path components (tabs in filename are rare)
            full_file_path = b"\t".join(parts[:-2]).decode("utf-8", errors="replace")
            try:
                file_size = int(parts[-2])
                file_mtime = float(parts[-1])
            except (ValueError, IndexError):
                continue

            total_files_found += 1
            with metrics_lock:
                metrics["total_files_found"] = total_files_found
                metrics["current_path"] = os.path.dirname(full_file_path)

            is_ignored = resolve_tracking(full_file_path)
            pending_metadata.append(
                {
                    "path": full_file_path,
                    "size": file_size,
                    "mtime": file_mtime,
                    "ignored": is_ignored,
                }
            )

            if len(pending_metadata) >= batch_size:
                sync_metadata_batch(db_session, pending_metadata, current_timestamp)
                db_session.commit()
                files_batched += len(pending_metadata)
                pending_metadata = []
                if job_id is not None:
                    JobManager.update_job(
                        job_id,
                        10.0,
                        f"Discovered {total_files_found} items...",
                    )
    finally:
        # Closing the read end delivers EPIPE/SIGPIPE to `find` on its next
        # write, so wait() cannot hang on an abandoned child.
        proc.stdout.close()
        proc.wait()

    # Flush remaining batch
    if pending_metadata:
        sync_metadata_batch(db_session, pending_metadata, current_timestamp)
        db_session.commit()
        files_batched += len(pending_metadata)

    return total_files_found, files_batched
class JobManager:
"""Manages operational job states and persistence with high resilience for background threads."""
@@ -439,27 +282,6 @@ class ScannerService:
if not os.path.exists(root_base):
continue
if _FAST_FIND_BINARY:
# Fast path: GNU find -printf (metadata extracted in C)
metrics = {
"total_files_found": 0,
"current_path": root_base,
}
found, _ = _discover_files_fast(
root_base,
job_id,
BATCH_SIZE,
current_timestamp,
resolve_tracking,
self._sync_metadata_batch,
self._metrics_lock,
metrics,
db_session,
)
with self._metrics_lock:
self.total_files_found += found
else:
# Compatibility path: Python os.walk + os.stat
for current_dir, _sub_dirs, file_names in os.walk(root_base):
if job_id is not None and JobManager.is_cancelled(job_id):
break
-5
View File
@@ -114,9 +114,6 @@ def test_scan_sources_mocked(db_session, mocker):
"""Tests the discovery scan with mocked filesystem."""
scanner = ScannerService()
# Disable fast find so the test uses the os.walk fallback path
mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
# Mock settings
mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
@@ -146,7 +143,6 @@ def test_missing_file_marked_deleted_at_end_of_scan(db_session, mocker):
"""Tests that files not seen during a scan are marked as deleted."""
scanner = ScannerService()
mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
mocker.patch("os.walk", return_value=[])
@@ -181,7 +177,6 @@ def test_existing_file_not_marked_deleted(db_session, mocker):
"""Tests that files found during scan retain is_deleted=False."""
scanner = ScannerService()
mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
mocker.patch("os.path.exists", return_value=True)