fast discovery was also slower than os.walk

Remove the `find -printf` fast path and its test shims: in practice the streaming subprocess approach lost to the plain os.walk + os.stat loop, so the former compatibility path is now the only path.
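For context, this is the shape of a timing harness for that comparison (a minimal sketch, not the benchmark actually run; it assumes a GNU find on PATH and a sample tree at the hypothetical /srv/data):

import os
import subprocess
import time

ROOT = "/srv/data"  # hypothetical sample tree; substitute a real directory

def walk_discover(root: str) -> int:
    count = 0
    for current_dir, _dirs, names in os.walk(root):
        for name in names:
            try:
                st = os.stat(os.path.join(current_dir, name))
                _ = (st.st_size, st.st_mtime)  # same metadata the scanner records
                count += 1
            except OSError:
                continue
    return count

def find_discover(root: str) -> int:
    # One find(1) pass emits path, size, and mtime per line (GNU -printf).
    out = subprocess.run(
        ["find", root, "-type", "f", "-printf", "%p\t%s\t%T@\n"],
        capture_output=True,
        check=True,
    ).stdout
    return sum(1 for line in out.splitlines() if line.strip())

if __name__ == "__main__":
    for discover in (walk_discover, find_discover):
        start = time.perf_counter()
        total = discover(ROOT)
        print(f"{discover.__name__}: {total} files in {time.perf_counter() - start:.2f}s")

Both variants collect the same per-file metadata (path, size, mtime), so the comparison is purely traversal-plus-stat overhead versus spawning find and parsing its output.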
Continuous Integration / e2e-tests (push) Successful in 5m18s
Continuous Integration / backend-tests (push) Successful in 38s
Continuous Integration / frontend-check (push) Successful in 15s

2026-05-05 19:36:51 -04:00
parent 4d4d9fa1e0
commit 9e51247564
2 changed files with 34 additions and 217 deletions
+34 -212
View File
@@ -1,12 +1,10 @@
 import concurrent.futures
 import hashlib
 import os
-import shutil
-import subprocess
 import threading
 import time
 from datetime import datetime, timezone
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 
 import psutil
 from loguru import logger
@@ -16,161 +14,6 @@ from sqlalchemy.orm.exc import ObjectDeletedError, StaleDataError
 from app.db import models
 from app.db.database import SessionLocal
 
-# Fast file discovery via `find -printf` (GNU find or compatible).
-# Detected once at import time; falls back to os.walk if unavailable.
-_FAST_FIND_BINARY: Optional[str] = None
-
-
-def _detect_fast_find() -> Optional[str]:
-    """Check if a `find` binary with `-printf` support is available.
-
-    Tries `gfind` (GNU find via Homebrew on macOS) first, then `find`.
-    Returns the binary path if `-printf` works, otherwise ``None``.
-    """
-    for candidate in ("gfind", "find"):
-        binary = shutil.which(candidate)
-        if binary is None:
-            continue
-        try:
-            result = subprocess.run(
-                [binary, "/tmp", "-maxdepth", "0", "-printf", "%f\n"],
-                capture_output=True,
-                timeout=5,
-            )
-            if result.returncode == 0 and result.stdout.strip() == b"tmp":
-                return binary
-        except Exception:
-            continue
-    return None
-
-
-def _init_fast_features() -> Optional[str]:
-    global _FAST_FIND_BINARY
-    _FAST_FIND_BINARY = _detect_fast_find()
-    if _FAST_FIND_BINARY:
-        logger.info(f"Fast file discovery enabled: using {_FAST_FIND_BINARY} -printf")
-    else:
-        logger.info("Fast file discovery unavailable: falling back to os.walk")
-    return _FAST_FIND_BINARY
-
-
-_FAST_FIND_BINARY = _init_fast_features()
-
-
-def _discover_files_fast(
-    root_base: str,
-    job_id: Optional[int],
-    batch_size: int,
-    current_timestamp,
-    resolve_tracking,
-    sync_metadata_batch,
-    metrics_lock,
-    metrics,
-    db_session: Session,
-) -> Tuple[int, int]:
-    """Walk a tree using `find -printf` for fast metadata extraction.
-
-    Streams output line-by-line via subprocess.Popen so progress updates
-    appear as files are discovered instead of waiting for find to finish.
-    Returns (files_found, files_batched) counts.
-    """
-    total_files_found = 0
-    files_batched = 0
-    pending_metadata: List[Dict[str, Any]] = []
-
-    # -printf format: path\tsize\tmtime (tab-separated; split from right for safety)
-    find_binary = _FAST_FIND_BINARY
-    if find_binary is None:
-        logger.warning(
-            "Fast file discovery requested but no compatible `find` binary found"
-        )
-        return 0, 0
-
-    cmd = [
-        find_binary,
-        root_base,
-        "-type",
-        "f",
-        "-printf",
-        "%p\t%s\t%T@\n",
-    ]
-    try:
-        proc = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.DEVNULL,
-        )
-        if proc.stdout is None:
-            logger.error(
-                f"Fast file discovery failed: could not open stdout for {root_base}"
-            )
-            return 0, 0
-    except Exception as e:
-        logger.error(f"Fast file discovery failed for {root_base}: {e}")
-        return 0, 0
-
-    # Stream output line by line (tab-separated: path\tsize\tmtime)
-    for line in iter(proc.stdout.readline, b""):
-        if job_id is not None and JobManager.is_cancelled(job_id):
-            break
-        if not line.strip():
-            continue
-        # Split from right: mtime and size are always numeric
-        parts = line.split(b"\t")
-        if len(parts) < 3:
-            continue
-        # First n-2 parts may be path components (tabs in filename are rare)
-        full_file_path = b"\t".join(parts[:-2]).decode("utf-8", errors="replace")
-        try:
-            file_size = int(parts[-2])
-            file_mtime = float(parts[-1])
-        except (ValueError, IndexError):
-            continue
-
-        total_files_found += 1
-        with metrics_lock:
-            metrics["total_files_found"] = total_files_found
-            metrics["current_path"] = os.path.dirname(full_file_path)
-
-        is_ignored = resolve_tracking(full_file_path)
-        pending_metadata.append(
-            {
-                "path": full_file_path,
-                "size": file_size,
-                "mtime": file_mtime,
-                "ignored": is_ignored,
-            }
-        )
-
-        if len(pending_metadata) >= batch_size:
-            sync_metadata_batch(db_session, pending_metadata, current_timestamp)
-            db_session.commit()
-            files_batched += len(pending_metadata)
-            pending_metadata = []
-            if job_id is not None:
-                JobManager.update_job(
-                    job_id,
-                    10.0,
-                    f"Discovered {total_files_found} items...",
-                )
-
-    proc.stdout.close()
-    proc.wait()
-
-    # Flush remaining batch
-    if pending_metadata:
-        sync_metadata_batch(db_session, pending_metadata, current_timestamp)
-        db_session.commit()
-        files_batched += len(pending_metadata)
-
-    return total_files_found, files_batched
-
-
 class JobManager:
     """Manages operational job states and persistence with high resilience for background threads."""
@@ -439,63 +282,42 @@ class ScannerService:
             if not os.path.exists(root_base):
                 continue
 
-            if _FAST_FIND_BINARY:
-                # Fast path: GNU find -printf (metadata extracted in C)
-                metrics = {
-                    "total_files_found": 0,
-                    "current_path": root_base,
-                }
-                found, _ = _discover_files_fast(
-                    root_base,
-                    job_id,
-                    BATCH_SIZE,
-                    current_timestamp,
-                    resolve_tracking,
-                    self._sync_metadata_batch,
-                    self._metrics_lock,
-                    metrics,
-                    db_session,
-                )
-                with self._metrics_lock:
-                    self.total_files_found += found
-            else:
-                # Compatibility path: Python os.walk + os.stat
-                for current_dir, _sub_dirs, file_names in os.walk(root_base):
-                    if job_id is not None and JobManager.is_cancelled(job_id):
-                        break
-                    for name in file_names:
-                        full_file_path = os.path.join(current_dir, name)
-                        with self._metrics_lock:
-                            self.total_files_found += 1
-                            self.current_path = current_dir
-                        try:
-                            file_stats = os.stat(full_file_path)
-                            is_ignored = resolve_tracking(full_file_path)
-                            pending_metadata.append(
-                                {
-                                    "path": full_file_path,
-                                    "size": file_stats.st_size,
-                                    "mtime": file_stats.st_mtime,
-                                    "ignored": is_ignored,
-                                }
-                            )
-                        except (OSError, FileNotFoundError):
-                            continue
-                        if len(pending_metadata) >= BATCH_SIZE:
-                            self._sync_metadata_batch(
-                                db_session, pending_metadata, current_timestamp
-                            )
-                            db_session.commit()
-                            pending_metadata = []
-                            if job_id is not None:
-                                JobManager.update_job(
-                                    job_id,
-                                    10.0,
-                                    f"Discovered {self.total_files_found} items...",
-                                )
+            for current_dir, _sub_dirs, file_names in os.walk(root_base):
+                if job_id is not None and JobManager.is_cancelled(job_id):
+                    break
+                for name in file_names:
+                    full_file_path = os.path.join(current_dir, name)
+                    with self._metrics_lock:
+                        self.total_files_found += 1
+                        self.current_path = current_dir
+                    try:
+                        file_stats = os.stat(full_file_path)
+                        is_ignored = resolve_tracking(full_file_path)
+                        pending_metadata.append(
+                            {
+                                "path": full_file_path,
+                                "size": file_stats.st_size,
+                                "mtime": file_stats.st_mtime,
+                                "ignored": is_ignored,
+                            }
+                        )
+                    except (OSError, FileNotFoundError):
+                        continue
+                    if len(pending_metadata) >= BATCH_SIZE:
+                        self._sync_metadata_batch(
+                            db_session, pending_metadata, current_timestamp
+                        )
+                        db_session.commit()
+                        pending_metadata = []
+                        if job_id is not None:
+                            JobManager.update_job(
+                                job_id,
+                                10.0,
+                                f"Discovered {self.total_files_found} items...",
+                            )
 
             if pending_metadata:
                 self._sync_metadata_batch(
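For the record, the tab-splitting trick the removed _discover_files_fast relied on, distilled into a standalone sketch (the sample line below is made up):

# Parse one line of `find -printf "%p\t%s\t%T@\n"` output. Size and mtime
# are always numeric, so splitting from the right leaves any tabs that
# happen to occur in the path itself intact.
def parse_find_line(line: bytes):
    parts = line.rstrip(b"\n").split(b"\t")
    if len(parts) < 3:
        return None  # malformed line
    path = b"\t".join(parts[:-2]).decode("utf-8", errors="replace")
    try:
        return path, int(parts[-2]), float(parts[-1])
    except ValueError:
        return None

# A path containing a tab still round-trips correctly:
assert parse_find_line(b"/data/a\tb.txt\t42\t1700000000.0\n") == (
    "/data/a\tb.txt", 42, 1700000000.0,
)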
-5
View File
@@ -114,9 +114,6 @@ def test_scan_sources_mocked(db_session, mocker):
     """Tests the discovery scan with mocked filesystem."""
     scanner = ScannerService()
 
-    # Disable fast find so the test uses the os.walk fallback path
-    mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
-
     # Mock settings
     mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
     mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
@@ -146,7 +143,6 @@ def test_missing_file_marked_deleted_at_end_of_scan(db_session, mocker):
     """Tests that files not seen during a scan are marked as deleted."""
     scanner = ScannerService()
-    mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
     mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
     mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
     mocker.patch("os.walk", return_value=[])
@@ -181,7 +177,6 @@ def test_existing_file_not_marked_deleted(db_session, mocker):
    """Tests that files found during scan retain is_deleted=False."""
     scanner = ScannerService()
-    mocker.patch("app.services.scanner._FAST_FIND_BINARY", None)
     mocker.patch("app.api.common.get_source_roots", return_value=["/mock_source"])
     mocker.patch("app.api.common.get_exclusion_spec", return_value=None)
     mocker.patch("os.path.exists", return_value=True)