remove 'fast hashing' that was actually slower
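Why the native path lost: Python's hashlib wraps optimized native SHA-256 and releases the GIL while digesting large buffers, so a thread pool of in-process workers can hash many files concurrently, while each `sha256sum` batch pays fork/exec, argv marshalling, and stdout parsing, and reads its files sequentially inside one process. A minimal micro-benchmark sketch of that comparison (the corpus location, file count, and file size are illustrative assumptions, not measurements from this repository):

```python
# Hypothetical micro-benchmark; corpus location, file count, and file
# size are illustrative assumptions, not measurements from this repo.
import hashlib
import os
import subprocess
import time


def hash_with_hashlib(paths):
    """Hash each file in-process, reading in 1 MiB chunks."""
    out = {}
    for p in paths:
        h = hashlib.sha256()
        with open(p, "rb") as fh:
            for chunk in iter(lambda: fh.read(1024 * 1024), b""):
                h.update(chunk)
        out[p] = h.hexdigest()
    return out


def hash_with_sha256sum(paths):
    """Hash a batch by spawning one sha256sum process per call."""
    proc = subprocess.run(
        ["sha256sum", "--"] + list(paths), capture_output=True, check=True
    )
    out = {}
    for line in proc.stdout.decode().splitlines():
        digest, path = line.split(maxsplit=1)
        out[path] = digest
    return out


if __name__ == "__main__":
    os.makedirs("/tmp/bench", exist_ok=True)
    files = []
    for i in range(400):  # 400 files of 64 KiB each
        p = f"/tmp/bench/file_{i:03d}.bin"
        with open(p, "wb") as fh:
            fh.write(os.urandom(64 * 1024))
        files.append(p)

    t0 = time.perf_counter()
    a = hash_with_hashlib(files)
    t1 = time.perf_counter()
    b = hash_with_sha256sum(files)
    t2 = time.perf_counter()
    assert a == b
    print(f"hashlib: {t1 - t0:.3f}s  sha256sum batch: {t2 - t1:.3f}s")
```

On corpora of many small files, the fixed per-process overhead tends to dominate, which matches the commit title.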
@@ -20,10 +20,6 @@ from app.db.database import SessionLocal
 # Detected once at import time; falls back to os.walk if unavailable.
 _FAST_FIND_BINARY: Optional[str] = None
 
-# Fast hashing via `sha256sum` or `shasum`.
-# Detected once at import time; falls back to Python hashlib if unavailable.
-_FAST_HASH_BINARY: Optional[str] = None
-
 
 def _detect_fast_find() -> Optional[str]:
     """Check if a `find` binary with `-printf` support is available.
@@ -48,132 +44,19 @@ def _detect_fast_find() -> Optional[str]:
     return None
 
 
-def _detect_fast_hash() -> Optional[str]:
-    """Check if a SHA-256 binary is available for batch hashing.
-
-    Tries `sha256sum` (GNU coreutils, Linux/Homebrew) then `shasum` (macOS).
-    Returns the binary path if it works, otherwise ``None``.
-    """
-    # Try sha256sum first (Linux, Homebrew gnu-coreutils)
-    binary = shutil.which("sha256sum")
-    if binary:
-        try:
-            result = subprocess.run(
-                [binary, "/dev/null"],
-                capture_output=True,
-                timeout=5,
-            )
-            if (
-                result.returncode == 0
-                and b"e3b0c44298fc1c149afbf4c8996fb924" in result.stdout
-            ):
-                return binary
-        except Exception:
-            pass
-
-    # Try shasum (macOS default)
-    binary = shutil.which("shasum")
-    if binary:
-        try:
-            result = subprocess.run(
-                [binary, "-a", "256", "/dev/null"],
-                capture_output=True,
-                timeout=5,
-            )
-            if (
-                result.returncode == 0
-                and b"e3b0c44298fc1c149afbf4c8996fb924" in result.stdout
-            ):
-                return binary
-        except Exception:
-            pass
-
-    return None
-
-
-def _init_fast_features() -> Tuple[Optional[str], Optional[str]]:
-    global _FAST_FIND_BINARY, _FAST_HASH_BINARY
+def _init_fast_features() -> Optional[str]:
+    global _FAST_FIND_BINARY
     _FAST_FIND_BINARY = _detect_fast_find()
-    _FAST_HASH_BINARY = _detect_fast_hash()
 
     if _FAST_FIND_BINARY:
         logger.info(f"Fast file discovery enabled: using {_FAST_FIND_BINARY} -printf")
     else:
         logger.info("Fast file discovery unavailable: falling back to os.walk")
 
-    if _FAST_HASH_BINARY:
-        logger.info(f"Fast hashing enabled: using {_FAST_HASH_BINARY}")
-    else:
-        logger.info("Fast hashing unavailable: falling back to Python hashlib")
-
-    return _FAST_FIND_BINARY, _FAST_HASH_BINARY
+    return _FAST_FIND_BINARY
 
 
-_FAST_FIND_BINARY, _FAST_HASH_BINARY = _init_fast_features()
-
-
-def _hash_file_batch_fast(
-    file_paths: List[str], binary: str
-) -> Dict[str, Optional[str]]:
-    """Hash a batch of files using a native SHA-256 binary.
-
-    Streams output line-by-line via subprocess.Popen for incremental progress.
-
-    Args:
-        file_paths: Paths to hash.
-        binary: Path to sha256sum or shasum.
-
-    Returns a mapping of file_path -> hex_digest (or None on failure).
-    """
-    results: Dict[str, Optional[str]] = {}
-
-    if not file_paths:
-        return results
-
-    # Build command: shasum needs -a 256 prefix, sha256sum doesn't
-    if binary.endswith("sha256sum"):
-        cmd = [binary, "--"] + file_paths
-    else:
-        # shasum
-        cmd = [binary, "-a", "256", "--"] + file_paths
-
-    try:
-        proc = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.DEVNULL,
-        )
-
-        # Stream output line-by-line for incremental progress
-        if proc.stdout is None:
-            return results
-        for line in iter(proc.stdout.readline, b""):
-            line = line.strip()
-            if not line:
-                continue
-            # Format: "<hash>  <path>" or "<hash> *<path>"
-            parts = line.split(b"  ", 1)
-            if len(parts) != 2:
-                # Try single space with binary marker: "<hash> *<path>"
-                parts = line.split(b" *", 1)
-                if len(parts) != 2:
-                    continue
-
-            file_hash = parts[0].decode("ascii", errors="replace").lower()
-            raw_path = parts[1].decode("utf-8", errors="replace")
-
-            # sha256sum may escape backslashes in filenames; handle common case
-            clean_path = raw_path.replace("\\\\", "\\")
-
-            results[clean_path] = file_hash
-
-        proc.stdout.close()
-        proc.wait()
-
-    except Exception as e:
-        logger.error(f"Native hash batch failed: {e}")
-
-    return results
+_FAST_FIND_BINARY = _init_fast_features()
 
 
 def _discover_files_fast(
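One detail of the removed detection is worth noting: `/dev/null` reads as zero bytes, so any working SHA-256 binary must print the well-known digest of empty input, which is why `_detect_fast_hash` only looked for the `e3b0c442...` prefix in the subprocess output. The full constant is easy to confirm in-process:

```python
import hashlib

# /dev/null reads as zero bytes, so any working SHA-256 binary must
# print the digest of empty input when pointed at it.
EMPTY_SHA256 = hashlib.sha256(b"").hexdigest()
assert EMPTY_SHA256 == (
    "e3b0c44298fc1c149afbf4c8996fb924"
    "27ae41e4649b934ca495991b7852b855"
)
```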
@@ -751,10 +634,8 @@ class ScannerService:
             .count()
         )
 
-        # Fast hash batch size: more files per batch reduces subprocess overhead
-        HASH_BATCH_SIZE = 100 if _FAST_HASH_BINARY else 100
         # How many files to pull from DB per iteration
-        FETCH_LIMIT = HASH_BATCH_SIZE * 4
+        FETCH_LIMIT = 400
 
         while self.is_hashing:
             # Find unindexed work (exclude deleted files - they cannot be hashed)
@@ -780,82 +661,7 @@
                 if JobManager.is_cancelled(hashing_job.id):
                     break
 
-                if _FAST_HASH_BINARY:
-                    # Fast path: batch files to native sha256sum/shasum
-                    # Group into sub-batches of HASH_BATCH_SIZE for parallel processing
-                    file_paths = [t.file_path for t in hashing_targets]
-                    path_to_record = {t.file_path: t for t in hashing_targets}
-
-                    sub_batches = [
-                        file_paths[i : i + HASH_BATCH_SIZE]
-                        for i in range(0, len(file_paths), HASH_BATCH_SIZE)
-                    ]
-
-                    max_workers = min(os.cpu_count() or 4, len(sub_batches))
-                    with concurrent.futures.ThreadPoolExecutor(
-                        max_workers=max_workers
-                    ) as hashing_executor:
-                        future_to_batch = {
-                            hashing_executor.submit(
-                                _hash_file_batch_fast,
-                                batch,
-                                _FAST_HASH_BINARY,
-                            ): batch
-                            for batch in sub_batches
-                        }
-
-                        for future in concurrent.futures.as_completed(
-                            future_to_batch
-                        ):
-                            if not self.is_hashing:
-                                break
-
-                            batch = future_to_batch[future]
-                            try:
-                                batch_results = future.result()
-                            except Exception:
-                                continue
-
-                            # Apply hashes and detect missing files ONLY for this batch
-                            for file_path in batch:
-                                target_record = path_to_record.get(file_path)
-                                if not target_record:
-                                    continue
-
-                                if file_path in batch_results:
-                                    target_record.sha256_hash = batch_results[
-                                        file_path
-                                    ]
-                                    with self._metrics_lock:
-                                        self.bytes_hashed += target_record.size or 0
-                                        self.files_hashed += 1
-                                    # Report progress incrementally as files complete
-                                    if self.files_hashed % 5 == 0:
-                                        progress = min(
-                                            99.9,
-                                            (
-                                                self.files_hashed
-                                                / max(total_pending, 1)
-                                            )
-                                            * 100,
-                                        )
-                                        JobManager.update_job(
-                                            hashing_job.id,
-                                            progress,
-                                            f"Hashed {self.files_hashed} files ({self._format_throughput()})...",
-                                        )
-                                elif not os.path.exists(file_path):
-                                    target_record.is_deleted = True
-                                    with self._metrics_lock:
-                                        self.files_missing += 1
-
-                            # Throttle between sub-batches if I/O pressure is high
-                            with self._metrics_lock:
-                                should_throttle = self.is_throttled
-                            if should_throttle:
-                                time.sleep(0.5)
-                else:
-                    # Compatibility path: Python hashlib via thread pool
-                    max_workers = os.cpu_count() or 4
-                    with concurrent.futures.ThreadPoolExecutor(
-                        max_workers=max_workers
+                # Hash files using Python hashlib via thread pool
+                max_workers = os.cpu_count() or 4
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=max_workers
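The surviving path submits one file per task to a `ThreadPoolExecutor`. The per-file worker itself is outside this diff; a minimal sketch of what such a worker typically looks like (the name `_hash_file`, its signature, and the chunk size are assumptions, not code from `app.services.scanner`):

```python
import hashlib
from typing import Optional


def _hash_file(path: str, chunk_size: int = 1024 * 1024) -> Optional[str]:
    """Chunked SHA-256 of one file; None if it vanished or is unreadable.

    Hypothetical sketch of the per-file thread-pool worker; the real
    helper in scanner.py is not shown in this diff.
    """
    try:
        digest = hashlib.sha256()
        with open(path, "rb") as fh:
            # hashlib.update() releases the GIL on large buffers, so a
            # pool of these workers scales across cores.
            for chunk in iter(lambda: fh.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()
    except OSError:
        return None
```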
@@ -869,9 +675,7 @@
                         for target in hashing_targets
                     }
 
-                        for future in concurrent.futures.as_completed(
-                            future_to_file
-                        ):
+                    for future in concurrent.futures.as_completed(future_to_file):
                         if not self.is_hashing:
                             break
 
@@ -892,8 +696,7 @@
                         if self.files_hashed % 5 == 0:
                             progress = min(
                                 99.9,
-                                    (self.files_hashed / max(total_pending, 1))
-                                    * 100,
+                                (self.files_hashed / max(total_pending, 1)) * 100,
                             )
                             JobManager.update_job(
                                 hashing_job.id,
@@ -1,12 +1,9 @@
-import hashlib
 from datetime import datetime, timezone
 
 import pytest
 from app.services.scanner import (
     ScannerService,
     JobManager,
-    _hash_file_batch_fast,
-    _FAST_HASH_BINARY,
 )
 from app.db import models
 
@@ -145,46 +142,6 @@ def test_scan_sources_mocked(db_session, mocker):
     assert record.size == 500
 
 
-def test_hash_file_batch_fast(tmp_path):
-    """Tests native sha256sum/shasum batch hashing if available."""
-    if _FAST_HASH_BINARY is None:
-        pytest.skip("No native hash binary available")
-
-    # Create test files
-    files = {}
-    for i in range(5):
-        content = f"test content {i}".encode()
-        f = tmp_path / f"file_{i}.txt"
-        f.write_bytes(content)
-        files[str(f)] = hashlib.sha256(content).hexdigest()
-
-    # Hash via native binary
-    results = _hash_file_batch_fast(list(files.keys()), _FAST_HASH_BINARY)
-
-    assert len(results) == 5
-    for path, expected_hash in files.items():
-        assert results[path] == expected_hash
-
-
-def test_hash_file_batch_fast_empty():
-    """Tests that empty batch returns empty results."""
-    if _FAST_HASH_BINARY is None:
-        pytest.skip("No native hash binary available")
-
-    results = _hash_file_batch_fast([], _FAST_HASH_BINARY)
-    assert results == {}
-
-
-def test_hash_file_batch_fast_nonexistent():
-    """Tests that non-existent files are silently skipped."""
-    if _FAST_HASH_BINARY is None:
-        pytest.skip("No native hash binary available")
-
-    results = _hash_file_batch_fast(["/nonexistent/path"], _FAST_HASH_BINARY)
-    # Non-existent files should not produce hash entries
-    assert results == {}
-
-
 def test_missing_file_marked_deleted_at_end_of_scan(db_session, mocker):
     """Tests that files not seen during a scan are marked as deleted."""
     scanner = ScannerService()
@@ -259,8 +216,6 @@ def test_missing_file_during_hashing_marked_deleted(db_session, mocker):
     """Tests that files missing during hashing are marked as deleted."""
     scanner = ScannerService()
 
-    mocker.patch("app.services.scanner._FAST_HASH_BINARY", None)
-
     f = models.FilesystemState(
         file_path="/data/vanished.bin", size=10, mtime=1, is_ignored=False
     )