# tapehoard/backend/app/services/scanner.py

import concurrent.futures
import hashlib
import os
import threading
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
import psutil
from loguru import logger
from sqlalchemy.orm import Session
from app.db import models
from app.db.database import SessionLocal
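# Two-phase indexing: ScannerService.scan_sources() walks the source roots and
# records metadata (Phase 1), while run_hashing() fills in SHA-256 digests on a
# background thread (Phase 2). JobManager persists progress for both phases.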
class JobManager:
"""Manages operational job states and persistence."""
@staticmethod
def create_job(db_session: Session, job_type: str) -> models.Job:
"""Creates a new job record in the database."""
job_record = models.Job(job_type=job_type, status="PENDING")
db_session.add(job_record)
db_session.commit()
db_session.refresh(job_record)
return job_record
@staticmethod
def start_job(job_id: int):
"""Marks a job as running and sets the start timestamp."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
try:
job_record = db_session.get(models.Job, job_id)
if job_record:
job_record.status = "RUNNING"
job_record.started_at = datetime.now(timezone.utc)
db_session.commit()
except StaleDataError:
db_session.rollback()
logger.debug(
f"Job {job_id} already modified or deleted (StaleDataError)."
)
@staticmethod
def update_job(job_id: int, progress: float, current_task: str):
"""Updates the progress and current task description for a job."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
try:
job_record = db_session.get(models.Job, job_id)
if job_record:
job_record.progress = progress
job_record.current_task = current_task
db_session.commit()
except StaleDataError:
db_session.rollback()
@staticmethod
def complete_job(job_id: int):
"""Marks a job as successfully completed."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
try:
job_record = db_session.get(models.Job, job_id)
if job_record:
job_record.status = "COMPLETED"
job_record.progress = 100.0
job_record.completed_at = datetime.now(timezone.utc)
db_session.commit()
except StaleDataError:
db_session.rollback()
@staticmethod
def fail_job(job_id: int, error_message: str):
"""Marks a job as failed and records the error message."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
try:
job_record = db_session.get(models.Job, job_id)
if job_record:
job_record.status = "FAILED"
job_record.error_message = error_message
job_record.completed_at = datetime.now(timezone.utc)
db_session.commit()
except StaleDataError:
db_session.rollback()
@staticmethod
def cancel_job(job_id: int):
"""Submits a cancellation request for a pending or running job."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
try:
job_record = db_session.get(models.Job, job_id)
if job_record and job_record.status in ["PENDING", "RUNNING"]:
job_record.status = "FAILED"
job_record.error_message = "Cancelled by user"
job_record.completed_at = datetime.now(timezone.utc)
db_session.commit()
except StaleDataError:
db_session.rollback()
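    # Cancellation is modeled as a FAILED status plus the sentinel message
    # "Cancelled by user"; is_cancelled() below keys on exactly that pair.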
@staticmethod
def is_cancelled(job_id: int) -> bool:
"""Checks if a job has been cancelled by the user."""
with SessionLocal() as db_session:
job_record = db_session.get(models.Job, job_id)
return bool(
job_record
and job_record.status == "FAILED"
and job_record.error_message == "Cancelled by user"
)
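# A minimal lifecycle sketch (illustrative only; "SCAN" is an arbitrary
# job_type label and the update values are made up):
#
#     with SessionLocal() as db:
#         job = JobManager.create_job(db, "SCAN")
#     JobManager.start_job(job.id)
#     JobManager.update_job(job.id, 50.0, "halfway there")
#     JobManager.complete_job(job.id)  # or fail_job(job.id, "reason")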
class ScannerService:
"""Handles recursive filesystem discovery and content indexing."""
def __init__(self):
self.is_running: bool = False
self.is_hashing: bool = False
self.last_run_time: Optional[datetime] = None
# Thread-safe Metrics
self.files_processed: int = 0
self.files_hashed: int = 0
self.files_new: int = 0
self.files_modified: int = 0
self.total_files_found: int = 0
self.bytes_hashed: int = 0
self.start_time: float = 0.0
self.is_throttled: bool = False
self.current_path: str = ""
self._metrics_lock = threading.Lock()
self._current_iowait: float = 0.0
# Background Monitors
self._throttle_thread = threading.Thread(
target=self._monitor_iowait, daemon=True
)
self._throttle_thread.start()
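    # The monitor runs as a daemon thread for the life of the process, so it
    # never blocks interpreter shutdown.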
def _monitor_iowait(self):
"""Polls system I/O pressure to enable dynamic back-off."""
while True:
try:
cpu_times = psutil.cpu_times_percent(interval=1)
iowait_value = getattr(cpu_times, "iowait", 0.0)
with self._metrics_lock:
self.is_throttled = iowait_value > 5.0
self._current_iowait = iowait_value
except Exception as monitor_error:
logger.debug(f"I/O Monitor pulse failed: {monitor_error}")
time.sleep(1)
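    # Note: cpu_times_percent(interval=1) blocks for its one-second sampling
    # window, so with the trailing sleep the loop refreshes roughly every two
    # seconds. psutil only exposes an iowait field on Linux; elsewhere the
    # getattr() fallback keeps throttling permanently off.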
    def _set_process_priority(self, level: str = "normal"):
        """Adjusts CPU and I/O priority for the current process."""
        try:
            process_handle = psutil.Process()
            if level == "background":
                # os.nice() adds to the current niceness; 19 is the lowest
                # scheduling priority.
                os.nice(19)
                if hasattr(process_handle, "ionice") and hasattr(
                    psutil, "IOPRIO_CLASS_IDLE"
                ):
                    process_handle.ionice(psutil.IOPRIO_CLASS_IDLE)
            else:
                # os.nice(0) is effectively a no-op: an unprivileged process
                # cannot lower its niceness again, so only the I/O class is
                # restored to best-effort here.
                os.nice(0)
                if hasattr(process_handle, "ionice") and hasattr(
                    psutil, "IOPRIO_CLASS_BE"
                ):
                    process_handle.ionice(psutil.IOPRIO_CLASS_BE, value=4)
        except Exception as priority_error:
            logger.debug(f"Priority adjustment restricted: {priority_error}")
def compute_sha256(self, file_path: str, job_id: Optional[int] = None) -> str:
"""Computes the SHA-256 hash of a file with high-velocity block processing."""
hash_engine = hashlib.sha256()
# Increase block size to 8MB for high-speed NVMe saturation
# and use local counter to minimize lock contention
BLOCK_SIZE = 8 * 1024 * 1024
local_processed_bytes = 0
SYNC_THRESHOLD = 128 * 1024 * 1024 # Sync metrics every 128MB
try:
with open(file_path, "rb") as file_handle:
while True:
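                    # Cooperative cancellation: is_cancelled() opens a fresh
                    # session, so this costs one DB round-trip per block read.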
if job_id is not None and JobManager.is_cancelled(job_id):
return ""
                    # Dynamic throttling: back off while the I/O monitor
                    # reports pressure, sleeping longer as iowait climbs.
if self.is_throttled:
throttle_delay = 0.05 if self._current_iowait < 15.0 else 0.2
time.sleep(throttle_delay)
byte_block = file_handle.read(BLOCK_SIZE)
if not byte_block:
break
hash_engine.update(byte_block)
local_processed_bytes += len(byte_block)
# Batch sync metrics to global counter
if local_processed_bytes >= SYNC_THRESHOLD:
with self._metrics_lock:
self.bytes_hashed += local_processed_bytes
local_processed_bytes = 0
# Final remaining sync
if local_processed_bytes > 0:
with self._metrics_lock:
self.bytes_hashed += local_processed_bytes
return hash_engine.hexdigest()
except OSError as io_error:
logger.error(f"IO Error during hashing {file_path}: {io_error}")
return ""
except Exception as generic_error:
logger.error(f"Unexpected error hashing {file_path}: {generic_error}")
return ""
def _format_throughput(self) -> str:
"""Calculates and formats current hashing speed."""
elapsed_seconds = time.time() - self.start_time
if elapsed_seconds <= 0:
return "0 B/s"
bytes_per_second = self.bytes_hashed / elapsed_seconds
for unit in ["B/s", "KB/s", "MB/s", "GB/s"]:
if bytes_per_second < 1024:
return f"{bytes_per_second:.1f} {unit}"
bytes_per_second /= 1024
return f"{bytes_per_second:.1f} TB/s"
def scan_sources(self, db_session: Session, job_id: Optional[int] = None):
"""Executes Phase 1: Fast Metadata Discovery."""
        # Check-and-set under the metrics lock so concurrent triggers cannot
        # start two discovery scans at once.
        with self._metrics_lock:
            if self.is_running:
                logger.warning("Discovery scan already active.")
                return
            self.is_running = True
self.files_processed = 0
self.files_new = 0
self.files_modified = 0
self.total_files_found = 0
self.current_path = ""
self._set_process_priority("normal")
if job_id is not None:
JobManager.start_job(job_id)
try:
from app.api.system import get_exclusion_spec, get_source_roots
exclusion_spec = get_exclusion_spec(db_session)
source_roots = get_source_roots(db_session)
tracking_rules = db_session.query(models.TrackedSource).all()
tracking_map = {rule.path: rule.action for rule in tracking_rules}
            def resolve_tracking(absolute_path: str) -> bool:
                """Returns True when the path should be ignored."""
                # 1. User Tracking Policy (Explicit overrides)
applicable_rules = []
for rule_path, action in tracking_map.items():
if absolute_path == rule_path or absolute_path.startswith(
rule_path + "/"
):
applicable_rules.append((len(rule_path), action))
if applicable_rules:
# Most specific rule wins
applicable_rules.sort(key=lambda x: x[0], reverse=True)
return applicable_rules[0][1] == "exclude"
# 2. Global Exclusions (Default automatic behavior)
if exclusion_spec and exclusion_spec.match_file(absolute_path):
return True
return False
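            # Example: given rules {"/data": "include", "/data/tmp": "exclude"}
            # (hypothetical paths), "/data/tmp/x.bin" matches both, the longer
            # prefix wins, and the file is marked ignored.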
current_timestamp = datetime.now(timezone.utc)
BATCH_SIZE = 1000
pending_metadata: List[Dict[str, Any]] = []
# Initialize Phase 2 in background
threading.Thread(target=self.run_hashing).start()
for root_base in source_roots:
if job_id is not None and JobManager.is_cancelled(job_id):
break
if not os.path.exists(root_base):
continue
for current_dir, sub_dirs, file_names in os.walk(root_base):
if job_id is not None and JobManager.is_cancelled(job_id):
break
for name in file_names:
full_file_path = os.path.join(current_dir, name)
with self._metrics_lock:
self.total_files_found += 1
self.current_path = current_dir
try:
file_stats = os.stat(full_file_path)
is_ignored = resolve_tracking(full_file_path)
pending_metadata.append(
{
"path": full_file_path,
"size": file_stats.st_size,
"mtime": file_stats.st_mtime,
"ignored": is_ignored,
}
)
                        except OSError:  # FileNotFoundError is an OSError subclass
continue
if len(pending_metadata) >= BATCH_SIZE:
self._sync_metadata_batch(
db_session, pending_metadata, current_timestamp
)
db_session.commit()
pending_metadata = []
if job_id is not None:
JobManager.update_job(
job_id,
10.0,
f"Discovered {self.total_files_found} items...",
)
if pending_metadata:
self._sync_metadata_batch(
db_session, pending_metadata, current_timestamp
)
db_session.commit()
if job_id is not None and not JobManager.is_cancelled(job_id):
JobManager.complete_job(job_id)
self.last_run_time = current_timestamp
except Exception as scan_error:
logger.exception(f"Metadata discovery failed: {scan_error}")
db_session.rollback()
if job_id is not None:
JobManager.fail_job(job_id, str(scan_error))
finally:
self.is_running = False
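    # Resetting is_running in the finally block also signals Phase 2: once
    # discovery stops and no unhashed rows remain, run_hashing's wait loop
    # breaks instead of sleeping for more work.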
def _sync_metadata_batch(
self, db_session: Session, batch: List[Dict[str, Any]], timestamp: datetime
):
"""Synchronizes a batch of metadata with the database index."""
file_paths = [file_meta["path"] for file_meta in batch]
# Batch Fetch Existing Metadata (Chunked for SQLite limits)
existing_records = {}
SQLITE_VARIABLE_LIMIT = 500
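        # SQLite caps bound parameters per statement (999 by default before
        # 3.32, 32766 since), so chunking at 500 stays safely under the limit.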
for i in range(0, len(file_paths), SQLITE_VARIABLE_LIMIT):
chunk = file_paths[i : i + SQLITE_VARIABLE_LIMIT]
chunk_records = (
db_session.query(models.FilesystemState)
.filter(models.FilesystemState.file_path.in_(chunk))
.all()
)
for record in chunk_records:
existing_records[record.file_path] = record
for file_meta in batch:
record = existing_records.get(file_meta["path"])
if not record:
with self._metrics_lock:
self.files_new += 1
db_session.add(
models.FilesystemState(
file_path=file_meta["path"],
size=file_meta["size"],
mtime=file_meta["mtime"],
is_ignored=file_meta["ignored"],
last_seen_timestamp=timestamp,
)
)
else:
metadata_changed = (
record.size != file_meta["size"]
or record.mtime != file_meta["mtime"]
)
if metadata_changed:
record.sha256_hash = None
with self._metrics_lock:
self.files_modified += 1
record.size = file_meta["size"]
record.mtime = file_meta["mtime"]
record.is_ignored = file_meta["ignored"]
record.last_seen_timestamp = timestamp
with self._metrics_lock:
self.files_processed += 1
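    # Nulling sha256_hash for modified records above is what re-queues them
    # for Phase 2: run_hashing selects exactly the rows where the hash is
    # NULL and is_ignored is false.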
def run_hashing(self):
"""Executes Phase 2: Background Content Hashing."""
        # Check-and-set under the metrics lock so two callers cannot both
        # start a hashing pass.
        with self._metrics_lock:
            if self.is_hashing:
                return
            self.is_hashing = True
self._set_process_priority("background")
with SessionLocal() as db_session:
hashing_job = JobManager.create_job(db_session, "HASH")
JobManager.start_job(hashing_job.id)
self.start_time = time.time()
self.bytes_hashed = 0
self.files_hashed = 0
# Count total work pending for progress reporting
total_pending = (
db_session.query(models.FilesystemState)
.filter(
models.FilesystemState.sha256_hash.is_(None),
models.FilesystemState.is_ignored.is_(False),
)
.count()
)
try:
while True:
# Find unindexed work
hashing_targets = (
db_session.query(models.FilesystemState)
.filter(
models.FilesystemState.sha256_hash.is_(None),
models.FilesystemState.is_ignored.is_(False),
)
.limit(100)
.all()
)
if not hashing_targets:
if self.is_running:
time.sleep(2)
# Recount if more work appeared during sleep
total_pending = (
db_session.query(models.FilesystemState)
.filter(
models.FilesystemState.sha256_hash.is_(None),
models.FilesystemState.is_ignored.is_(False),
)
.count()
+ self.files_hashed
)
continue
break
if JobManager.is_cancelled(hashing_job.id):
break
max_workers = os.cpu_count() or 4
with concurrent.futures.ThreadPoolExecutor(
max_workers=max_workers
) as hashing_executor:
future_to_file = {
hashing_executor.submit(
self.compute_sha256, target.file_path, hashing_job.id
): target
for target in hashing_targets
}
                    batch_successes = 0
                    for future in concurrent.futures.as_completed(future_to_file):
                        target_record = future_to_file[future]
                        computed_hash = future.result()
                        if computed_hash:
                            target_record.sha256_hash = computed_hash
                            self.files_hashed += 1
                            batch_successes += 1
                            if self.files_hashed % 5 == 0:
                                progress = min(
                                    99.9,
                                    (self.files_hashed / max(total_pending, 1)) * 100,
                                )
                                status_msg = (
                                    f"Hashing: {self.files_hashed}/{total_pending} "
                                    f"objects processed [{self._format_throughput()}]"
                                )
                                if self.is_throttled:
                                    status_msg += " (THROTTLED)"
                                JobManager.update_job(
                                    hashing_job.id, progress, status_msg
                                )
                db_session.commit()
                # A failed hash leaves sha256_hash NULL, so the same rows
                # would be re-selected forever; stop if an entire batch
                # produced no hashes (errors or cancellation).
                if batch_successes == 0:
                    break
                if not JobManager.is_cancelled(hashing_job.id):
                    JobManager.complete_job(hashing_job.id)
except Exception as hashing_error:
logger.error(f"Background hashing failed: {hashing_error}")
JobManager.fail_job(hashing_job.id, str(hashing_error))
finally:
self.is_hashing = False
scanner_manager = ScannerService()
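# A minimal smoke-test sketch, assuming a configured database (via
# app.db.database) and at least one registered source root; "SCAN" is an
# arbitrary job_type label here. Illustrative only, not part of the service.
if __name__ == "__main__":
    with SessionLocal() as demo_session:
        demo_job = JobManager.create_job(demo_session, "SCAN")
        # scan_sources() spawns the Phase 2 hashing thread itself.
        scanner_manager.scan_sources(demo_session, job_id=demo_job.id)
    # Wait for background hashing to drain before reporting.
    while scanner_manager.is_hashing:
        time.sleep(1)
    logger.info(f"Hashed {scanner_manager.files_hashed} files.")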