Never falls back to the live filesystem.
+ """ roots = get_source_roots(db_session) tracking_rules = db_session.query(models.TrackedSource).all() tracking_map = {rule.path: rule.action for rule in tracking_rules} @@ -57,105 +60,79 @@ def browse_system_path( target_prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") ) - files_sql = text(""" + # --- Files directly under this path (non-recursive) --- + file_sql = text(""" SELECT file_path, size, mtime, sha256_hash, is_ignored FROM filesystem_state - WHERE file_path LIKE :prefix ESCAPE '\\' - AND file_path != :prefix + WHERE file_path LIKE :prefix_wildcard ESCAPE '\\' + AND file_path NOT LIKE :prefix_nested ESCAPE '\\' + AND file_path != :prefix """) - rows = db_session.execute(files_sql, {"prefix": f"{escaped_prefix}%"}).fetchall() + file_rows = db_session.execute( + file_sql, + { + "prefix": target_prefix, + "prefix_wildcard": f"{escaped_prefix}%", + "prefix_nested": f"{escaped_prefix}%/%", + }, + ).fetchall() - if not rows and os.path.isdir(path): - try: - live_results = [] - with os.scandir(path) as it: - for entry in it: - try: - if entry.name.startswith("."): - continue - entry_path = entry.path - is_dir = entry.is_dir() - is_ignored = get_ignored_status( - entry_path + "/" if is_dir else entry_path, - tracking_map, - exclusion_spec, - ) - if is_dir: - live_results.append( - FileItemSchema( - name=entry.name, - path=entry_path, - type="directory", - ignored=is_ignored, - ) - ) - else: - stat = entry.stat() - live_results.append( - FileItemSchema( - name=entry.name, - path=entry_path, - type="file", - size=stat.st_size, - mtime=stat.st_mtime, - ignored=is_ignored, - sha256_hash=None, - ) - ) - except OSError: - continue - live_results.sort(key=lambda x: (x.type != "directory", x.name.lower())) - return BrowseResponseSchema( - files=live_results, last_scan_time=last_scan_time + results: list[FileItemSchema] = [] + seen: set[str] = set() + + for file_path, size, mtime, sha256_hash, is_ignored in file_rows: + if file_path not in seen: + 
seen.add(file_path) + results.append( + FileItemSchema( + name=file_path.split("/")[-1], + path=file_path, + type="file", + size=size, + mtime=mtime, + ignored=is_ignored, + sha256_hash=sha256_hash, + ) ) - except OSError: - pass - # Aggregate sizes for directories from indexed rows - dir_sizes: dict[str, int] = {} - for file_path, size, _mtime, _sha256_hash, _is_ignored in rows: - relative = file_path[len(target_prefix) :] - if "/" in relative: - immediate_name = relative.split("/")[0] - child_path = target_prefix + immediate_name - dir_sizes[child_path] = dir_sizes.get(child_path, 0) + (size or 0) + # --- Directories under this path (aggregated via GROUP BY) --- + dir_sql = text(""" + SELECT + SUBSTR(file_path, LENGTH(:prefix) + 1, + INSTR(SUBSTR(file_path, LENGTH(:prefix) + 1), '/') - 1) as dir_name, + SUM(size) as total_size + FROM filesystem_state + WHERE file_path LIKE :prefix_wildcard ESCAPE '\\' + AND file_path != :prefix + AND INSTR(SUBSTR(file_path, LENGTH(:prefix) + 1), '/') > 0 + GROUP BY dir_name + """) + dir_rows = db_session.execute( + dir_sql, + { + "prefix": target_prefix, + "prefix_wildcard": f"{escaped_prefix}%", + }, + ).fetchall() - results = [] - seen = set() - - for file_path, size, mtime, sha256_hash, is_ignored in rows: - relative = file_path[len(target_prefix) :] - if "/" in relative: - immediate_name = relative.split("/")[0] - child_path = target_prefix + immediate_name - if child_path not in seen: - seen.add(child_path) - dir_ignored = get_ignored_status( - child_path + "/", tracking_map, exclusion_spec - ) - results.append( - FileItemSchema( - name=immediate_name, - path=child_path, - type="directory", - size=dir_sizes.get(child_path, 0), - ignored=dir_ignored, - ) - ) - else: - if file_path not in seen: - seen.add(file_path) - results.append( - FileItemSchema( - name=relative, - path=file_path, - type="file", - size=size, - mtime=mtime, - ignored=is_ignored, - sha256_hash=sha256_hash, - ) + for dir_name, total_size in dir_rows: + if 
not dir_name or dir_name == "/": + continue + child_path = target_prefix + dir_name + if child_path not in seen: + seen.add(child_path) + dir_ignored = get_ignored_status( + child_path + "/", tracking_map, exclusion_spec + ) + results.append( + FileItemSchema( + name=dir_name, + path=child_path, + type="directory", + size=total_size or 0, + ignored=dir_ignored, ) + ) results.sort(key=lambda x: (x.type != "directory", x.name.lower())) return BrowseResponseSchema(files=results, last_scan_time=last_scan_time) diff --git a/frontend/tests/full-workflow.test.ts b/frontend/tests/full-workflow.test.ts index 4e481fe..75c3fc8 100644 --- a/frontend/tests/full-workflow.test.ts +++ b/frontend/tests/full-workflow.test.ts @@ -44,6 +44,23 @@ test.describe('TapeHoard Golden Path', () => { await requestContext.post(`${API_URL}/system/settings`, { data: { key: 'restore_destinations', value: JSON.stringify([RESTORE_DEST]) } }); + + // Index-only principle: scan first so /system/browse can show files + const scanResp = await requestContext.post(`${API_URL}/system/scan`); + if (!scanResp.ok()) { + console.error('Failed to trigger initial scan'); + } + // Wait for scan to complete + const deadline = Date.now() + 30000; + while (Date.now() < deadline) { + const statusResp = await requestContext.get(`${API_URL}/system/scan/status`); + const status = await statusResp.json(); + if (!status.is_running) { + break; + } + await new Promise(r => setTimeout(r, 500)); + } + await requestContext.dispose(); });