filesystem view uses index only

don't reuse playwright server
2026-05-04 20:44:49 -04:00 · 2026-05-04 20:19:04 -04:00
4 changed files with 98 additions and 95 deletions
@@ -50,6 +50,15 @@ All API routes live under `app/api/`. The `system` endpoints are split into a pa

 Each module defines its own `APIRouter` with `tags=["System"]` and is registered in `main.py` with `prefix="/system"`.

+### Index-Only Principle
+
+**Never rely on the live filesystem for archive data, except during a scan.** All read endpoints must operate exclusively on the database index. The live filesystem is accessed only during:
+
+- **Scan operations** (`/system/scan`) — to discover files, compute hashes, and sync the index.
+- **Configuration endpoints** (`/system/ls`) — to help users pick source roots during setup. `/system/browse` is not one of these: it reads from the index only and never falls back to the live filesystem.
+
+Browsing the archive, searching, or checking protection status must use the index only. This guarantees consistent results even when files are temporarily inaccessible, and prevents I/O bottlenecks on network or tape-backed storage.
+
 ### Shared Helpers (`app/api/common.py`)

 Cross-cutting helpers and schemas that must not create circular imports:
@@ -13,7 +13,6 @@ from app.api.common import (
 )
 from sqlalchemy import text
 from app.db import models
-import os

 router = APIRouter(tags=["System"])

@@ -24,7 +23,11 @@ router = APIRouter(tags=["System"])
 def browse_system_path(
    path: Optional[str] = None, db_session: Session = Depends(get_db)
 ):
-    """Provides a browsable view of the indexed filesystem from the database."""
+    """Provides a browsable view of the indexed filesystem from the database.
+
+    Operates exclusively on the database index (index-only principle).
+    Never falls back to the live filesystem.
+    """
    roots = get_source_roots(db_session)
    tracking_rules = db_session.query(models.TrackedSource).all()
    tracking_map = {rule.path: rule.action for rule in tracking_rules}
@@ -57,105 +60,79 @@ def browse_system_path(
        target_prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )

-    files_sql = text("""
+    # --- Files directly under this path (non-recursive) ---
+    file_sql = text("""
        SELECT file_path, size, mtime, sha256_hash, is_ignored
        FROM filesystem_state
-        WHERE file_path LIKE :prefix ESCAPE '\\'
-        AND file_path != :prefix
+        WHERE file_path LIKE :prefix_wildcard ESCAPE '\\'
+          AND file_path NOT LIKE :prefix_nested ESCAPE '\\'
+          AND file_path != :prefix
    """)
-    rows = db_session.execute(files_sql, {"prefix": f"{escaped_prefix}%"}).fetchall()
+    file_rows = db_session.execute(
+        file_sql,
+        {
+            "prefix": target_prefix,
+            "prefix_wildcard": f"{escaped_prefix}%",
+            "prefix_nested": f"{escaped_prefix}%/%",
+        },
+    ).fetchall()

-    if not rows and os.path.isdir(path):
-        try:
-            live_results = []
-            with os.scandir(path) as it:
-                for entry in it:
-                    try:
-                        if entry.name.startswith("."):
-                            continue
-                        entry_path = entry.path
-                        is_dir = entry.is_dir()
-                        is_ignored = get_ignored_status(
-                            entry_path + "/" if is_dir else entry_path,
-                            tracking_map,
-                            exclusion_spec,
-                        )
-                        if is_dir:
-                            live_results.append(
-                                FileItemSchema(
-                                    name=entry.name,
-                                    path=entry_path,
-                                    type="directory",
-                                    ignored=is_ignored,
-                                )
-                            )
-                        else:
-                            stat = entry.stat()
-                            live_results.append(
-                                FileItemSchema(
-                                    name=entry.name,
-                                    path=entry_path,
-                                    type="file",
-                                    size=stat.st_size,
-                                    mtime=stat.st_mtime,
-                                    ignored=is_ignored,
-                                    sha256_hash=None,
-                                )
-                            )
-                    except OSError:
-                        continue
-            live_results.sort(key=lambda x: (x.type != "directory", x.name.lower()))
-            return BrowseResponseSchema(
-                files=live_results, last_scan_time=last_scan_time
+    results: list[FileItemSchema] = []
+    seen: set[str] = set()
+
+    for file_path, size, mtime, sha256_hash, is_ignored in file_rows:
+        if file_path not in seen:
+            seen.add(file_path)
+            results.append(
+                FileItemSchema(
+                    name=file_path.split("/")[-1],
+                    path=file_path,
+                    type="file",
+                    size=size,
+                    mtime=mtime,
+                    ignored=is_ignored,
+                    sha256_hash=sha256_hash,
+                )
            )
-        except OSError:
-            pass

-    # Aggregate sizes for directories from indexed rows
-    dir_sizes: dict[str, int] = {}
-    for file_path, size, _mtime, _sha256_hash, _is_ignored in rows:
-        relative = file_path[len(target_prefix) :]
-        if "/" in relative:
-            immediate_name = relative.split("/")[0]
-            child_path = target_prefix + immediate_name
-            dir_sizes[child_path] = dir_sizes.get(child_path, 0) + (size or 0)
+    # --- Directories under this path (aggregated via GROUP BY) ---
+    dir_sql = text("""
+        SELECT
+            SUBSTR(file_path, LENGTH(:prefix) + 1,
+                   INSTR(SUBSTR(file_path, LENGTH(:prefix) + 1), '/') - 1) as dir_name,
+            SUM(size) as total_size
+        FROM filesystem_state
+        WHERE file_path LIKE :prefix_wildcard ESCAPE '\\'
+          AND file_path != :prefix
+          AND INSTR(SUBSTR(file_path, LENGTH(:prefix) + 1), '/') > 0
+        GROUP BY dir_name
+    """)
+    dir_rows = db_session.execute(
+        dir_sql,
+        {
+            "prefix": target_prefix,
+            "prefix_wildcard": f"{escaped_prefix}%",
+        },
+    ).fetchall()

-    results = []
-    seen = set()
-
-    for file_path, size, mtime, sha256_hash, is_ignored in rows:
-        relative = file_path[len(target_prefix) :]
-        if "/" in relative:
-            immediate_name = relative.split("/")[0]
-            child_path = target_prefix + immediate_name
-            if child_path not in seen:
-                seen.add(child_path)
-                dir_ignored = get_ignored_status(
-                    child_path + "/", tracking_map, exclusion_spec
-                )
-                results.append(
-                    FileItemSchema(
-                        name=immediate_name,
-                        path=child_path,
-                        type="directory",
-                        size=dir_sizes.get(child_path, 0),
-                        ignored=dir_ignored,
-                    )
-                )
-        else:
-            if file_path not in seen:
-                seen.add(file_path)
-                results.append(
-                    FileItemSchema(
-                        name=relative,
-                        path=file_path,
-                        type="file",
-                        size=size,
-                        mtime=mtime,
-                        ignored=is_ignored,
-                        sha256_hash=sha256_hash,
-                    )
+    for dir_name, total_size in dir_rows:
+        if not dir_name or dir_name == "/":
+            continue
+        child_path = target_prefix + dir_name
+        if child_path not in seen:
+            seen.add(child_path)
+            dir_ignored = get_ignored_status(
+                child_path + "/", tracking_map, exclusion_spec
+            )
+            results.append(
+                FileItemSchema(
+                    name=dir_name,
+                    path=child_path,
+                    type="directory",
+                    size=total_size or 0,
+                    ignored=dir_ignored,
                )
+            )

    results.sort(key=lambda x: (x.type != "directory", x.name.lower()))
    return BrowseResponseSchema(files=results, last_scan_time=last_scan_time)
@@ -38,13 +38,13 @@ export default defineConfig({
    {
       command: 'cd ../backend && rm -f e2e_test.db* && DATABASE_URL="sqlite:///e2e_test.db" TAPEHOARD_TEST_MODE="true" TAPEHOARD_CORS_ORIGINS="*,http://localhost:5174,http://127.0.0.1:5174" uv run python -m app.start_test_server --host 127.0.0.1 --port 8001',
      url: 'http://127.0.0.1:8001/health',
-      reuseExistingServer: !process.env.CI,
+      reuseExistingServer: false,
      timeout: 120 * 1000,
    },
    {
      command: 'VITE_API_URL=http://127.0.0.1:8001 npm run dev -- --port 5174',
      url: 'http://localhost:5174',
-      reuseExistingServer: !process.env.CI,
+      reuseExistingServer: false,
      timeout: 120 * 1000,
    },
  ],
@@ -44,6 +44,23 @@ test.describe('TapeHoard Golden Path', () => {
    await requestContext.post(`${API_URL}/system/settings`, {
        data: { key: 'restore_destinations', value: JSON.stringify([RESTORE_DEST]) }
    });
+
+    // Index-only principle: scan first so /system/browse can show files
+    const scanResp = await requestContext.post(`${API_URL}/system/scan`);
+    if (!scanResp.ok()) {
+        console.error('Failed to trigger initial scan');
+    }
+    // Wait for scan to complete
+    const deadline = Date.now() + 30000;
+    while (Date.now() < deadline) {
+        const statusResp = await requestContext.get(`${API_URL}/system/scan/status`);
+        const status = await statusResp.json();
+        if (!status.is_running) {
+            break;
+        }
+        await new Promise(r => setTimeout(r, 500));
+    }
+
    await requestContext.dispose();
  });
Author	SHA1	Message	Date
adamlamers	1ff21e3c2c	filesystem view uses index only Continuous Integration / backend-tests (push) Successful in 33s Details Continuous Integration / e2e-tests (push) Successful in 12m19s Details Continuous Integration / frontend-check (push) Successful in 27s Details	2026-05-04 20:44:49 -04:00
adamlamers	544bd14cbb	don't reuse playwright server	2026-05-04 20:19:04 -04:00