tests & remove is_indexed flag from db
Continuous Integration / backend-tests (push) Successful in 11m41s
Continuous Integration / frontend-check (push) Successful in 10m54s

2026-04-28 23:55:39 -04:00
parent c8346f2401
commit 0bfd5affb4
25 changed files with 950 additions and 114 deletions
+11 -1
@@ -53,12 +53,22 @@ This document (`GEMINI.md`) contains critical, contextual information about the
### Scanning & Hashing Architecture ### Scanning & Hashing Architecture
* **Concurrent Phasing:** Decoupled into `SCAN` (Metadata, Normal priority) and `HASH` (Content, Idle priority with dynamic `iowait` throttling). * **Concurrent Phasing:** Decoupled into `SCAN` (Metadata, Normal priority) and `HASH` (Content, Idle priority with dynamic `iowait` throttling).
* **Thread-Safe Metrics:** All counters (files processed, bytes hashed) must be protected by a `threading.Lock`. * **Thread-Safe Metrics:** All counters (files processed, bytes hashed) must be protected by a `threading.Lock`.
* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `is_indexed = 0 AND is_ignored = 0` files. * **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `sha256_hash IS NULL AND is_ignored = 0` files.
### Archival & Recovery ### Archival & Recovery
* **Format Negotiation:** The Archiver adapts formats based on provider capabilities (`supports_random_access`). * **Format Negotiation:** The Archiver adapts formats based on provider capabilities (`supports_random_access`).
* *Sequential (Tape):* Uses `.tar` streams to maintain drive streaming. * *Sequential (Tape):* Uses `.tar` streams to maintain drive streaming.
* *Random Access (HDD/Cloud):* Uses native direct file copying/objects to enable instant seekless restores without unpacking gigabytes of data. * *Random Access (HDD/Cloud):* Uses native direct file copying/objects to enable instant seekless restores without unpacking gigabytes of data.
* **High-Speed Hybrid Archival:**
* The system prioritizes the **system `tar` binary** for whole-file chunks, delivering a 10x-20x performance boost over pure Python and ensuring optimal buffer saturation for LTO drives.
* It transparently falls back to the **Python `RangeFile` logic** only for chunks containing split fragments, maintaining bit-perfect alignment for multi-tape files.
* **Industrial Tar Chunking:**
* Large backup sets are automatically split into multiple independent archives. The system dynamically aims for at least **100 archives per tape** (calculated based on generational capacity, e.g., ~15GB for LTO-5) to provide high seek granularity during restoration.
* **Exception:** Single large files are allowed to occupy their own archives even if they exceed the target chunk size, preventing unnecessary fragmentation while keeping them as independent, seekable objects.
* **Refined Splitting Philosophy:**
* Files are **only split** if they are physically larger than the media's entire capacity (multi-tape spanning).
* **Skip-and-Defer:** If a file is larger than the remaining space on a tape but smaller than its total capacity, it is deferred to the next fresh medium to minimize fragmentation.
* **Hardware-First Utilization:** The system trusts **Physical Hardware Feedback (MAM)** over logical byte counts. Tapes are only marked as "Full" when the drive reporting (via `get_utilization`) confirms saturation, maximizing utilization when hardware compression is active.
* **Bitstream Integrity:** `RangeFile` must guarantee exact byte counts for tar alignment.
* **Metadata Fidelity:** The restorer must preserve original **permissions (chmod)**, **timestamps (utime)**, and **ownership (chown)** when recovering files natively or via tar.
* **Independence:** Force all tar archive members to be **Regular Files** to break fragile hard-link dependencies. Symlinks are preserved as `SYMTYPE` (or `.symlink` stub objects for native format).
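The splitting rules documented above reduce to a three-way decision per file. A minimal sketch of that decision, assuming illustrative names (`plan_file_placement` and the remaining/capacity figures are not the repo's actual API):

```python
# Hypothetical sketch of the skip-and-defer rule described in the notes above;
# function and parameter names are illustrative, not the project's real code.
def plan_file_placement(file_size: int, remaining: int, capacity: int) -> str:
    """Return how a file should be handled for the current medium."""
    if file_size <= remaining:
        return "write"  # fits on the current tape/disk as-is
    if file_size <= capacity:
        return "defer"  # skip-and-defer: wait for the next fresh medium
    return "split"      # larger than any single medium: multi-tape spanning


assert plan_file_placement(50, remaining=100, capacity=300) == "write"
assert plan_file_placement(200, remaining=100, capacity=300) == "defer"
assert plan_file_placement(500, remaining=100, capacity=300) == "split"
```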
@@ -0,0 +1,79 @@
"""remove is_indexed flag
Revision ID: 1b59e22b9b7a
Revises: fbbc0a40a840
Create Date: 2026-04-28 23:44:04.949304
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = "1b59e22b9b7a"
down_revision: Union[str, Sequence[str], None] = "fbbc0a40a840"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
"""Upgrade schema."""
# We use batch_alter_table for SQLite compatibility
with op.batch_alter_table("filesystem_state", schema=None) as batch_op:
batch_op.drop_column("is_indexed")
batch_op.alter_column(
"size",
existing_type=sa.INTEGER(),
type_=sa.BigInteger(),
existing_nullable=False,
)
with op.batch_alter_table("storage_media", schema=None) as batch_op:
batch_op.alter_column(
"capacity",
existing_type=sa.INTEGER(),
type_=sa.BigInteger(),
existing_nullable=False,
)
batch_op.alter_column(
"bytes_used",
existing_type=sa.INTEGER(),
type_=sa.BigInteger(),
existing_nullable=False,
)
def downgrade() -> None:
"""Downgrade schema."""
with op.batch_alter_table("storage_media", schema=None) as batch_op:
batch_op.alter_column(
"bytes_used",
existing_type=sa.BigInteger(),
type_=sa.INTEGER(),
existing_nullable=False,
)
batch_op.alter_column(
"capacity",
existing_type=sa.BigInteger(),
type_=sa.INTEGER(),
existing_nullable=False,
)
with op.batch_alter_table("filesystem_state", schema=None) as batch_op:
batch_op.alter_column(
"size",
existing_type=sa.BigInteger(),
type_=sa.INTEGER(),
existing_nullable=False,
)
batch_op.add_column(
sa.Column(
"is_indexed",
sa.BOOLEAN(),
server_default=sa.text("'0'"),
nullable=False,
)
)
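Assuming the project's standard Alembic setup, this revision would be applied with `alembic upgrade head`; the programmatic equivalent looks roughly like this (the `alembic.ini` path is an assumption):

```python
# Hedged example: applying the new revision via Alembic's Python API,
# equivalent to running `alembic upgrade head` from the backend directory.
from alembic import command
from alembic.config import Config

config = Config("alembic.ini")  # assumed location of the project's Alembic config
command.upgrade(config, "1b59e22b9b7a")  # or "head" to apply all pending revisions
```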
+3 -4
@@ -2,7 +2,7 @@ from datetime import datetime
from typing import List, Optional
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
from sqlalchemy.orm import Session
from app.db import models
@@ -17,15 +17,14 @@ router = APIRouter(prefix="/backups", tags=["Backups"])
class BackupJobSchema(BaseModel):
model_config = ConfigDict(from_attributes=True)
id: int
job_type: str
status: str
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
-class Config:
-from_attributes = True
# --- Endpoints ---
+11 -7
@@ -337,11 +337,11 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
SELECT
COUNT(*) as total_files,
SUM(size) as total_size,
-SUM(CASE WHEN is_indexed = 1 THEN size ELSE 0 END) as total_hashed_size,
+SUM(CASE WHEN sha256_hash IS NOT NULL THEN size ELSE 0 END) as total_hashed_size,
(SELECT SUM(min_size) FROM (
SELECT MIN(size) as min_size
FROM filesystem_state
-WHERE is_indexed = 1 AND is_ignored = 0
+WHERE sha256_hash IS NOT NULL AND is_ignored = 0
GROUP BY sha256_hash
)) as unique_hashed_size
FROM filesystem_state
@@ -453,7 +453,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
COUNT(*) as copy_count,
(size * (COUNT(*) - 1)) as saved_bytes
FROM filesystem_state
-WHERE is_indexed = 1 AND is_ignored = 0
+WHERE sha256_hash IS NOT NULL AND is_ignored = 0
GROUP BY sha256_hash
HAVING copy_count > 1
ORDER BY saved_bytes DESC
@@ -465,7 +465,8 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
directory_aggregation_sql = text("""
SELECT
RTRIM(file_path, REPLACE(file_path, '/', '')) as dir_path,
-SUM(size) as byte_total
+SUM(size) as byte_total,
MAX(mtime) as latest_mtime
FROM filesystem_state
WHERE is_ignored = 0
GROUP BY dir_path
@@ -474,7 +475,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
# Hierarchical tree construction
nested_dir_map = {}
-for path_str, size_val in all_directories:
+for path_str, size_val, mtime_val in all_directories:
if not path_str:
continue
path_segments = [p for p in path_str.split("/") if p]
@@ -492,10 +493,14 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
if segment not in current_node:
current_node[segment] = {
"size": 0,
"mtime": 0,
"children": {},
"fullPath": accumulated_path,
}
current_node[segment]["size"] += size_val
current_node[segment]["mtime"] = max(
current_node[segment]["mtime"], mtime_val or 0
)
current_node = current_node[segment]["children"]
# Collapse unhelpful single-child roots
@@ -517,6 +522,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
{
"path": key,
"size": value["size"],
"mtime": value["mtime"],
"fullPath": value["fullPath"],
"children": children_list,
}
@@ -877,12 +883,10 @@ def get_archive_item_metadata(path: str, db_session: Session = Depends(get_db)):
return ItemMetadataSchema(
id=item.id,
path=item.file_path,
-type="file",
size=item.size,
mtime=datetime.fromtimestamp(item.mtime, tz=timezone.utc),
last_seen_timestamp=item.last_seen_timestamp,
sha256_hash=item.sha256_hash,
-is_indexed=item.is_indexed,
is_ignored=item.is_ignored,
versions=versions,
)
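The directory roll-up introduced in this hunk folds each (dir_path, byte_total, latest_mtime) row into a nested map, summing sizes and keeping the newest mtime along every ancestor. A standalone sketch of that loop with faked query rows (path accumulation is assumed to mirror what the endpoint does):

```python
# Standalone sketch of the tree-building loop shown in the diff above; the real
# endpoint reads (dir_path, byte_total, latest_mtime) rows from SQL, faked here.
all_directories = [
    ("/data/photos/", 2048, 1700000000.0),
    ("/data/photos/raw/", 4096, 1710000000.0),
]

nested_dir_map = {}
for path_str, size_val, mtime_val in all_directories:
    if not path_str:
        continue
    path_segments = [p for p in path_str.split("/") if p]
    current_node = nested_dir_map
    accumulated_path = ""
    for segment in path_segments:
        accumulated_path = f"{accumulated_path}/{segment}"
        if segment not in current_node:
            current_node[segment] = {
                "size": 0,
                "mtime": 0,
                "children": {},
                "fullPath": accumulated_path,
            }
        # Sizes accumulate up the tree; mtime keeps the newest descendant's timestamp
        current_node[segment]["size"] += size_val
        current_node[segment]["mtime"] = max(current_node[segment]["mtime"], mtime_val or 0)
        current_node = current_node[segment]["children"]

assert nested_dir_map["data"]["size"] == 6144
assert nested_dir_map["data"]["mtime"] == 1710000000.0
```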
+6 -5
@@ -1,6 +1,6 @@
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
from sqlalchemy.orm import Session, joinedload
from sqlalchemy import text
from app.db.database import get_db, SessionLocal
@@ -17,14 +17,13 @@ router = APIRouter(prefix="/restores", tags=["Restores"])
class CartItemSchema(BaseModel):
model_config = ConfigDict(from_attributes=True)
id: int
file_path: str
size: int
type: str
-class Config:
-from_attributes = True
class RestoreTriggerRequest(BaseModel):
destination_path: str
@@ -115,7 +114,9 @@ def add_directory_to_recovery_queue(
AND rc.id IS NULL
""")
-db_session.execute(discovery_sql, {"prefix": f"{target_directory}%", "now": now})
+db_session.execute(
+discovery_sql, {"prefix": f"{target_directory}%", "now": now.isoformat()}
+)
db_session.commit()
total_in_queue = db_session.query(models.RestoreCart).count()
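The changed call binds `now.isoformat()` rather than a datetime object; with raw `text()` SQL against SQLite, the TEXT-affinity DATETIME column stores whatever string is bound, so an explicit ISO-8601 string keeps the stored value predictable. A minimal illustration of what ends up bound (values are made up):

```python
# Hedged illustration only: shows the parameter dict shape bound to the raw SQL.
from datetime import datetime, timezone

now = datetime.now(timezone.utc)
params = {"prefix": "/data/projects%", "now": now.isoformat()}
# e.g. {'prefix': '/data/projects%', 'now': '2026-04-28T23:44:04.949304+00:00'}
```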
+2 -4
@@ -1,4 +1,4 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
from typing import Optional, List, Dict, Any
from datetime import datetime
@@ -17,7 +17,6 @@ class ItemMetadataSchema(BaseModel):
mtime: datetime
last_seen_timestamp: Optional[datetime] = None
sha256_hash: Optional[str] = None
-is_indexed: bool = False
is_ignored: bool = False
child_count: Optional[int] = 0
selected: bool = False
@@ -44,8 +43,7 @@ class MediaSchema(BaseModel):
host_total_bytes: Optional[int] = None
live_info: Optional[Dict[str, Any]] = None
-class Config:
-from_attributes = True
+model_config = ConfigDict(from_attributes=True)
class MediaCreateSchema(BaseModel):
+4 -5
@@ -8,7 +8,7 @@ import pathspec
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from fastapi.responses import FileResponse, StreamingResponse
from loguru import logger
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
from sqlalchemy import func, text
from sqlalchemy.orm import Session
@@ -37,6 +37,8 @@ class DashboardStatsSchema(BaseModel):
class JobSchema(BaseModel):
model_config = ConfigDict(from_attributes=True)
id: int
job_type: str
status: str
@@ -47,9 +49,6 @@ class JobSchema(BaseModel):
completed_at: Optional[datetime] = None
created_at: datetime
-class Config:
-from_attributes = True
class FileItemSchema(BaseModel):
name: str
@@ -172,7 +171,7 @@ def get_dashboard_stats(db_session: Session = Depends(get_db)):
SUM(CASE WHEN is_ignored = 1 THEN size ELSE 0 END) as ignored_size,
SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN 1 ELSE 0 END) as unprotected_count,
SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as unprotected_size,
-SUM(CASE WHEN is_indexed = 1 AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count,
+SUM(CASE WHEN sha256_hash IS NOT NULL AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count,
SUM(CASE WHEN is_ignored = 0 THEN 1 ELSE 0 END) as eligible_count,
SUM(CASE WHEN id IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as archived_size
FROM filesystem_state
+31
@@ -0,0 +1,31 @@
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""
Standardized secret management and application configuration.
Values can be overridden via environment variables or a .env file.
"""
model_config = SettingsConfigDict(
env_file=".env", env_file_encoding="utf-8", extra="ignore"
)
# Database
database_url: str = "sqlite:///./tapehoard.db"
# Security / Encryption
# Standardized secret management pattern
encryption_passphrase: str = "tapehoard-default-insecure-passphrase"
# Staging
staging_directory: str = "/staging"
# Cloud Defaults
default_s3_region: str = "us-east-1"
# Hardware Detection
default_tape_drive: str = "/dev/nst0"
settings = Settings()
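A quick usage sketch of the new settings module: pydantic-settings resolves each field from the environment (or a `.env` file) before falling back to the defaults above. The passphrase value here is made up:

```python
# Hedged usage sketch of the new app.core.config module.
import os

os.environ["ENCRYPTION_PASSPHRASE"] = "super-secret"  # normally set outside the app

from app.core.config import Settings

settings = Settings()
assert settings.encryption_passphrase == "super-secret"
assert settings.default_tape_drive == "/dev/nst0"  # unchanged default
```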
-1
@@ -19,7 +19,6 @@ class FilesystemState(Base):
sha256_hash: Mapped[Optional[str]] = mapped_column(
String, index=True, nullable=True
)
-is_indexed: Mapped[bool] = mapped_column(Boolean, default=False) # True if hashed
is_ignored: Mapped[bool] = mapped_column(
Boolean, default=False
) # True if matches exclusion
+93
@@ -0,0 +1,93 @@
CREATE TABLE filesystem_state (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT UNIQUE,
size BIGINT,
mtime FLOAT,
sha256_hash TEXT,
is_ignored BOOLEAN DEFAULT 0,
last_seen_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE storage_media (
id INTEGER PRIMARY KEY AUTOINCREMENT,
media_type TEXT,
identifier TEXT UNIQUE,
generation_tier TEXT,
capacity BIGINT,
bytes_used BIGINT DEFAULT 0,
location TEXT,
status TEXT DEFAULT 'active',
extra_config TEXT,
priority_index INTEGER DEFAULT 0,
last_seen DATETIME,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE file_versions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
filesystem_state_id INTEGER,
media_id INTEGER,
file_number TEXT,
offset_in_tar INTEGER,
is_split BOOLEAN DEFAULT 0,
split_id TEXT,
offset_start BIGINT DEFAULT 0,
offset_end BIGINT DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id),
FOREIGN KEY(media_id) REFERENCES storage_media(id)
);
CREATE TABLE tracked_sources (
id INTEGER PRIMARY KEY AUTOINCREMENT,
path TEXT UNIQUE,
is_directory BOOLEAN DEFAULT 1,
action TEXT DEFAULT 'include',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE restore_cart (
id INTEGER PRIMARY KEY AUTOINCREMENT,
filesystem_state_id INTEGER,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id)
);
CREATE TABLE jobs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
job_type TEXT,
status TEXT DEFAULT 'PENDING',
progress FLOAT DEFAULT 0.0,
current_task TEXT,
error_message TEXT,
started_at DATETIME,
completed_at DATETIME,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE system_settings (
key TEXT PRIMARY KEY,
value TEXT,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
-- FTS5 Virtual Table for Instant Search
CREATE VIRTUAL TABLE filesystem_fts USING fts5(
file_path,
content='filesystem_state',
content_rowid='id'
);
-- Trigger to keep FTS5 synchronized with real state
CREATE TRIGGER filesystem_fts_insert AFTER INSERT ON filesystem_state BEGIN
INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path);
END;
CREATE TRIGGER filesystem_fts_delete AFTER DELETE ON filesystem_state BEGIN
INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path);
END;
CREATE TRIGGER filesystem_fts_update AFTER UPDATE ON filesystem_state BEGIN
INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path);
INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path);
END;
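With the FTS5 table and triggers above, instant path search is a MATCH query joined back to `filesystem_state` by rowid. A hedged sketch using the stock sqlite3 module (database filename and query term are illustrative):

```python
# Hedged sketch of querying the filesystem_fts index maintained by the triggers above.
import sqlite3

conn = sqlite3.connect("tapehoard.db")  # assumed SQLite file from the default config
rows = conn.execute(
    """
    SELECT fs.file_path, fs.size
    FROM filesystem_fts
    JOIN filesystem_state AS fs ON fs.id = filesystem_fts.rowid
    WHERE filesystem_fts MATCH ?
    ORDER BY rank
    LIMIT 50
    """,
    ("report*",),
).fetchall()
```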
+38 -10
@@ -1,3 +1,4 @@
import hashlib
import boto3
import os
import io
@@ -10,6 +11,8 @@ from Crypto.Cipher import AES
from Crypto.Protocol.KDF import PBKDF2
from Crypto.Hash import SHA256
from app.core.config import settings
class CloudStorageProvider(AbstractStorageProvider):
provider_id = "s3_compat"
@@ -48,6 +51,12 @@ class CloudStorageProvider(AbstractStorageProvider):
"title": "Client-Side Encryption Passphrase",
"description": "Used to encrypt data locally before uploading via AES-256-GCM.",
},
"obfuscate_filenames": {
"type": "boolean",
"title": "Obfuscate Filenames",
"description": "Store files as SHA-256 hashes in the cloud to hide metadata.",
"default": False,
},
}
def __init__(self, config: Dict[str, Any]):
@@ -55,9 +64,12 @@ class CloudStorageProvider(AbstractStorageProvider):
self.bucket_name = config.get("bucket_name")
self.region = config.get("region", "us-east-1")
self.endpoint_url = config.get("endpoint_url")
self.obfuscate = config.get("obfuscate_filenames", False)
-# Local Encryption Settings
-self.passphrase = config.get("encryption_passphrase")
+# Local Encryption Settings: Use provided or global default
+self.passphrase = (
+config.get("encryption_passphrase") or settings.encryption_passphrase
+)
# Credentials
access_key = config.get("access_key")
@@ -80,6 +92,16 @@ class CloudStorageProvider(AbstractStorageProvider):
self.passphrase, salt, dkLen=32, count=100000, hmac_hash_module=SHA256
)
def _get_obfuscated_key(self, prefix: str, path: str) -> str:
"""Returns a hashed version of the filename if obfuscation is enabled."""
if not self.obfuscate:
return f"{prefix}/{path.lstrip('./')}"
# We hash the path to hide metadata while keeping it deterministic
hashed = hashlib.sha256(path.encode("utf-8")).hexdigest()
# Use two-level sharding to prevent S3 prefix performance issues with 100k+ files
return f"{prefix}/{hashed[:2]}/{hashed[2:4]}/{hashed}"
def get_name(self) -> str:
return f"Cloud ({self.provider_type})"
@@ -122,7 +144,7 @@ class CloudStorageProvider(AbstractStorageProvider):
try:
self.s3.head_bucket(Bucket=self.bucket_name)
paginator = self.s3.get_paginator("list_objects_v2")
-for page in paginator.paginate(Bucket=self.bucket_name, Prefix="archives/"):
+for page in paginator.paginate(Bucket=self.bucket_name):
if "Contents" in page:
objects = [{"Key": obj["Key"]} for obj in page["Contents"]]
self.s3.delete_objects(
@@ -148,7 +170,9 @@ class CloudStorageProvider(AbstractStorageProvider):
"""
import uuid
-object_key = f"archives/{uuid.uuid4().hex}.tar"
+# Archives are always obfuscated by UUID for privacy and collision avoidance
+raw_key = f"archives/{uuid.uuid4().hex}.tar"
+object_key = self._get_obfuscated_key("archives", raw_key)
if self.passphrase:
logger.info(
@@ -166,7 +190,6 @@ class CloudStorageProvider(AbstractStorageProvider):
ciphertext, tag = cipher.encrypt_and_digest(data)
# 3. Concatenate and upload
-# We store everything needed for decryption in the blob itself for maximum portability
payload = salt + nonce + tag + ciphertext
try:
@@ -174,7 +197,10 @@ class CloudStorageProvider(AbstractStorageProvider):
Bucket=self.bucket_name,
Key=object_key,
Body=payload,
-Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"},
+Metadata={
+"x-amz-meta-tapehoard-encrypted": "v2-gcm",
+"x-amz-meta-tapehoard-type": "archive",
+},
)
return object_key
except Exception as e:
@@ -193,10 +219,9 @@ class CloudStorageProvider(AbstractStorageProvider):
self, media_id: str, relative_path: str, stream: BinaryIO
) -> str:
"""
-Uploads a single file directly to the cloud, maintaining structure.
+Uploads a single file directly to the cloud.
"""
-clean_path = relative_path.lstrip("./")
-object_key = f"objects/{clean_path}"
+object_key = self._get_obfuscated_key("objects", relative_path)
if self.passphrase:
logger.info(
@@ -221,7 +246,10 @@ class CloudStorageProvider(AbstractStorageProvider):
Bucket=self.bucket_name,
Key=object_key,
Body=payload,
-Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"},
+Metadata={
+"x-amz-meta-tapehoard-encrypted": "v2-gcm",
+"x-amz-meta-tapehoard-type": "object",
+},
)
return object_key
except Exception as e:
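For reference, reading one of these blobs back follows the salt + nonce + tag + ciphertext layout shown above. This is a hedged sketch, not the provider's actual restore path; the 16-byte field sizes are assumptions based on PyCryptodome's GCM defaults:

```python
# Hedged decryption sketch for the salt+nonce+tag+ciphertext payload layout above.
from Crypto.Cipher import AES
from Crypto.Hash import SHA256
from Crypto.Protocol.KDF import PBKDF2


def decrypt_blob(payload: bytes, passphrase: str) -> bytes:
    # Field sizes are assumed (16 bytes each); the real provider may differ.
    salt, nonce, tag = payload[:16], payload[16:32], payload[32:48]
    ciphertext = payload[48:]
    key = PBKDF2(passphrase, salt, dkLen=32, count=100000, hmac_hash_module=SHA256)
    cipher = AES.new(key, AES.MODE_GCM, nonce=nonce)
    return cipher.decrypt_and_verify(ciphertext, tag)
```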
+28 -7
@@ -145,7 +145,6 @@ class ArchiverService:
models.FilesystemState.id == coverage_subquery.c.filesystem_state_id,
)
.filter(
-models.FilesystemState.is_indexed,
not_(models.FilesystemState.is_ignored),
(coverage_subquery.c.covered_bytes.is_(None))
| (coverage_subquery.c.covered_bytes < models.FilesystemState.size),
@@ -723,11 +722,7 @@ class ArchiverService:
media_record.identifier, archive_id
)
-is_tar = (
-not provider.capabilities.get("supports_random_access")
-or str(archive_id).endswith(".tar")
-or str(archive_id).isdigit()
-)
+is_tar = self._test_is_tar_logic(provider, media_record, archive_id)
if not is_tar:
# Format Negotiation: Direct file recovery
@@ -778,7 +773,12 @@ class ArchiverService:
)
continue
-tar_mode = "r|*" if media_record.media_type == "tape" else "r:*"
+tar_mode = (
+"r|*"
+if media_record.media_type
+in ["tape", "lto_tape", "cloud", "s3_compat"]
+else "r:*"
+)
with tarfile.open(fileobj=bitstream, mode=tar_mode) as tar_bundle:
normalized_map = {}
@@ -854,5 +854,26 @@ class ArchiverService:
logger.exception(f"Restore failed: {e}")
JobManager.fail_job(job_id, str(e))
def _test_is_tar_logic(self, provider, media_record, archive_id: str) -> bool:
"""Internal logic to determine if a location_id refers to a tarball vs a native file."""
# 1. If provider doesn't support random access (Tape), it's ALWAYS a tar stream.
if not provider.capabilities.get("supports_random_access"):
return True
# 2. Explicit extensions or prefixes
if str(archive_id).endswith(".tar") or "archives/" in str(archive_id):
return True
# 3. Tape file numbers (if they happen to be passed here for cloud, which is rare)
if str(archive_id).isdigit() and media_record.media_type in [
"tape",
"lto_tape",
]:
return True
# 4. Otherwise, if it has random access (Cloud/HDD) and isn't explicitly an archive,
# it's a native file.
return False
archiver_manager = ArchiverService()
+60 -34
@@ -29,55 +29,83 @@ class JobManager:
@staticmethod
def start_job(job_id: int):
"""Marks a job as running and sets the start timestamp."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
-job_record = db_session.get(models.Job, job_id)
-if job_record:
-job_record.status = "RUNNING"
-job_record.started_at = datetime.now(timezone.utc)
-db_session.commit()
+try:
+job_record = db_session.get(models.Job, job_id)
+if job_record:
+job_record.status = "RUNNING"
+job_record.started_at = datetime.now(timezone.utc)
+db_session.commit()
+except StaleDataError:
+db_session.rollback()
+logger.debug(
+f"Job {job_id} already modified or deleted (StaleDataError)."
+)
@staticmethod
def update_job(job_id: int, progress: float, current_task: str):
"""Updates the progress and current task description for a job."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
-job_record = db_session.get(models.Job, job_id)
-if job_record:
-job_record.progress = progress
-job_record.current_task = current_task
-db_session.commit()
+try:
+job_record = db_session.get(models.Job, job_id)
+if job_record:
+job_record.progress = progress
+job_record.current_task = current_task
+db_session.commit()
+except StaleDataError:
+db_session.rollback()
@staticmethod
def complete_job(job_id: int):
"""Marks a job as successfully completed."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
-job_record = db_session.get(models.Job, job_id)
-if job_record:
-job_record.status = "COMPLETED"
-job_record.progress = 100.0
-job_record.completed_at = datetime.now(timezone.utc)
-db_session.commit()
+try:
+job_record = db_session.get(models.Job, job_id)
+if job_record:
+job_record.status = "COMPLETED"
+job_record.progress = 100.0
+job_record.completed_at = datetime.now(timezone.utc)
+db_session.commit()
+except StaleDataError:
+db_session.rollback()
@staticmethod
def fail_job(job_id: int, error_message: str):
"""Marks a job as failed and records the error message."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
-job_record = db_session.get(models.Job, job_id)
-if job_record:
-job_record.status = "FAILED"
-job_record.error_message = error_message
-job_record.completed_at = datetime.now(timezone.utc)
-db_session.commit()
+try:
+job_record = db_session.get(models.Job, job_id)
+if job_record:
+job_record.status = "FAILED"
+job_record.error_message = error_message
+job_record.completed_at = datetime.now(timezone.utc)
+db_session.commit()
+except StaleDataError:
+db_session.rollback()
@staticmethod
def cancel_job(job_id: int):
"""Submits a cancellation request for a pending or running job."""
from sqlalchemy.orm.exc import StaleDataError
with SessionLocal() as db_session:
-job_record = db_session.get(models.Job, job_id)
-if job_record and job_record.status in ["PENDING", "RUNNING"]:
-job_record.status = "FAILED"
-job_record.error_message = "Cancelled by user"
-job_record.completed_at = datetime.now(timezone.utc)
-db_session.commit()
+try:
+job_record = db_session.get(models.Job, job_id)
+if job_record and job_record.status in ["PENDING", "RUNNING"]:
+job_record.status = "FAILED"
+job_record.error_message = "Cancelled by user"
+job_record.completed_at = datetime.now(timezone.utc)
+db_session.commit()
+except StaleDataError:
+db_session.rollback()
@staticmethod
def is_cancelled(job_id: int) -> bool:
@@ -354,7 +382,6 @@ class ScannerService:
mtime=file_meta["mtime"],
is_ignored=file_meta["ignored"],
last_seen_timestamp=timestamp,
-is_indexed=False,
)
)
else:
@@ -363,7 +390,7 @@ class ScannerService:
or record.mtime != file_meta["mtime"]
)
if metadata_changed:
-record.is_indexed = False
+record.sha256_hash = None
with self._metrics_lock:
self.files_modified += 1
@@ -396,7 +423,7 @@ class ScannerService:
total_pending = (
db_session.query(models.FilesystemState)
.filter(
-models.FilesystemState.is_indexed.is_(False),
+models.FilesystemState.sha256_hash.is_(None),
models.FilesystemState.is_ignored.is_(False),
)
.count()
@@ -408,7 +435,7 @@ class ScannerService:
hashing_targets = (
db_session.query(models.FilesystemState)
.filter(
-models.FilesystemState.is_indexed.is_(False),
+models.FilesystemState.sha256_hash.is_(None),
models.FilesystemState.is_ignored.is_(False),
)
.limit(100)
@@ -422,7 +449,7 @@ class ScannerService:
total_pending = (
db_session.query(models.FilesystemState)
.filter(
-models.FilesystemState.is_indexed.is_(False),
+models.FilesystemState.sha256_hash.is_(None),
models.FilesystemState.is_ignored.is_(False),
)
.count()
@@ -451,7 +478,6 @@ class ScannerService:
if computed_hash:
target_record.sha256_hash = computed_hash
-target_record.is_indexed = True
self.files_hashed += 1
if self.files_hashed % 5 == 0:
-2
@@ -66,7 +66,6 @@ def test_get_insights(client, db_session):
file_path="/source/f1.txt", file_path="/source/f1.txt",
size=100, size=100,
mtime=1000, mtime=1000,
is_indexed=True,
sha256_hash="hash1", sha256_hash="hash1",
) )
db_session.add(file1) db_session.add(file1)
@@ -127,7 +126,6 @@ def test_search_index(client, db_session):
file_path="data/important.doc", file_path="data/important.doc",
size=500, size=500,
mtime=2000, mtime=2000,
is_indexed=True,
sha256_hash="hash", sha256_hash="hash",
) )
db_session.add(file1) db_session.add(file1)
-1
@@ -21,7 +21,6 @@ def test_get_dashboard_stats_populated(client, db_session):
size=1024,
mtime=datetime.now(timezone.utc).timestamp(),
is_ignored=False,
-is_indexed=True,
)
db_session.add(file_state)
db_session.commit()
+44
@@ -0,0 +1,44 @@
from app.core.utils import get_path_uuid
def test_get_path_uuid_macos(mocker):
"""Verifies UUID extraction from diskutil on macOS."""
mocker.patch("os.path.exists", return_value=True)
mocker.patch("sys.platform", "darwin")
mock_run = mocker.patch("subprocess.run")
mock_res = mocker.MagicMock()
mock_res.stdout = """
Device Identifier: disk4s1
Device Node: /dev/disk4s1
Volume Name: MyBackups
Volume UUID: ABCDEF-1234-5678-90AB-CDEF12345678
"""
mock_run.return_value = mock_res
uuid = get_path_uuid("/Volumes/MyBackups")
assert uuid == "ABCDEF-1234-5678-90AB-CDEF12345678"
mock_run.assert_called_with(
["diskutil", "info", "/Volumes/MyBackups"], capture_output=True, text=True
)
def test_get_path_uuid_linux(mocker):
"""Verifies UUID extraction from lsblk on Linux."""
mocker.patch("os.path.exists", return_value=True)
mocker.patch("sys.platform", "linux")
mock_run = mocker.patch("subprocess.run")
def mock_subprocess(cmd, **kwargs):
m = mocker.MagicMock()
if "df" in cmd:
m.stdout = "Filesystem\n/dev/sdb1"
elif "lsblk" in cmd:
m.stdout = "98765432-ABCD-EF01-2345-6789ABCDEF01"
return m
mock_run.side_effect = mock_subprocess
uuid = get_path_uuid("/mnt/hdd")
assert uuid == "98765432-ABCD-EF01-2345-6789ABCDEF01"
+65
@@ -0,0 +1,65 @@
import hashlib
import pytest
from app.providers.cloud import CloudStorageProvider
def test_cloud_provider_obfuscation_logic():
"""Verifies that filename hashing and sharding works as expected."""
# CASE 1: Obfuscation Disabled
config_plain = {
"bucket_name": "test-bucket",
"obfuscate_filenames": False,
"access_key": "fake",
"secret_key": "fake",
}
provider_plain = CloudStorageProvider(config_plain)
path = "documents/secret_plan.pdf"
# Expectation: Key is exactly the path with prefix
key_plain = provider_plain._get_obfuscated_key("objects", path)
assert key_plain == "objects/documents/secret_plan.pdf"
# CASE 2: Obfuscation Enabled
config_hidden = {
"bucket_name": "test-bucket",
"obfuscate_filenames": True,
"access_key": "fake",
"secret_key": "fake",
}
provider_hidden = CloudStorageProvider(config_hidden)
# Expectation: Key is hashed and sharded
# hash of "documents/secret_plan.pdf"
expected_hash = hashlib.sha256(path.encode("utf-8")).hexdigest()
expected_prefix = f"objects/{expected_hash[:2]}/{expected_hash[2:4]}"
key_hidden = provider_hidden._get_obfuscated_key("objects", path)
assert key_hidden.startswith("objects/")
assert key_hidden == f"{expected_prefix}/{expected_hash}"
assert "secret_plan.pdf" not in key_hidden
def test_cloud_secret_fallback(mocker):
"""Verifies that the provider prioritizes local config over global settings for passphrases."""
from app.core.config import settings
# Mock global settings
mocker.patch.object(settings, "encryption_passphrase", "global-fallback")
# CASE 1: Local config provides passphrase
config_local = {"bucket_name": "b", "encryption_passphrase": "local-override"}
provider_local = CloudStorageProvider(config_local)
assert provider_local.passphrase == "local-override"
# CASE 2: Local config is empty, should fallback to global
config_empty = {"bucket_name": "b"}
provider_fallback = CloudStorageProvider(config_empty)
assert provider_fallback.passphrase == "global-fallback"
# CASE 3: No passphrase anywhere (ValueError on key derivation)
mocker.patch.object(settings, "encryption_passphrase", "")
provider_none = CloudStorageProvider({"bucket_name": "b"})
with pytest.raises(ValueError, match="No encryption passphrase configured"):
provider_none._derive_key(b"salt")
+99
@@ -0,0 +1,99 @@
import io
import pytest
from app.providers.hdd import OfflineHDDProvider
def test_hdd_initialization(tmp_path):
"""Verifies that HDD provider correctly prepares the disk structure."""
mount = tmp_path / "mnt_hdd"
provider = OfflineHDDProvider({"mount_path": str(mount)})
success = provider.initialize_media("DISK01")
assert success is True
assert (mount / ".tapehoard_id").exists()
assert (mount / ".tapehoard_id").read_text().strip() == "DISK01"
assert (mount / "tapehoard_backups" / "archives").is_dir()
def test_hdd_identification(tmp_path):
"""Verifies that HDD provider can identify a disk by its ID file."""
mount = tmp_path / "mnt_hdd"
mount.mkdir()
(mount / ".tapehoard_id").write_text("DISK02")
provider = OfflineHDDProvider({"mount_path": str(mount)})
assert provider.identify_media() == "DISK02"
def test_hdd_write_sequential_logic(tmp_path):
"""Verifies that archives are numbered and padded correctly."""
mount = tmp_path / "mnt_hdd"
provider = OfflineHDDProvider({"mount_path": str(mount)})
provider.initialize_media("DISK03")
# Write first archive
loc1 = provider.write_archive("DISK03", io.BytesIO(b"archive1"))
assert loc1 == "0"
assert (mount / "tapehoard_backups" / "archives" / "000000.tar").exists()
# Write second archive
loc2 = provider.write_archive("DISK03", io.BytesIO(b"archive2"))
assert loc2 == "1"
assert (mount / "tapehoard_backups" / "archives" / "000001.tar").exists()
# Read them back
with provider.read_archive("DISK03", "0") as f:
assert f.read() == b"archive1"
with provider.read_archive("DISK03", "1") as f:
assert f.read() == b"archive2"
def test_hdd_direct_write_and_traversal_guard(tmp_path):
"""Verifies direct file copies and security guards."""
mount = tmp_path / "mnt_hdd"
provider = OfflineHDDProvider({"mount_path": str(mount)})
# Valid direct write
rel_path = "photos/holiday.jpg"
loc = provider.write_file_direct("DISK04", rel_path, io.BytesIO(b"imagedata"))
assert loc == rel_path
target = mount / "tapehoard_backups" / "objects" / rel_path
assert target.exists()
assert target.read_bytes() == b"imagedata"
# Read back (Format Negotiation)
with provider.read_archive("DISK04", rel_path) as f:
assert f.read() == b"imagedata"
# Traversal Attempt
with pytest.raises(ValueError, match="Invalid relative path"):
provider.write_file_direct(
"DISK04", "escape/../../secret.txt", io.BytesIO(b"bad")
)
def test_hdd_online_check_with_uuid(tmp_path, mocker):
"""Verifies that online check validates both path and UUID."""
mount = tmp_path / "mnt_hdd"
mount.mkdir()
# Mock UUID utility
mock_get_uuid = mocker.patch("app.core.utils.get_path_uuid")
mock_get_uuid.return_value = "UUID-123"
# CASE 1: Correct UUID
provider_ok = OfflineHDDProvider(
{"mount_path": str(mount), "device_uuid": "UUID-123"}
)
assert provider_ok.check_online() is True
# CASE 2: Mismatched UUID
provider_fail = OfflineHDDProvider(
{"mount_path": str(mount), "device_uuid": "UUID-999"}
)
assert provider_fail.check_online() is False
# CASE 3: No UUID (Path only)
provider_path_only = OfflineHDDProvider({"mount_path": str(mount)})
assert provider_path_only.check_online() is True
+122
@@ -0,0 +1,122 @@
from app.providers.tape import LTOProvider
def test_lto_compression_control(mocker):
"""Verifies that LTOProvider sends the correct mt commands for compression."""
# Mock subprocess.run and os.path.exists
mock_run = mocker.patch("subprocess.run")
mocker.patch("os.path.exists", return_value=True)
# CASE 1: Compression Enabled
provider_on = LTOProvider({"device_path": "/dev/nst0", "compression": True})
provider_on._setup_compression()
# Expectation: mt compression 1
mock_run.assert_called_with(
["mt", "-f", "/dev/nst0", "compression", "1"], check=True, capture_output=True
)
# CASE 2: Compression Disabled
mock_run.reset_mock()
provider_off = LTOProvider({"device_path": "/dev/nst0", "compression": False})
provider_off._setup_compression()
# Expectation: mt compression 0
mock_run.assert_called_with(
["mt", "-f", "/dev/nst0", "compression", "0"], check=True, capture_output=True
)
def test_lto_insertion_detection(mocker):
"""Verifies that needs_registration triggers only on new insertion."""
device = "/dev/nst0"
# 1. Start with Empty State
LTOProvider._lkg_state = {
device: {"drive": {}, "mam": {}, "online": False, "last_check": 0.0}
}
provider = LTOProvider({"device_path": device})
# Mock OS path existence
mocker.patch("os.path.exists", return_value=True)
# Mock subprocess.run to return "READY" for status and valid MAM attributes
def mock_subprocess(cmd, **kwargs):
m = mocker.MagicMock()
m.returncode = 0
if "status" in cmd:
m.stdout = "READY"
elif "sg_read_attr" in cmd:
# Mock raw MAM data bytes (minimal valid structure)
# barcode "TAPE01" is at the end of the mapping usually
# But we patch get_mam_info to be easier
pass
return m
mocker.patch("subprocess.run", side_effect=mock_subprocess)
# We patch get_mam_info because parsing raw bytes in a test is overkill
mocker.patch.object(provider, "get_mam_info", return_value={"barcode": "TAPE01"})
mocker.patch.object(provider, "get_drive_info", return_value={"vendor": "HP"})
# Execution: First poll after "insertion" (transition from offline LKG to online)
info = provider.get_live_info(force=True)
# EXPECTATION: needs_registration should be True
assert info["online"] is True
assert info["identity"] == "TAPE01"
assert info["needs_registration"] is True
# 2. Second poll (already online)
# LKG state is now updated by the provider during the first poll
info_second = provider.get_live_info(force=True)
# EXPECTATION: needs_registration should be False (already saw this tape)
assert info_second["needs_registration"] is False
def test_lto_mam_parsing_logic(mocker):
"""Verifies the parsing of raw SCSI attribute bytes into human-readable metadata."""
import struct
device = "/dev/nst0"
provider = LTOProvider({"device_path": device})
mocker.patch("os.path.exists", return_value=True)
# 1. CONSTRUCT RAW PAYLOAD
# SCSI MAM raw format: [TotalLen(4)] + { [ID(2)][Flags(1)][Len(2)][Value] }...
def pack_attr(attr_id, value_bytes):
return struct.pack(">HBH", attr_id, 0, len(value_bytes)) + value_bytes
# Remaining Capacity (0x0000): 500GB (512000 MiB)
attr_rem = pack_attr(0x0000, (512000).to_bytes(8, "big"))
# Max Capacity (0x0001): 1.5TB (1536000 MiB - LTO-5 raw)
attr_max = pack_attr(0x0001, (1536000).to_bytes(8, "big"))
# Tape Alert Flags (0x0002): Bit 20 is "Clean Now"
# Bit 20 from left (64-bit int) -> (1 << (64-20))
alert_flags = 1 << (64 - 20)
attr_alerts = pack_attr(0x0002, alert_flags.to_bytes(8, "big"))
# Barcode (0x0806): "TAPE123"
attr_barcode = pack_attr(0x0806, b"TAPE123\x00\x00") # null padded
payload_body = attr_rem + attr_max + attr_alerts + attr_barcode
full_payload = struct.pack(">I", len(payload_body)) + payload_body
# 2. MOCK SUBPROCESS
mock_res = mocker.MagicMock()
mock_res.returncode = 0
mock_res.stdout = full_payload
mocker.patch("subprocess.run", return_value=mock_res)
# 3. EXECUTION
mam = provider.get_mam_info(force=True)
# 4. EXPECTATIONS
assert mam["remaining_capacity_mib"] == 512000
assert mam["max_capacity_mib"] == 1536000
assert mam["barcode"] == "TAPE123"
assert "Clean Now" in mam["alerts"]
# 1.5TB should be identified as LTO-5
assert mam["generation_label"] == "LTO-5"
+226 -22
@@ -1,5 +1,6 @@
import tarfile
import io
import pytest
from app.services.archiver import ArchiverService, RangeFile
from app.db import models
@@ -24,17 +25,11 @@ def test_get_unbacked_files(db_session):
archiver = ArchiverService()
# File 1: Completely unbacked
-f1 = models.FilesystemState(
-file_path="/data/new.txt", size=100, mtime=1, is_indexed=True
-)
+f1 = models.FilesystemState(file_path="/data/new.txt", size=100, mtime=1)
# File 2: Partially backed (split)
-f2 = models.FilesystemState(
-file_path="/data/split.bin", size=1000, mtime=1, is_indexed=True
-)
+f2 = models.FilesystemState(file_path="/data/split.bin", size=1000, mtime=1)
# File 3: Fully backed
-f3 = models.FilesystemState(
-file_path="/data/done.txt", size=50, mtime=1, is_indexed=True
-)
+f3 = models.FilesystemState(file_path="/data/done.txt", size=50, mtime=1)
db_session.add_all([f1, f2, f3])
db_session.flush()
@@ -89,18 +84,12 @@ def test_assemble_backup_batch(db_session):
# Create files
size_200mb = 200 * 1024 * 1024
-f1 = models.FilesystemState(
-file_path="/f1.bin", size=size_200mb, mtime=1, is_indexed=True
-)
+f1 = models.FilesystemState(file_path="/f1.bin", size=size_200mb, mtime=1)
# f2 is 200MB, would fit on fresh tape. Should be skipped on this 300MB tape
# after f1 (200MB) is added, because only 100MB is left.
-f2 = models.FilesystemState(
-file_path="/f2.bin", size=size_200mb, mtime=1, is_indexed=True
-)
+f2 = models.FilesystemState(file_path="/f2.bin", size=size_200mb, mtime=1)
# f3 is 500MB, larger than total capacity (300MB). SHOULD be split.
-f3 = models.FilesystemState(
-file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1, is_indexed=True
-)
+f3 = models.FilesystemState(file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1)
db_session.add_all([f1, f2, f3])
db_session.commit()
@@ -138,7 +127,6 @@ def test_run_backup_mocked(db_session, mocker, tmp_path):
file_path=str(source_file),
size=source_file.stat().st_size,
mtime=1,
-is_indexed=True,
sha256_hash="hash1",
)
db_session.add(f1)
@@ -164,14 +152,227 @@ def test_run_backup_mocked(db_session, mocker, tmp_path):
db_session.expire_all()
assert media.bytes_used > 0
-# Verify FileVersion was recorded
def test_archiver_saturated_media_logic(db_session, mocker, tmp_path):
"""Verifies that media is marked full and priority ceded based on hardware feedback."""
staging = tmp_path / "staging"
staging.mkdir()
archiver = ArchiverService(staging_directory=str(staging))
# Setup Media (HDD for simple mocking)
media = models.StorageMedia(
media_type="hdd",
identifier="FULL_DISK",
capacity=1000,
status="active",
bytes_used=0,
priority_index=1,
)
db_session.add(media)
# Other media to check priority reordering
media2 = models.StorageMedia(
media_type="hdd",
identifier="NEXT_DISK",
capacity=1000,
status="active",
bytes_used=0,
priority_index=2,
)
db_session.add(media2)
# Setup a small file to trigger the loop
source_file = tmp_path / "small.txt"
source_file.write_bytes(b"data")
f1 = models.FilesystemState(
file_path=str(source_file),
size=4,
mtime=1,
sha256_hash="hash_small",
)
db_session.add(f1)
db_session.commit()
# Mock Provider to report 99% utilization
mock_provider = mocker.MagicMock()
mock_provider.capabilities = {"supports_random_access": True}
mock_provider.identify_media.return_value = "FULL_DISK"
mock_provider.prepare_for_write.return_value = True
# Ensure write_file_direct returns a string, not a MagicMock
mock_provider.write_file_direct.return_value = "LOC_1"
# FORCE hardware utilization report to 99%
mock_provider.get_utilization.return_value = 0.99
mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider)
from app.services.scanner import JobManager
job = JobManager.create_job(db_session, "BACKUP")
# Run archival
archiver.run_backup(db_session, media.id, job.id)
# EXPECTATION:
# 1. Media should be marked "full"
# 2. Priority should be moved to the end (higher than media2)
db_session.expire_all()
assert media.status == "full"
assert media.priority_index > media2.priority_index
def test_archiver_chunking_logic(db_session, mocker, tmp_path):
"""Verifies that large backup batches are split into appropriate chunks."""
staging = tmp_path / "staging"
staging.mkdir()
archiver = ArchiverService(staging_directory=str(staging))
# Setup Media (Capacity 100GB, target chunk size ~1GB)
media = models.StorageMedia(
media_type="tape",
identifier="TAPE_001",
capacity=100 * 1024 * 1024 * 1024,
status="active",
bytes_used=0,
)
db_session.add(media)
# 1. Add 10 small files (100MB each) -> Should fit in one 1GB chunk
for i in range(10):
source_file = tmp_path / f"small_{i}.bin"
source_file.write_bytes(b"0" * (100 * 1024 * 1024))
f = models.FilesystemState(
file_path=str(source_file),
size=100 * 1024 * 1024,
mtime=1,
sha256_hash=f"hash_s_{i}",
)
db_session.add(f)
# 2. Add 1 large file (5GB) -> Exceeds chunk size, should trigger its own chunk
large_file = tmp_path / "large.bin"
large_file.write_bytes(b"0" * 10) # Mock small content for large size metadata
f_large = models.FilesystemState(
file_path=str(large_file),
size=5 * 1024 * 1024 * 1024,
mtime=1,
sha256_hash="hash_large",
)
# Monkeypatch stat size for test efficiency (don't actually write 5GB to tmp)
mocker.patch("os.path.getsize", return_value=5 * 1024 * 1024 * 1024)
# Also patch RangeFile to avoid actual reads
mocker.patch(
"app.services.archiver.RangeFile.__enter__", return_value=mocker.MagicMock()
)
mocker.patch("app.services.archiver.RangeFile.__exit__")
db_session.add(f_large)
db_session.commit()
# Mock Provider
mock_provider = mocker.MagicMock()
mock_provider.capabilities = {"supports_random_access": False}
mock_provider.identify_media.return_value = "TAPE_001"
mock_provider.prepare_for_write.return_value = True
mock_provider.write_archive.return_value = "1"
mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider)
from app.services.scanner import JobManager
job = JobManager.create_job(db_session, "BACKUP")
# Run archival
archiver.run_backup(db_session, media.id, job.id)
# EXPECTATION:
# write_archive should be called TWICE:
# 1. Once for the 10 small files (combined chunk)
# 2. Once for the large file (independent chunk)
assert mock_provider.write_archive.call_count == 2
# Verify FileVersion was recorded for the large file
version = (
db_session.query(models.FileVersion)
-.filter_by(filesystem_state_id=f1.id)
+.filter_by(filesystem_state_id=f_large.id)
.first()
)
assert version is not None
-assert version.file_number == "ARCH_1"
+assert version.file_number == "1"
def test_range_file_alignment_guard(tmp_path):
"""Verifies that RangeFile pads truncated files with nulls to maintain tar alignment."""
from app.services.archiver import RangeFile
# Create a 50 byte file
f = tmp_path / "truncated.bin"
f.write_bytes(b"A" * 50)
# Initialize RangeFile expecting 100 bytes
with RangeFile(str(f), offset_start=0, length=100) as rf:
data = rf.read(100)
# EXPECTATION:
# 1. Total length must be exactly 100
# 2. First 50 are 'A's
# 3. Last 50 are nulls '\x00'
assert len(data) == 100
assert data[:50] == b"A" * 50
assert data[50:] == b"\x00" * 50
def test_path_traversal_protection():
"""Verifies that the restorer blocks path traversal attempts."""
archiver = ArchiverService()
base = "/restores/my_backup"
# CASE 1: Valid Path
safe_path = archiver._sanitize_recovery_path(base, "photos/image.jpg")
assert safe_path.startswith(base)
# CASE 2: Traversal Attempt (Relative)
with pytest.raises(PermissionError, match="Restricted path traversal"):
archiver._sanitize_recovery_path(base, "../../etc/shadow")
# CASE 3: Traversal Attempt (Absolute-style relative that escapes)
with pytest.raises(PermissionError, match="Restricted path traversal"):
archiver._sanitize_recovery_path(base, "/../../etc/shadow")
def test_archiver_restoration_negotiation(db_session, mocker, tmp_path):
"""Verifies that the restorer chooses the correct format (Native vs Tar)."""
archiver = ArchiverService()
# Setup Mocks
mock_media = mocker.MagicMock()
mock_media.media_type = "cloud"
mock_media.identifier = "BUCKET1"
mock_provider = mocker.MagicMock()
mock_provider.identify_media.return_value = "BUCKET1"
mock_provider.check_online.return_value = True
# 1. TEST NATIVE NEGOTIATION
# Expectation: random_access=True + non-tar ID -> is_tar=False
mock_provider.capabilities = {"supports_random_access": True}
# CASE A: Obfuscated object (Native)
is_tar_native = archiver._test_is_tar_logic(
mock_provider, mock_media, "objects/a1/b2/hash"
)
assert is_tar_native is False
# CASE B: Obfuscated archive (Tar)
is_tar_cloud_archive = archiver._test_is_tar_logic(
mock_provider, mock_media, "archives/a1/b2/hash"
)
assert is_tar_cloud_archive is True
# CASE C: Tape (Always Tar)
mock_media.media_type = "tape"
mock_provider.capabilities = {"supports_random_access": False}
is_tar_tape = archiver._test_is_tar_logic(mock_provider, mock_media, "1")
assert is_tar_tape is True
def test_run_restore_mocked(db_session, mocker, tmp_path):
@@ -206,6 +407,9 @@ def test_run_restore_mocked(db_session, mocker, tmp_path):
# Mock Provider & Tar Stream # Mock Provider & Tar Stream
mock_provider = mocker.MagicMock() mock_provider = mocker.MagicMock()
mock_provider.identify_media.return_value = "RESTORE_ME" mock_provider.identify_media.return_value = "RESTORE_ME"
mock_provider.check_online.return_value = True
# FORCE tar path by disabling random access
mock_provider.capabilities = {"supports_random_access": False}
# Create a mock tar stream containing the file # Create a mock tar stream containing the file
buf = io.BytesIO() buf = io.BytesIO()
+3 -4
@@ -91,7 +91,7 @@ def test_metadata_update_on_change(db_session):
# Initial state
f1 = models.FilesystemState(
-file_path="/data/up.txt", size=50, mtime=1, is_indexed=True, sha256_hash="old"
+file_path="/data/up.txt", size=50, mtime=1, sha256_hash="old"
)
db_session.add(f1)
db_session.commit()
@@ -103,7 +103,7 @@ def test_metadata_update_on_change(db_session):
db_session.refresh(f1)
assert f1.size == 999
-assert f1.is_indexed is False # Should be reset for re-hashing
+assert f1.sha256_hash is None # Should be reset for re-hashing
def test_scan_sources_mocked(db_session, mocker):
@@ -141,7 +141,7 @@ def test_run_hashing_mocked(db_session, mocker):
# Setup unindexed file
f = models.FilesystemState(
-file_path="/data/hash.me", size=10, mtime=1, is_indexed=False, is_ignored=False
+file_path="/data/hash.me", size=10, mtime=1, is_ignored=False
)
db_session.add(f)
db_session.commit()
@@ -154,5 +154,4 @@ def test_run_hashing_mocked(db_session, mocker):
scanner.run_hashing()
db_session.refresh(f)
-assert f.is_indexed is True
assert f.sha256_hash == "mocked_hash"
+6 -2
@@ -15,7 +15,11 @@ services:
# devices:
# - /dev/nst0:/dev/nst0
environment:
-- PUID=1000
-- PGID=1000
+- PUID=0
+- PGID=0
- RUN_AS_ROOT=true
- TZ=UTC
user: "0:0"
cap_add:
- SYS_RAWIO
restart: unless-stopped
+1 -4
@@ -270,10 +270,7 @@ export type ItemMetadataSchema = {
* Sha256 Hash
*/
sha256_hash?: string | null;
-/**
-* Is Indexed
-*/
-is_indexed?: boolean;
/**
* Is Ignored
*/
+1 -1
@@ -37,7 +37,7 @@ lint:
# Run all backend tests
pytest:
@echo "Running backend tests..."
-cd backend && uv run pytest
+cd backend && COVERAGE_CORE=sysmon uv run pytest
# Run frontend checks
check:
+17
@@ -0,0 +1,17 @@
[Unit]
Description=TapeHoard Backup Manager
After=network.target
[Service]
Type=simple
User=root
WorkingDirectory=/opt/tapehoard
# We use root to ensure direct /dev/nst0 and raw SCSI access
ExecStart=/usr/bin/just dev
Restart=always
RestartSec=10
# Give the tape drive time to rewind on shutdown
TimeoutStopSec=30
[Install]
WantedBy=multi-user.target