tests & remove is_indexed flag from db
@@ -53,12 +53,22 @@ This document (`GEMINI.md`) contains critical, contextual information about the
### Scanning & Hashing Architecture
* **Concurrent Phasing:** Decoupled into `SCAN` (Metadata, Normal priority) and `HASH` (Content, Idle priority with dynamic `iowait` throttling).
* **Thread-Safe Metrics:** All counters (files processed, bytes hashed) must be protected by a `threading.Lock`.
* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `is_indexed = 0 AND is_ignored = 0` files.
* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `sha256_hash IS NULL AND is_ignored = 0` files (see the sketch after this subsection).
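To make the progress rule above concrete, here is a minimal sketch — not the project's actual implementation — of how a hashing job could recompute its denominator against the live pending snapshot on each batch. The helper name and signature are illustrative; the filter mirrors the `sha256_hash IS NULL AND is_ignored = 0` rule used elsewhere in this commit.

```python
# Illustrative sketch only: progress against a dynamically updating snapshot.
from app.db import models


def hashing_progress(db_session, files_hashed_so_far: int) -> float:
    """Return hashing progress (percent) against the current pending snapshot."""
    still_pending = (
        db_session.query(models.FilesystemState)
        .filter(
            models.FilesystemState.sha256_hash.is_(None),   # not yet hashed
            models.FilesystemState.is_ignored.is_(False),   # not excluded
        )
        .count()
    )
    total = files_hashed_so_far + still_pending
    if total == 0:
        return 100.0
    return (files_hashed_so_far / total) * 100.0
```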
### Archival & Recovery
* **Format Negotiation:** The Archiver adapts formats based on provider capabilities (`supports_random_access`).
    * *Sequential (Tape):* Uses `.tar` streams to maintain drive streaming.
    * *Random Access (HDD/Cloud):* Uses native direct file copies/objects to enable instant, seekless restores without unpacking gigabytes of data.
* **High-Speed Hybrid Archival:**
    * The system prioritizes the **system `tar` binary** for whole-file chunks, delivering a 10x-20x performance boost over pure Python and ensuring optimal buffer saturation for LTO drives.
    * It transparently falls back to the **Python `RangeFile` logic** only for chunks containing split fragments, maintaining bit-perfect alignment for multi-tape files.
* **Industrial Tar Chunking:** (see the chunk-size sketch after this section)
    * Large backup sets are automatically split into multiple independent archives. The system dynamically aims for at least **100 archives per tape** (a target chunk size derived from the generation's capacity, e.g., ~15 GB chunks for LTO-5) to provide high seek granularity during restoration.
    * **Exception:** Single large files are allowed to occupy their own archives even if they exceed the target chunk size, preventing unnecessary fragmentation while keeping them as independent, seekable objects.
* **Refined Splitting Philosophy:**
    * Files are **only split** if they are physically larger than the medium's entire capacity (multi-tape spanning).
    * **Skip-and-Defer:** If a file is larger than the remaining space on a tape but smaller than its total capacity, it is deferred to the next fresh medium to minimize fragmentation.
* **Hardware-First Utilization:** The system trusts **Physical Hardware Feedback (MAM)** over logical byte counts. Tapes are only marked as "Full" when the drive's own reporting (via `get_utilization`) confirms saturation, maximizing utilization when hardware compression is active.
* **Bitstream Integrity:** `RangeFile` must guarantee exact byte counts for tar alignment.
* **Metadata Fidelity:** The restorer must preserve the original **permissions (chmod)**, **timestamps (utime)**, and **ownership (chown)** when recovering files natively or via tar.
* **Independence:** Force all tar archive members to be **Regular Files** to break fragile hard-link dependencies. Symlinks are preserved as `SYMTYPE` (or `.symlink` stub objects for the native format).
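The chunking and placement rules above can be summarized in a few lines. This is an illustrative sketch with assumed function names, not the Archiver's real API; the LTO-5 figure and the 200 MB / 300 MB scenario match the tests later in this commit.

```python
# Illustrative sketch only: target chunk size and skip-and-defer placement.
def target_chunk_size(tape_capacity_bytes: int, min_archives_per_tape: int = 100) -> int:
    """Aim for at least ~100 archives per tape (~15 GB for a 1.5 TB LTO-5)."""
    return max(1, tape_capacity_bytes // min_archives_per_tape)


def placement_decision(file_size: int, remaining_bytes: int, tape_capacity: int) -> str:
    if file_size > tape_capacity:
        return "split"   # only files larger than the whole medium are split
    if file_size > remaining_bytes:
        return "defer"   # skip-and-defer: wait for the next fresh medium
    return "write"       # fits on the current medium as-is


if __name__ == "__main__":
    lto5 = 1_500_000_000_000  # ~1.5 TB raw capacity
    print(target_chunk_size(lto5))                                     # ~15 GB
    print(placement_decision(200 * 2**20, 100 * 2**20, 300 * 2**20))   # "defer"
```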
@@ -0,0 +1,79 @@
|
||||
"""remove is_indexed flag
|
||||
|
||||
Revision ID: 1b59e22b9b7a
|
||||
Revises: fbbc0a40a840
|
||||
Create Date: 2026-04-28 23:44:04.949304
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "1b59e22b9b7a"
|
||||
down_revision: Union[str, Sequence[str], None] = "fbbc0a40a840"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Upgrade schema."""
|
||||
# We use batch_alter_table for SQLite compatibility
|
||||
with op.batch_alter_table("filesystem_state", schema=None) as batch_op:
|
||||
batch_op.drop_column("is_indexed")
|
||||
batch_op.alter_column(
|
||||
"size",
|
||||
existing_type=sa.INTEGER(),
|
||||
type_=sa.BigInteger(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
with op.batch_alter_table("storage_media", schema=None) as batch_op:
|
||||
batch_op.alter_column(
|
||||
"capacity",
|
||||
existing_type=sa.INTEGER(),
|
||||
type_=sa.BigInteger(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
batch_op.alter_column(
|
||||
"bytes_used",
|
||||
existing_type=sa.INTEGER(),
|
||||
type_=sa.BigInteger(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Downgrade schema."""
|
||||
with op.batch_alter_table("storage_media", schema=None) as batch_op:
|
||||
batch_op.alter_column(
|
||||
"bytes_used",
|
||||
existing_type=sa.BigInteger(),
|
||||
type_=sa.INTEGER(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
batch_op.alter_column(
|
||||
"capacity",
|
||||
existing_type=sa.BigInteger(),
|
||||
type_=sa.INTEGER(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
with op.batch_alter_table("filesystem_state", schema=None) as batch_op:
|
||||
batch_op.alter_column(
|
||||
"size",
|
||||
existing_type=sa.BigInteger(),
|
||||
type_=sa.INTEGER(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
batch_op.add_column(
|
||||
sa.Column(
|
||||
"is_indexed",
|
||||
sa.BOOLEAN(),
|
||||
server_default=sa.text("'0'"),
|
||||
nullable=False,
|
||||
)
|
||||
)
|
||||
@@ -2,7 +2,7 @@ from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.db import models
|
||||
@@ -17,15 +17,14 @@ router = APIRouter(prefix="/backups", tags=["Backups"])
|
||||
|
||||
|
||||
class BackupJobSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
job_type: str
|
||||
status: str
|
||||
started_at: Optional[datetime] = None
|
||||
completed_at: Optional[datetime] = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
# --- Endpoints ---
|
||||
|
||||
|
||||
@@ -337,11 +337,11 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
SELECT
|
||||
COUNT(*) as total_files,
|
||||
SUM(size) as total_size,
|
||||
SUM(CASE WHEN is_indexed = 1 THEN size ELSE 0 END) as total_hashed_size,
|
||||
SUM(CASE WHEN sha256_hash IS NOT NULL THEN size ELSE 0 END) as total_hashed_size,
|
||||
(SELECT SUM(min_size) FROM (
|
||||
SELECT MIN(size) as min_size
|
||||
FROM filesystem_state
|
||||
WHERE is_indexed = 1 AND sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
WHERE sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
GROUP BY sha256_hash
|
||||
)) as unique_hashed_size
|
||||
FROM filesystem_state
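As a quick sanity check on the two aggregates above (`total_hashed_size` vs `unique_hashed_size`, and the per-hash `saved_bytes` report), here is a tiny worked example in Python with made-up data; it mirrors the SQL but is not part of the codebase.

```python
# Worked example of the dedup accounting: every hashed copy counts toward
# total_hashed_size, each distinct sha256 group counts once (MIN(size) per
# group) toward unique_hashed_size, and saved_bytes = size * (copies - 1).
files = [  # (sha256_hash, size)
    ("aaa", 100), ("aaa", 100), ("aaa", 100),  # three identical copies
    ("bbb", 50),                               # one unique file
]

total_hashed_size = sum(size for _, size in files)                 # 350
groups: dict[str, list[int]] = {}
for h, size in files:
    groups.setdefault(h, []).append(size)
unique_hashed_size = sum(min(sizes) for sizes in groups.values())  # 150
saved_bytes = {
    h: sizes[0] * (len(sizes) - 1)
    for h, sizes in groups.items()
    if len(sizes) > 1
}
print(total_hashed_size, unique_hashed_size, saved_bytes)  # 350 150 {'aaa': 200}
```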
@@ -453,7 +453,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
COUNT(*) as copy_count,
|
||||
(size * (COUNT(*) - 1)) as saved_bytes
|
||||
FROM filesystem_state
|
||||
WHERE is_indexed = 1 AND sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
WHERE sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
GROUP BY sha256_hash
|
||||
HAVING copy_count > 1
|
||||
ORDER BY saved_bytes DESC
|
||||
@@ -465,7 +465,8 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
directory_aggregation_sql = text("""
|
||||
SELECT
|
||||
RTRIM(file_path, REPLACE(file_path, '/', '')) as dir_path,
|
||||
SUM(size) as byte_total
|
||||
SUM(size) as byte_total,
|
||||
MAX(mtime) as latest_mtime
|
||||
FROM filesystem_state
|
||||
WHERE is_ignored = 0
|
||||
GROUP BY dir_path
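The `RTRIM(file_path, REPLACE(file_path, '/', ''))` expression in the directory aggregation above is a SQLite idiom for "parent directory with trailing slash": `REPLACE` yields every non-slash character of the path, and `RTRIM` then strips those characters from the right until it reaches the last `/`. A standalone check, purely illustrative:

```python
# Verify the SQLite dirname idiom used by directory_aggregation_sql.
import sqlite3

con = sqlite3.connect(":memory:")
path = "/home/user/photos/holiday.jpg"
row = con.execute(
    "SELECT RTRIM(?, REPLACE(?, '/', ''))",
    (path, path),
).fetchone()
print(row[0])  # -> /home/user/photos/
```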
@@ -474,7 +475,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
|
||||
# Hierarchical tree construction
|
||||
nested_dir_map = {}
|
||||
for path_str, size_val in all_directories:
|
||||
for path_str, size_val, mtime_val in all_directories:
|
||||
if not path_str:
|
||||
continue
|
||||
path_segments = [p for p in path_str.split("/") if p]
|
||||
@@ -492,10 +493,14 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
if segment not in current_node:
|
||||
current_node[segment] = {
|
||||
"size": 0,
|
||||
"mtime": 0,
|
||||
"children": {},
|
||||
"fullPath": accumulated_path,
|
||||
}
|
||||
current_node[segment]["size"] += size_val
|
||||
current_node[segment]["mtime"] = max(
|
||||
current_node[segment]["mtime"], mtime_val or 0
|
||||
)
|
||||
current_node = current_node[segment]["children"]
|
||||
|
||||
# Collapse unhelpful single-child roots
|
||||
@@ -517,6 +522,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
{
|
||||
"path": key,
|
||||
"size": value["size"],
|
||||
"mtime": value["mtime"],
|
||||
"fullPath": value["fullPath"],
|
||||
"children": children_list,
|
||||
}
|
||||
@@ -877,12 +883,10 @@ def get_archive_item_metadata(path: str, db_session: Session = Depends(get_db)):
|
||||
return ItemMetadataSchema(
|
||||
id=item.id,
|
||||
path=item.file_path,
|
||||
type="file",
|
||||
size=item.size,
|
||||
mtime=datetime.fromtimestamp(item.mtime, tz=timezone.utc),
|
||||
last_seen_timestamp=item.last_seen_timestamp,
|
||||
sha256_hash=item.sha256_hash,
|
||||
is_indexed=item.is_indexed,
|
||||
is_ignored=item.is_ignored,
|
||||
versions=versions,
|
||||
)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import List, Optional
|
||||
from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy.orm import Session, joinedload
|
||||
from sqlalchemy import text
|
||||
from app.db.database import get_db, SessionLocal
|
||||
@@ -17,14 +17,13 @@ router = APIRouter(prefix="/restores", tags=["Restores"])
|
||||
|
||||
|
||||
class CartItemSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
file_path: str
|
||||
size: int
|
||||
type: str
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class RestoreTriggerRequest(BaseModel):
|
||||
destination_path: str
|
||||
@@ -115,7 +114,9 @@ def add_directory_to_recovery_queue(
|
||||
AND rc.id IS NULL
|
||||
""")
|
||||
|
||||
db_session.execute(discovery_sql, {"prefix": f"{target_directory}%", "now": now})
|
||||
db_session.execute(
|
||||
discovery_sql, {"prefix": f"{target_directory}%", "now": now.isoformat()}
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
total_in_queue = db_session.query(models.RestoreCart).count()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from typing import Optional, List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
@@ -17,7 +17,6 @@ class ItemMetadataSchema(BaseModel):
|
||||
mtime: datetime
|
||||
last_seen_timestamp: Optional[datetime] = None
|
||||
sha256_hash: Optional[str] = None
|
||||
is_indexed: bool = False
|
||||
is_ignored: bool = False
|
||||
child_count: Optional[int] = 0
|
||||
selected: bool = False
|
||||
@@ -44,8 +43,7 @@ class MediaSchema(BaseModel):
|
||||
host_total_bytes: Optional[int] = None
|
||||
live_info: Optional[Dict[str, Any]] = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
class MediaCreateSchema(BaseModel):
|
||||
|
||||
@@ -8,7 +8,7 @@ import pathspec
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||
from fastapi.responses import FileResponse, StreamingResponse
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy import func, text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -37,6 +37,8 @@ class DashboardStatsSchema(BaseModel):
|
||||
|
||||
|
||||
class JobSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
job_type: str
|
||||
status: str
|
||||
@@ -47,9 +49,6 @@ class JobSchema(BaseModel):
|
||||
completed_at: Optional[datetime] = None
|
||||
created_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class FileItemSchema(BaseModel):
|
||||
name: str
|
||||
@@ -172,7 +171,7 @@ def get_dashboard_stats(db_session: Session = Depends(get_db)):
|
||||
SUM(CASE WHEN is_ignored = 1 THEN size ELSE 0 END) as ignored_size,
|
||||
SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN 1 ELSE 0 END) as unprotected_count,
|
||||
SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as unprotected_size,
|
||||
SUM(CASE WHEN is_indexed = 1 AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count,
|
||||
SUM(CASE WHEN sha256_hash IS NOT NULL AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count,
|
||||
SUM(CASE WHEN is_ignored = 0 THEN 1 ELSE 0 END) as eligible_count,
|
||||
SUM(CASE WHEN id IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as archived_size
|
||||
FROM filesystem_state
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
"""
|
||||
Standardized secret management and application configuration.
|
||||
Values can be overridden via environment variables or a .env file.
|
||||
"""
|
||||
|
||||
model_config = SettingsConfigDict(
|
||||
env_file=".env", env_file_encoding="utf-8", extra="ignore"
|
||||
)
|
||||
|
||||
# Database
|
||||
database_url: str = "sqlite:///./tapehoard.db"
|
||||
|
||||
# Security / Encryption
|
||||
# Standardized secret management pattern
|
||||
encryption_passphrase: str = "tapehoard-default-insecure-passphrase"
|
||||
|
||||
# Staging
|
||||
staging_directory: str = "/staging"
|
||||
|
||||
# Cloud Defaults
|
||||
default_s3_region: str = "us-east-1"
|
||||
|
||||
# Hardware Detection
|
||||
default_tape_drive: str = "/dev/nst0"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
@@ -19,7 +19,6 @@ class FilesystemState(Base):
|
||||
sha256_hash: Mapped[Optional[str]] = mapped_column(
|
||||
String, index=True, nullable=True
|
||||
)
|
||||
is_indexed: Mapped[bool] = mapped_column(Boolean, default=False) # True if hashed
|
||||
is_ignored: Mapped[bool] = mapped_column(
|
||||
Boolean, default=False
|
||||
) # True if matches exclusion
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
CREATE TABLE filesystem_state (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file_path TEXT UNIQUE,
|
||||
size BIGINT,
|
||||
mtime FLOAT,
|
||||
sha256_hash TEXT,
|
||||
is_ignored BOOLEAN DEFAULT 0,
|
||||
last_seen_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE storage_media (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
media_type TEXT,
|
||||
identifier TEXT UNIQUE,
|
||||
generation_tier TEXT,
|
||||
capacity BIGINT,
|
||||
bytes_used BIGINT DEFAULT 0,
|
||||
location TEXT,
|
||||
status TEXT DEFAULT 'active',
|
||||
extra_config TEXT,
|
||||
priority_index INTEGER DEFAULT 0,
|
||||
last_seen DATETIME,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE file_versions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
filesystem_state_id INTEGER,
|
||||
media_id INTEGER,
|
||||
file_number TEXT,
|
||||
offset_in_tar INTEGER,
|
||||
is_split BOOLEAN DEFAULT 0,
|
||||
split_id TEXT,
|
||||
offset_start BIGINT DEFAULT 0,
|
||||
offset_end BIGINT DEFAULT 0,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id),
|
||||
FOREIGN KEY(media_id) REFERENCES storage_media(id)
|
||||
);
|
||||
|
||||
CREATE TABLE tracked_sources (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
path TEXT UNIQUE,
|
||||
is_directory BOOLEAN DEFAULT 1,
|
||||
action TEXT DEFAULT 'include',
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE restore_cart (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
filesystem_state_id INTEGER,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id)
|
||||
);
|
||||
|
||||
CREATE TABLE jobs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_type TEXT,
|
||||
status TEXT DEFAULT 'PENDING',
|
||||
progress FLOAT DEFAULT 0.0,
|
||||
current_task TEXT,
|
||||
error_message TEXT,
|
||||
started_at DATETIME,
|
||||
completed_at DATETIME,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE system_settings (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT,
|
||||
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- FTS5 Virtual Table for Instant Search
|
||||
CREATE VIRTUAL TABLE filesystem_fts USING fts5(
|
||||
file_path,
|
||||
content='filesystem_state',
|
||||
content_rowid='id'
|
||||
);
|
||||
|
||||
-- Trigger to keep FTS5 synchronized with real state
|
||||
CREATE TRIGGER filesystem_fts_insert AFTER INSERT ON filesystem_state BEGIN
|
||||
INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER filesystem_fts_delete AFTER DELETE ON filesystem_state BEGIN
|
||||
INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER filesystem_fts_update AFTER UPDATE ON filesystem_state BEGIN
|
||||
INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path);
|
||||
INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path);
|
||||
END;
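With the triggers above keeping `filesystem_fts` in sync with `filesystem_state`, instant path search is a plain FTS5 `MATCH` query. A minimal, illustrative snippet (the table and column names come from the schema dump; the database file name follows the default `sqlite:///./tapehoard.db` setting, and the query itself is only an example):

```python
# Illustrative FTS5 search against the external-content table defined above.
import sqlite3

con = sqlite3.connect("tapehoard.db")  # assumed default database file
rows = con.execute(
    "SELECT rowid, file_path FROM filesystem_fts WHERE filesystem_fts MATCH ? LIMIT 20",
    ("holiday",),
).fetchall()
for rowid, file_path in rows:
    print(rowid, file_path)
```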
@@ -1,3 +1,4 @@
|
||||
import hashlib
|
||||
import boto3
|
||||
import os
|
||||
import io
|
||||
@@ -10,6 +11,8 @@ from Crypto.Cipher import AES
|
||||
from Crypto.Protocol.KDF import PBKDF2
|
||||
from Crypto.Hash import SHA256
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
|
||||
class CloudStorageProvider(AbstractStorageProvider):
|
||||
provider_id = "s3_compat"
|
||||
@@ -48,6 +51,12 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
"title": "Client-Side Encryption Passphrase",
|
||||
"description": "Used to encrypt data locally before uploading via AES-256-GCM.",
|
||||
},
|
||||
"obfuscate_filenames": {
|
||||
"type": "boolean",
|
||||
"title": "Obfuscate Filenames",
|
||||
"description": "Store files as SHA-256 hashes in the cloud to hide metadata.",
|
||||
"default": False,
|
||||
},
|
||||
}
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
@@ -55,9 +64,12 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
self.bucket_name = config.get("bucket_name")
|
||||
self.region = config.get("region", "us-east-1")
|
||||
self.endpoint_url = config.get("endpoint_url")
|
||||
self.obfuscate = config.get("obfuscate_filenames", False)
|
||||
|
||||
# Local Encryption Settings
|
||||
self.passphrase = config.get("encryption_passphrase")
|
||||
# Local Encryption Settings: Use provided or global default
|
||||
self.passphrase = (
|
||||
config.get("encryption_passphrase") or settings.encryption_passphrase
|
||||
)
|
||||
|
||||
# Credentials
|
||||
access_key = config.get("access_key")
|
||||
@@ -80,6 +92,16 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
self.passphrase, salt, dkLen=32, count=100000, hmac_hash_module=SHA256
|
||||
)
|
||||
|
||||
def _get_obfuscated_key(self, prefix: str, path: str) -> str:
|
||||
"""Returns a hashed version of the filename if obfuscation is enabled."""
|
||||
if not self.obfuscate:
|
||||
return f"{prefix}/{path.lstrip('./')}"
|
||||
|
||||
# We hash the path to hide metadata while keeping it deterministic
|
||||
hashed = hashlib.sha256(path.encode("utf-8")).hexdigest()
|
||||
# Use two-level sharding to prevent S3 prefix performance issues with 100k+ files
|
||||
return f"{prefix}/{hashed[:2]}/{hashed[2:4]}/{hashed}"
|
||||
|
||||
def get_name(self) -> str:
|
||||
return f"Cloud ({self.provider_type})"
|
||||
|
||||
@@ -122,7 +144,7 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
try:
|
||||
self.s3.head_bucket(Bucket=self.bucket_name)
|
||||
paginator = self.s3.get_paginator("list_objects_v2")
|
||||
for page in paginator.paginate(Bucket=self.bucket_name, Prefix="archives/"):
|
||||
for page in paginator.paginate(Bucket=self.bucket_name):
|
||||
if "Contents" in page:
|
||||
objects = [{"Key": obj["Key"]} for obj in page["Contents"]]
|
||||
self.s3.delete_objects(
|
||||
@@ -148,7 +170,9 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
"""
|
||||
import uuid
|
||||
|
||||
object_key = f"archives/{uuid.uuid4().hex}.tar"
|
||||
# Archives are always obfuscated by UUID for privacy and collision avoidance
|
||||
raw_key = f"archives/{uuid.uuid4().hex}.tar"
|
||||
object_key = self._get_obfuscated_key("archives", raw_key)
|
||||
|
||||
if self.passphrase:
|
||||
logger.info(
|
||||
@@ -166,7 +190,6 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
ciphertext, tag = cipher.encrypt_and_digest(data)
|
||||
|
||||
# 3. Concatenate and upload
|
||||
# We store everything needed for decryption in the blob itself for maximum portability
|
||||
payload = salt + nonce + tag + ciphertext
|
||||
|
||||
try:
|
||||
@@ -174,7 +197,10 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
Bucket=self.bucket_name,
|
||||
Key=object_key,
|
||||
Body=payload,
|
||||
Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"},
|
||||
Metadata={
|
||||
"x-amz-meta-tapehoard-encrypted": "v2-gcm",
|
||||
"x-amz-meta-tapehoard-type": "archive",
|
||||
},
|
||||
)
|
||||
return object_key
|
||||
except Exception as e:
|
||||
@@ -193,10 +219,9 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
self, media_id: str, relative_path: str, stream: BinaryIO
|
||||
) -> str:
|
||||
"""
|
||||
Uploads a single file directly to the cloud, maintaining structure.
|
||||
Uploads a single file directly to the cloud.
|
||||
"""
|
||||
clean_path = relative_path.lstrip("./")
|
||||
object_key = f"objects/{clean_path}"
|
||||
object_key = self._get_obfuscated_key("objects", relative_path)
|
||||
|
||||
if self.passphrase:
|
||||
logger.info(
|
||||
@@ -221,7 +246,10 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
Bucket=self.bucket_name,
|
||||
Key=object_key,
|
||||
Body=payload,
|
||||
Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"},
|
||||
Metadata={
|
||||
"x-amz-meta-tapehoard-encrypted": "v2-gcm",
|
||||
"x-amz-meta-tapehoard-type": "object",
|
||||
},
|
||||
)
|
||||
return object_key
|
||||
except Exception as e:
|
||||
|
||||
@@ -145,7 +145,6 @@ class ArchiverService:
|
||||
models.FilesystemState.id == coverage_subquery.c.filesystem_state_id,
|
||||
)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed,
|
||||
not_(models.FilesystemState.is_ignored),
|
||||
(coverage_subquery.c.covered_bytes.is_(None))
|
||||
| (coverage_subquery.c.covered_bytes < models.FilesystemState.size),
|
||||
@@ -723,11 +722,7 @@ class ArchiverService:
|
||||
media_record.identifier, archive_id
|
||||
)
|
||||
|
||||
is_tar = (
|
||||
not provider.capabilities.get("supports_random_access")
|
||||
or str(archive_id).endswith(".tar")
|
||||
or str(archive_id).isdigit()
|
||||
)
|
||||
is_tar = self._test_is_tar_logic(provider, media_record, archive_id)
|
||||
|
||||
if not is_tar:
|
||||
# Format Negotiation: Direct file recovery
|
||||
@@ -778,7 +773,12 @@ class ArchiverService:
|
||||
)
|
||||
continue
|
||||
|
||||
tar_mode = "r|*" if media_record.media_type == "tape" else "r:*"
|
||||
tar_mode = (
|
||||
"r|*"
|
||||
if media_record.media_type
|
||||
in ["tape", "lto_tape", "cloud", "s3_compat"]
|
||||
else "r:*"
|
||||
)
|
||||
|
||||
with tarfile.open(fileobj=bitstream, mode=tar_mode) as tar_bundle:
|
||||
normalized_map = {}
|
||||
@@ -854,5 +854,26 @@ class ArchiverService:
|
||||
logger.exception(f"Restore failed: {e}")
|
||||
JobManager.fail_job(job_id, str(e))
|
||||
|
||||
def _test_is_tar_logic(self, provider, media_record, archive_id: str) -> bool:
|
||||
"""Internal logic to determine if a location_id refers to a tarball vs a native file."""
|
||||
# 1. If provider doesn't support random access (Tape), it's ALWAYS a tar stream.
|
||||
if not provider.capabilities.get("supports_random_access"):
|
||||
return True
|
||||
|
||||
# 2. Explicit extensions or prefixes
|
||||
if str(archive_id).endswith(".tar") or "archives/" in str(archive_id):
|
||||
return True
|
||||
|
||||
# 3. Tape file numbers (if they happen to be passed here for cloud, which is rare)
|
||||
if str(archive_id).isdigit() and media_record.media_type in [
|
||||
"tape",
|
||||
"lto_tape",
|
||||
]:
|
||||
return True
|
||||
|
||||
# 4. Otherwise, if it has random access (Cloud/HDD) and isn't explicitly an archive,
|
||||
# it's a native file.
|
||||
return False
|
||||
|
||||
|
||||
archiver_manager = ArchiverService()
|
||||
|
||||
@@ -29,55 +29,83 @@ class JobManager:
|
||||
@staticmethod
|
||||
def start_job(job_id: int):
|
||||
"""Marks a job as running and sets the start timestamp."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "RUNNING"
|
||||
job_record.started_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "RUNNING"
|
||||
job_record.started_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
logger.debug(
|
||||
f"Job {job_id} already modified or deleted (StaleDataError)."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def update_job(job_id: int, progress: float, current_task: str):
|
||||
"""Updates the progress and current task description for a job."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.progress = progress
|
||||
job_record.current_task = current_task
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.progress = progress
|
||||
job_record.current_task = current_task
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def complete_job(job_id: int):
|
||||
"""Marks a job as successfully completed."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "COMPLETED"
|
||||
job_record.progress = 100.0
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "COMPLETED"
|
||||
job_record.progress = 100.0
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def fail_job(job_id: int, error_message: str):
|
||||
"""Marks a job as failed and records the error message."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = error_message
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = error_message
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def cancel_job(job_id: int):
|
||||
"""Submits a cancellation request for a pending or running job."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record and job_record.status in ["PENDING", "RUNNING"]:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = "Cancelled by user"
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record and job_record.status in ["PENDING", "RUNNING"]:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = "Cancelled by user"
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def is_cancelled(job_id: int) -> bool:
|
||||
@@ -354,7 +382,6 @@ class ScannerService:
|
||||
mtime=file_meta["mtime"],
|
||||
is_ignored=file_meta["ignored"],
|
||||
last_seen_timestamp=timestamp,
|
||||
is_indexed=False,
|
||||
)
|
||||
)
|
||||
else:
|
||||
@@ -363,7 +390,7 @@ class ScannerService:
|
||||
or record.mtime != file_meta["mtime"]
|
||||
)
|
||||
if metadata_changed:
|
||||
record.is_indexed = False
|
||||
record.sha256_hash = None
|
||||
with self._metrics_lock:
|
||||
self.files_modified += 1
|
||||
|
||||
@@ -396,7 +423,7 @@ class ScannerService:
|
||||
total_pending = (
|
||||
db_session.query(models.FilesystemState)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed.is_(False),
|
||||
models.FilesystemState.sha256_hash.is_(None),
|
||||
models.FilesystemState.is_ignored.is_(False),
|
||||
)
|
||||
.count()
|
||||
@@ -408,7 +435,7 @@ class ScannerService:
|
||||
hashing_targets = (
|
||||
db_session.query(models.FilesystemState)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed.is_(False),
|
||||
models.FilesystemState.sha256_hash.is_(None),
|
||||
models.FilesystemState.is_ignored.is_(False),
|
||||
)
|
||||
.limit(100)
|
||||
@@ -422,7 +449,7 @@ class ScannerService:
|
||||
total_pending = (
|
||||
db_session.query(models.FilesystemState)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed.is_(False),
|
||||
models.FilesystemState.sha256_hash.is_(None),
|
||||
models.FilesystemState.is_ignored.is_(False),
|
||||
)
|
||||
.count()
|
||||
@@ -451,7 +478,6 @@ class ScannerService:
|
||||
|
||||
if computed_hash:
|
||||
target_record.sha256_hash = computed_hash
|
||||
target_record.is_indexed = True
|
||||
self.files_hashed += 1
|
||||
|
||||
if self.files_hashed % 5 == 0:
|
||||
|
||||
@@ -66,7 +66,6 @@ def test_get_insights(client, db_session):
|
||||
file_path="/source/f1.txt",
|
||||
size=100,
|
||||
mtime=1000,
|
||||
is_indexed=True,
|
||||
sha256_hash="hash1",
|
||||
)
|
||||
db_session.add(file1)
|
||||
@@ -127,7 +126,6 @@ def test_search_index(client, db_session):
|
||||
file_path="data/important.doc",
|
||||
size=500,
|
||||
mtime=2000,
|
||||
is_indexed=True,
|
||||
sha256_hash="hash",
|
||||
)
|
||||
db_session.add(file1)
|
||||
|
||||
@@ -21,7 +21,6 @@ def test_get_dashboard_stats_populated(client, db_session):
|
||||
size=1024,
|
||||
mtime=datetime.now(timezone.utc).timestamp(),
|
||||
is_ignored=False,
|
||||
is_indexed=True,
|
||||
)
|
||||
db_session.add(file_state)
|
||||
db_session.commit()
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
from app.core.utils import get_path_uuid
|
||||
|
||||
|
||||
def test_get_path_uuid_macos(mocker):
|
||||
"""Verifies UUID extraction from diskutil on macOS."""
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
mocker.patch("sys.platform", "darwin")
|
||||
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
mock_res = mocker.MagicMock()
|
||||
mock_res.stdout = """
|
||||
Device Identifier: disk4s1
|
||||
Device Node: /dev/disk4s1
|
||||
Volume Name: MyBackups
|
||||
Volume UUID: ABCDEF-1234-5678-90AB-CDEF12345678
|
||||
"""
|
||||
mock_run.return_value = mock_res
|
||||
|
||||
uuid = get_path_uuid("/Volumes/MyBackups")
|
||||
assert uuid == "ABCDEF-1234-5678-90AB-CDEF12345678"
|
||||
mock_run.assert_called_with(
|
||||
["diskutil", "info", "/Volumes/MyBackups"], capture_output=True, text=True
|
||||
)
|
||||
|
||||
|
||||
def test_get_path_uuid_linux(mocker):
|
||||
"""Verifies UUID extraction from lsblk on Linux."""
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
mocker.patch("sys.platform", "linux")
|
||||
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
|
||||
def mock_subprocess(cmd, **kwargs):
|
||||
m = mocker.MagicMock()
|
||||
if "df" in cmd:
|
||||
m.stdout = "Filesystem\n/dev/sdb1"
|
||||
elif "lsblk" in cmd:
|
||||
m.stdout = "98765432-ABCD-EF01-2345-6789ABCDEF01"
|
||||
return m
|
||||
|
||||
mock_run.side_effect = mock_subprocess
|
||||
|
||||
uuid = get_path_uuid("/mnt/hdd")
|
||||
assert uuid == "98765432-ABCD-EF01-2345-6789ABCDEF01"
|
||||
@@ -0,0 +1,65 @@
|
||||
import hashlib
|
||||
import pytest
|
||||
from app.providers.cloud import CloudStorageProvider
|
||||
|
||||
|
||||
def test_cloud_provider_obfuscation_logic():
|
||||
"""Verifies that filename hashing and sharding works as expected."""
|
||||
|
||||
# CASE 1: Obfuscation Disabled
|
||||
config_plain = {
|
||||
"bucket_name": "test-bucket",
|
||||
"obfuscate_filenames": False,
|
||||
"access_key": "fake",
|
||||
"secret_key": "fake",
|
||||
}
|
||||
provider_plain = CloudStorageProvider(config_plain)
|
||||
path = "documents/secret_plan.pdf"
|
||||
|
||||
# Expectation: Key is exactly the path with prefix
|
||||
key_plain = provider_plain._get_obfuscated_key("objects", path)
|
||||
assert key_plain == "objects/documents/secret_plan.pdf"
|
||||
|
||||
# CASE 2: Obfuscation Enabled
|
||||
config_hidden = {
|
||||
"bucket_name": "test-bucket",
|
||||
"obfuscate_filenames": True,
|
||||
"access_key": "fake",
|
||||
"secret_key": "fake",
|
||||
}
|
||||
provider_hidden = CloudStorageProvider(config_hidden)
|
||||
|
||||
# Expectation: Key is hashed and sharded
|
||||
# hash of "documents/secret_plan.pdf"
|
||||
expected_hash = hashlib.sha256(path.encode("utf-8")).hexdigest()
|
||||
expected_prefix = f"objects/{expected_hash[:2]}/{expected_hash[2:4]}"
|
||||
|
||||
key_hidden = provider_hidden._get_obfuscated_key("objects", path)
|
||||
|
||||
assert key_hidden.startswith("objects/")
|
||||
assert key_hidden == f"{expected_prefix}/{expected_hash}"
|
||||
assert "secret_plan.pdf" not in key_hidden
|
||||
|
||||
|
||||
def test_cloud_secret_fallback(mocker):
|
||||
"""Verifies that the provider prioritizes local config over global settings for passphrases."""
|
||||
from app.core.config import settings
|
||||
|
||||
# Mock global settings
|
||||
mocker.patch.object(settings, "encryption_passphrase", "global-fallback")
|
||||
|
||||
# CASE 1: Local config provides passphrase
|
||||
config_local = {"bucket_name": "b", "encryption_passphrase": "local-override"}
|
||||
provider_local = CloudStorageProvider(config_local)
|
||||
assert provider_local.passphrase == "local-override"
|
||||
|
||||
# CASE 2: Local config is empty, should fallback to global
|
||||
config_empty = {"bucket_name": "b"}
|
||||
provider_fallback = CloudStorageProvider(config_empty)
|
||||
assert provider_fallback.passphrase == "global-fallback"
|
||||
|
||||
# CASE 3: No passphrase anywhere (ValueError on key derivation)
|
||||
mocker.patch.object(settings, "encryption_passphrase", "")
|
||||
provider_none = CloudStorageProvider({"bucket_name": "b"})
|
||||
with pytest.raises(ValueError, match="No encryption passphrase configured"):
|
||||
provider_none._derive_key(b"salt")
|
||||
@@ -0,0 +1,99 @@
|
||||
import io
|
||||
import pytest
|
||||
from app.providers.hdd import OfflineHDDProvider
|
||||
|
||||
|
||||
def test_hdd_initialization(tmp_path):
|
||||
"""Verifies that HDD provider correctly prepares the disk structure."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
|
||||
success = provider.initialize_media("DISK01")
|
||||
assert success is True
|
||||
assert (mount / ".tapehoard_id").exists()
|
||||
assert (mount / ".tapehoard_id").read_text().strip() == "DISK01"
|
||||
assert (mount / "tapehoard_backups" / "archives").is_dir()
|
||||
|
||||
|
||||
def test_hdd_identification(tmp_path):
|
||||
"""Verifies that HDD provider can identify a disk by its ID file."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
mount.mkdir()
|
||||
(mount / ".tapehoard_id").write_text("DISK02")
|
||||
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
assert provider.identify_media() == "DISK02"
|
||||
|
||||
|
||||
def test_hdd_write_sequential_logic(tmp_path):
|
||||
"""Verifies that archives are numbered and padded correctly."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
provider.initialize_media("DISK03")
|
||||
|
||||
# Write first archive
|
||||
loc1 = provider.write_archive("DISK03", io.BytesIO(b"archive1"))
|
||||
assert loc1 == "0"
|
||||
assert (mount / "tapehoard_backups" / "archives" / "000000.tar").exists()
|
||||
|
||||
# Write second archive
|
||||
loc2 = provider.write_archive("DISK03", io.BytesIO(b"archive2"))
|
||||
assert loc2 == "1"
|
||||
assert (mount / "tapehoard_backups" / "archives" / "000001.tar").exists()
|
||||
|
||||
# Read them back
|
||||
with provider.read_archive("DISK03", "0") as f:
|
||||
assert f.read() == b"archive1"
|
||||
with provider.read_archive("DISK03", "1") as f:
|
||||
assert f.read() == b"archive2"
|
||||
|
||||
|
||||
def test_hdd_direct_write_and_traversal_guard(tmp_path):
|
||||
"""Verifies direct file copies and security guards."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
|
||||
# Valid direct write
|
||||
rel_path = "photos/holiday.jpg"
|
||||
loc = provider.write_file_direct("DISK04", rel_path, io.BytesIO(b"imagedata"))
|
||||
assert loc == rel_path
|
||||
|
||||
target = mount / "tapehoard_backups" / "objects" / rel_path
|
||||
assert target.exists()
|
||||
assert target.read_bytes() == b"imagedata"
|
||||
|
||||
# Read back (Format Negotiation)
|
||||
with provider.read_archive("DISK04", rel_path) as f:
|
||||
assert f.read() == b"imagedata"
|
||||
|
||||
# Traversal Attempt
|
||||
with pytest.raises(ValueError, match="Invalid relative path"):
|
||||
provider.write_file_direct(
|
||||
"DISK04", "escape/../../secret.txt", io.BytesIO(b"bad")
|
||||
)
|
||||
|
||||
|
||||
def test_hdd_online_check_with_uuid(tmp_path, mocker):
|
||||
"""Verifies that online check validates both path and UUID."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
mount.mkdir()
|
||||
|
||||
# Mock UUID utility
|
||||
mock_get_uuid = mocker.patch("app.core.utils.get_path_uuid")
|
||||
mock_get_uuid.return_value = "UUID-123"
|
||||
|
||||
# CASE 1: Correct UUID
|
||||
provider_ok = OfflineHDDProvider(
|
||||
{"mount_path": str(mount), "device_uuid": "UUID-123"}
|
||||
)
|
||||
assert provider_ok.check_online() is True
|
||||
|
||||
# CASE 2: Mismatched UUID
|
||||
provider_fail = OfflineHDDProvider(
|
||||
{"mount_path": str(mount), "device_uuid": "UUID-999"}
|
||||
)
|
||||
assert provider_fail.check_online() is False
|
||||
|
||||
# CASE 3: No UUID (Path only)
|
||||
provider_path_only = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
assert provider_path_only.check_online() is True
|
||||
@@ -0,0 +1,122 @@
|
||||
from app.providers.tape import LTOProvider
|
||||
|
||||
|
||||
def test_lto_compression_control(mocker):
|
||||
"""Verifies that LTOProvider sends the correct mt commands for compression."""
|
||||
|
||||
# Mock subprocess.run and os.path.exists
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
|
||||
# CASE 1: Compression Enabled
|
||||
provider_on = LTOProvider({"device_path": "/dev/nst0", "compression": True})
|
||||
provider_on._setup_compression()
|
||||
|
||||
# Expectation: mt compression 1
|
||||
mock_run.assert_called_with(
|
||||
["mt", "-f", "/dev/nst0", "compression", "1"], check=True, capture_output=True
|
||||
)
|
||||
|
||||
# CASE 2: Compression Disabled
|
||||
mock_run.reset_mock()
|
||||
provider_off = LTOProvider({"device_path": "/dev/nst0", "compression": False})
|
||||
provider_off._setup_compression()
|
||||
|
||||
# Expectation: mt compression 0
|
||||
mock_run.assert_called_with(
|
||||
["mt", "-f", "/dev/nst0", "compression", "0"], check=True, capture_output=True
|
||||
)
|
||||
|
||||
|
||||
def test_lto_insertion_detection(mocker):
|
||||
"""Verifies that needs_registration triggers only on new insertion."""
|
||||
|
||||
device = "/dev/nst0"
|
||||
# 1. Start with Empty State
|
||||
LTOProvider._lkg_state = {
|
||||
device: {"drive": {}, "mam": {}, "online": False, "last_check": 0.0}
|
||||
}
|
||||
provider = LTOProvider({"device_path": device})
|
||||
|
||||
# Mock OS path existence
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
|
||||
# Mock subprocess.run to return "READY" for status and valid MAM attributes
|
||||
def mock_subprocess(cmd, **kwargs):
|
||||
m = mocker.MagicMock()
|
||||
m.returncode = 0
|
||||
if "status" in cmd:
|
||||
m.stdout = "READY"
|
||||
elif "sg_read_attr" in cmd:
|
||||
# Mock raw MAM data bytes (minimal valid structure)
|
||||
# barcode "TAPE01" is at the end of the mapping usually
|
||||
# But we patch get_mam_info to be easier
|
||||
pass
|
||||
return m
|
||||
|
||||
mocker.patch("subprocess.run", side_effect=mock_subprocess)
|
||||
|
||||
# We patch get_mam_info because parsing raw bytes in a test is overkill
|
||||
mocker.patch.object(provider, "get_mam_info", return_value={"barcode": "TAPE01"})
|
||||
mocker.patch.object(provider, "get_drive_info", return_value={"vendor": "HP"})
|
||||
|
||||
# Execution: First poll after "insertion" (transition from offline LKG to online)
|
||||
info = provider.get_live_info(force=True)
|
||||
|
||||
# EXPECTATION: needs_registration should be True
|
||||
assert info["online"] is True
|
||||
assert info["identity"] == "TAPE01"
|
||||
assert info["needs_registration"] is True
|
||||
|
||||
# 2. Second poll (already online)
|
||||
# LKG state is now updated by the provider during the first poll
|
||||
info_second = provider.get_live_info(force=True)
|
||||
|
||||
# EXPECTATION: needs_registration should be False (already saw this tape)
|
||||
assert info_second["needs_registration"] is False
|
||||
|
||||
|
||||
def test_lto_mam_parsing_logic(mocker):
|
||||
"""Verifies the parsing of raw SCSI attribute bytes into human-readable metadata."""
|
||||
import struct
|
||||
|
||||
device = "/dev/nst0"
|
||||
provider = LTOProvider({"device_path": device})
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
|
||||
# 1. CONSTRUCT RAW PAYLOAD
|
||||
# SCSI MAM raw format: [TotalLen(4)] + { [ID(2)][Flags(1)][Len(2)][Value] }...
|
||||
|
||||
def pack_attr(attr_id, value_bytes):
|
||||
return struct.pack(">HBH", attr_id, 0, len(value_bytes)) + value_bytes
|
||||
|
||||
# Remaining Capacity (0x0000): 500GB (512000 MiB)
|
||||
attr_rem = pack_attr(0x0000, (512000).to_bytes(8, "big"))
|
||||
# Max Capacity (0x0001): 1.5TB (1536000 MiB - LTO-5 raw)
|
||||
attr_max = pack_attr(0x0001, (1536000).to_bytes(8, "big"))
|
||||
# Tape Alert Flags (0x0002): Bit 20 is "Clean Now"
|
||||
# Bit 20 from left (64-bit int) -> (1 << (64-20))
|
||||
alert_flags = 1 << (64 - 20)
|
||||
attr_alerts = pack_attr(0x0002, alert_flags.to_bytes(8, "big"))
|
||||
# Barcode (0x0806): "TAPE123"
|
||||
attr_barcode = pack_attr(0x0806, b"TAPE123\x00\x00") # null padded
|
||||
|
||||
payload_body = attr_rem + attr_max + attr_alerts + attr_barcode
|
||||
full_payload = struct.pack(">I", len(payload_body)) + payload_body
|
||||
|
||||
# 2. MOCK SUBPROCESS
|
||||
mock_res = mocker.MagicMock()
|
||||
mock_res.returncode = 0
|
||||
mock_res.stdout = full_payload
|
||||
mocker.patch("subprocess.run", return_value=mock_res)
|
||||
|
||||
# 3. EXECUTION
|
||||
mam = provider.get_mam_info(force=True)
|
||||
|
||||
# 4. EXPECTATIONS
|
||||
assert mam["remaining_capacity_mib"] == 512000
|
||||
assert mam["max_capacity_mib"] == 1536000
|
||||
assert mam["barcode"] == "TAPE123"
|
||||
assert "Clean Now" in mam["alerts"]
|
||||
# 1.5TB should be identified as LTO-5
|
||||
assert mam["generation_label"] == "LTO-5"
|
||||
@@ -1,5 +1,6 @@
|
||||
import tarfile
|
||||
import io
|
||||
import pytest
|
||||
from app.services.archiver import ArchiverService, RangeFile
|
||||
from app.db import models
|
||||
|
||||
@@ -24,17 +25,11 @@ def test_get_unbacked_files(db_session):
|
||||
archiver = ArchiverService()
|
||||
|
||||
# File 1: Completely unbacked
|
||||
f1 = models.FilesystemState(
|
||||
file_path="/data/new.txt", size=100, mtime=1, is_indexed=True
|
||||
)
|
||||
f1 = models.FilesystemState(file_path="/data/new.txt", size=100, mtime=1)
|
||||
# File 2: Partially backed (split)
|
||||
f2 = models.FilesystemState(
|
||||
file_path="/data/split.bin", size=1000, mtime=1, is_indexed=True
|
||||
)
|
||||
f2 = models.FilesystemState(file_path="/data/split.bin", size=1000, mtime=1)
|
||||
# File 3: Fully backed
|
||||
f3 = models.FilesystemState(
|
||||
file_path="/data/done.txt", size=50, mtime=1, is_indexed=True
|
||||
)
|
||||
f3 = models.FilesystemState(file_path="/data/done.txt", size=50, mtime=1)
|
||||
db_session.add_all([f1, f2, f3])
|
||||
db_session.flush()
|
||||
|
||||
@@ -89,18 +84,12 @@ def test_assemble_backup_batch(db_session):
|
||||
|
||||
# Create files
|
||||
size_200mb = 200 * 1024 * 1024
|
||||
f1 = models.FilesystemState(
|
||||
file_path="/f1.bin", size=size_200mb, mtime=1, is_indexed=True
|
||||
)
|
||||
f1 = models.FilesystemState(file_path="/f1.bin", size=size_200mb, mtime=1)
|
||||
# f2 is 200MB, would fit on fresh tape. Should be skipped on this 300MB tape
|
||||
# after f1 (200MB) is added, because only 100MB is left.
|
||||
f2 = models.FilesystemState(
|
||||
file_path="/f2.bin", size=size_200mb, mtime=1, is_indexed=True
|
||||
)
|
||||
f2 = models.FilesystemState(file_path="/f2.bin", size=size_200mb, mtime=1)
|
||||
# f3 is 500MB, larger than total capacity (300MB). SHOULD be split.
|
||||
f3 = models.FilesystemState(
|
||||
file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1, is_indexed=True
|
||||
)
|
||||
f3 = models.FilesystemState(file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1)
|
||||
db_session.add_all([f1, f2, f3])
|
||||
db_session.commit()
|
||||
|
||||
@@ -138,7 +127,6 @@ def test_run_backup_mocked(db_session, mocker, tmp_path):
|
||||
file_path=str(source_file),
|
||||
size=source_file.stat().st_size,
|
||||
mtime=1,
|
||||
is_indexed=True,
|
||||
sha256_hash="hash1",
|
||||
)
|
||||
db_session.add(f1)
|
||||
@@ -164,14 +152,227 @@ def test_run_backup_mocked(db_session, mocker, tmp_path):
|
||||
db_session.expire_all()
|
||||
assert media.bytes_used > 0
|
||||
|
||||
# Verify FileVersion was recorded
|
||||
|
||||
def test_archiver_saturated_media_logic(db_session, mocker, tmp_path):
|
||||
"""Verifies that media is marked full and priority ceded based on hardware feedback."""
|
||||
staging = tmp_path / "staging"
|
||||
staging.mkdir()
|
||||
archiver = ArchiverService(staging_directory=str(staging))
|
||||
|
||||
# Setup Media (HDD for simple mocking)
|
||||
media = models.StorageMedia(
|
||||
media_type="hdd",
|
||||
identifier="FULL_DISK",
|
||||
capacity=1000,
|
||||
status="active",
|
||||
bytes_used=0,
|
||||
priority_index=1,
|
||||
)
|
||||
db_session.add(media)
|
||||
|
||||
# Other media to check priority reordering
|
||||
media2 = models.StorageMedia(
|
||||
media_type="hdd",
|
||||
identifier="NEXT_DISK",
|
||||
capacity=1000,
|
||||
status="active",
|
||||
bytes_used=0,
|
||||
priority_index=2,
|
||||
)
|
||||
db_session.add(media2)
|
||||
|
||||
# Setup a small file to trigger the loop
|
||||
source_file = tmp_path / "small.txt"
|
||||
source_file.write_bytes(b"data")
|
||||
f1 = models.FilesystemState(
|
||||
file_path=str(source_file),
|
||||
size=4,
|
||||
mtime=1,
|
||||
sha256_hash="hash_small",
|
||||
)
|
||||
db_session.add(f1)
|
||||
db_session.commit()
|
||||
|
||||
# Mock Provider to report 99% utilization
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.capabilities = {"supports_random_access": True}
|
||||
mock_provider.identify_media.return_value = "FULL_DISK"
|
||||
mock_provider.prepare_for_write.return_value = True
|
||||
# Ensure write_file_direct returns a string, not a MagicMock
|
||||
mock_provider.write_file_direct.return_value = "LOC_1"
|
||||
# FORCE hardware utilization report to 99%
|
||||
mock_provider.get_utilization.return_value = 0.99
|
||||
|
||||
mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider)
|
||||
|
||||
from app.services.scanner import JobManager
|
||||
|
||||
job = JobManager.create_job(db_session, "BACKUP")
|
||||
|
||||
# Run archival
|
||||
archiver.run_backup(db_session, media.id, job.id)
|
||||
|
||||
# EXPECTATION:
|
||||
# 1. Media should be marked "full"
|
||||
# 2. Priority should be moved to the end (higher than media2)
|
||||
db_session.expire_all()
|
||||
assert media.status == "full"
|
||||
assert media.priority_index > media2.priority_index
|
||||
|
||||
|
||||
def test_archiver_chunking_logic(db_session, mocker, tmp_path):
|
||||
"""Verifies that large backup batches are split into appropriate chunks."""
|
||||
staging = tmp_path / "staging"
|
||||
staging.mkdir()
|
||||
archiver = ArchiverService(staging_directory=str(staging))
|
||||
|
||||
# Setup Media (Capacity 100GB, target chunk size ~1GB)
|
||||
media = models.StorageMedia(
|
||||
media_type="tape",
|
||||
identifier="TAPE_001",
|
||||
capacity=100 * 1024 * 1024 * 1024,
|
||||
status="active",
|
||||
bytes_used=0,
|
||||
)
|
||||
db_session.add(media)
|
||||
|
||||
# 1. Add 10 small files (100MB each) -> Should fit in one 1GB chunk
|
||||
for i in range(10):
|
||||
source_file = tmp_path / f"small_{i}.bin"
|
||||
source_file.write_bytes(b"0" * (100 * 1024 * 1024))
|
||||
f = models.FilesystemState(
|
||||
file_path=str(source_file),
|
||||
size=100 * 1024 * 1024,
|
||||
mtime=1,
|
||||
sha256_hash=f"hash_s_{i}",
|
||||
)
|
||||
db_session.add(f)
|
||||
|
||||
# 2. Add 1 large file (5GB) -> Exceeds chunk size, should trigger its own chunk
|
||||
large_file = tmp_path / "large.bin"
|
||||
large_file.write_bytes(b"0" * 10) # Mock small content for large size metadata
|
||||
f_large = models.FilesystemState(
|
||||
file_path=str(large_file),
|
||||
size=5 * 1024 * 1024 * 1024,
|
||||
mtime=1,
|
||||
sha256_hash="hash_large",
|
||||
)
|
||||
# Monkeypatch stat size for test efficiency (don't actually write 5GB to tmp)
|
||||
mocker.patch("os.path.getsize", return_value=5 * 1024 * 1024 * 1024)
|
||||
# Also patch RangeFile to avoid actual reads
|
||||
mocker.patch(
|
||||
"app.services.archiver.RangeFile.__enter__", return_value=mocker.MagicMock()
|
||||
)
|
||||
mocker.patch("app.services.archiver.RangeFile.__exit__")
|
||||
|
||||
db_session.add(f_large)
|
||||
db_session.commit()
|
||||
|
||||
# Mock Provider
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.capabilities = {"supports_random_access": False}
|
||||
mock_provider.identify_media.return_value = "TAPE_001"
|
||||
mock_provider.prepare_for_write.return_value = True
|
||||
mock_provider.write_archive.return_value = "1"
|
||||
|
||||
mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider)
|
||||
|
||||
from app.services.scanner import JobManager
|
||||
|
||||
job = JobManager.create_job(db_session, "BACKUP")
|
||||
|
||||
# Run archival
|
||||
archiver.run_backup(db_session, media.id, job.id)
|
||||
|
||||
# EXPECTATION:
|
||||
# write_archive should be called TWICE:
|
||||
# 1. Once for the 10 small files (combined chunk)
|
||||
# 2. Once for the large file (independent chunk)
|
||||
assert mock_provider.write_archive.call_count == 2
|
||||
|
||||
# Verify FileVersion was recorded for the large file
|
||||
version = (
|
||||
db_session.query(models.FileVersion)
|
||||
.filter_by(filesystem_state_id=f1.id)
|
||||
.filter_by(filesystem_state_id=f_large.id)
|
||||
.first()
|
||||
)
|
||||
assert version is not None
|
||||
assert version.file_number == "ARCH_1"
|
||||
assert version.file_number == "1"
|
||||
|
||||
|
||||
def test_range_file_alignment_guard(tmp_path):
|
||||
"""Verifies that RangeFile pads truncated files with nulls to maintain tar alignment."""
|
||||
from app.services.archiver import RangeFile
|
||||
|
||||
# Create a 50 byte file
|
||||
f = tmp_path / "truncated.bin"
|
||||
f.write_bytes(b"A" * 50)
|
||||
|
||||
# Initialize RangeFile expecting 100 bytes
|
||||
with RangeFile(str(f), offset_start=0, length=100) as rf:
|
||||
data = rf.read(100)
|
||||
|
||||
# EXPECTATION:
|
||||
# 1. Total length must be exactly 100
|
||||
# 2. First 50 are 'A's
|
||||
# 3. Last 50 are nulls '\x00'
|
||||
assert len(data) == 100
|
||||
assert data[:50] == b"A" * 50
|
||||
assert data[50:] == b"\x00" * 50
|
||||
|
||||
|
||||
def test_path_traversal_protection():
|
||||
"""Verifies that the restorer blocks path traversal attempts."""
|
||||
archiver = ArchiverService()
|
||||
base = "/restores/my_backup"
|
||||
|
||||
# CASE 1: Valid Path
|
||||
safe_path = archiver._sanitize_recovery_path(base, "photos/image.jpg")
|
||||
assert safe_path.startswith(base)
|
||||
|
||||
# CASE 2: Traversal Attempt (Relative)
|
||||
with pytest.raises(PermissionError, match="Restricted path traversal"):
|
||||
archiver._sanitize_recovery_path(base, "../../etc/shadow")
|
||||
|
||||
# CASE 3: Traversal Attempt (Absolute-style relative that escapes)
|
||||
with pytest.raises(PermissionError, match="Restricted path traversal"):
|
||||
archiver._sanitize_recovery_path(base, "/../../etc/shadow")
|
||||
|
||||
|
||||
def test_archiver_restoration_negotiation(db_session, mocker, tmp_path):
|
||||
"""Verifies that the restorer chooses the correct format (Native vs Tar)."""
|
||||
archiver = ArchiverService()
|
||||
|
||||
# Setup Mocks
|
||||
mock_media = mocker.MagicMock()
|
||||
mock_media.media_type = "cloud"
|
||||
mock_media.identifier = "BUCKET1"
|
||||
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.identify_media.return_value = "BUCKET1"
|
||||
mock_provider.check_online.return_value = True
|
||||
|
||||
# 1. TEST NATIVE NEGOTIATION
|
||||
# Expectation: random_access=True + non-tar ID -> is_tar=False
|
||||
mock_provider.capabilities = {"supports_random_access": True}
|
||||
|
||||
# CASE A: Obfuscated object (Native)
|
||||
is_tar_native = archiver._test_is_tar_logic(
|
||||
mock_provider, mock_media, "objects/a1/b2/hash"
|
||||
)
|
||||
assert is_tar_native is False
|
||||
|
||||
# CASE B: Obfuscated archive (Tar)
|
||||
is_tar_cloud_archive = archiver._test_is_tar_logic(
|
||||
mock_provider, mock_media, "archives/a1/b2/hash"
|
||||
)
|
||||
assert is_tar_cloud_archive is True
|
||||
|
||||
# CASE C: Tape (Always Tar)
|
||||
mock_media.media_type = "tape"
|
||||
mock_provider.capabilities = {"supports_random_access": False}
|
||||
is_tar_tape = archiver._test_is_tar_logic(mock_provider, mock_media, "1")
|
||||
assert is_tar_tape is True
|
||||
|
||||
|
||||
def test_run_restore_mocked(db_session, mocker, tmp_path):
|
||||
@@ -206,6 +407,9 @@ def test_run_restore_mocked(db_session, mocker, tmp_path):
|
||||
# Mock Provider & Tar Stream
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.identify_media.return_value = "RESTORE_ME"
|
||||
mock_provider.check_online.return_value = True
|
||||
# FORCE tar path by disabling random access
|
||||
mock_provider.capabilities = {"supports_random_access": False}
|
||||
|
||||
# Create a mock tar stream containing the file
|
||||
buf = io.BytesIO()
|
||||
|
||||
@@ -91,7 +91,7 @@ def test_metadata_update_on_change(db_session):
|
||||
|
||||
# Initial state
|
||||
f1 = models.FilesystemState(
|
||||
file_path="/data/up.txt", size=50, mtime=1, is_indexed=True, sha256_hash="old"
|
||||
file_path="/data/up.txt", size=50, mtime=1, sha256_hash="old"
|
||||
)
|
||||
db_session.add(f1)
|
||||
db_session.commit()
|
||||
@@ -103,7 +103,7 @@ def test_metadata_update_on_change(db_session):
|
||||
|
||||
db_session.refresh(f1)
|
||||
assert f1.size == 999
|
||||
assert f1.is_indexed is False # Should be reset for re-hashing
|
||||
assert f1.sha256_hash is None # Should be reset for re-hashing
|
||||
|
||||
|
||||
def test_scan_sources_mocked(db_session, mocker):
|
||||
@@ -141,7 +141,7 @@ def test_run_hashing_mocked(db_session, mocker):
|
||||
|
||||
# Setup unindexed file
|
||||
f = models.FilesystemState(
|
||||
file_path="/data/hash.me", size=10, mtime=1, is_indexed=False, is_ignored=False
|
||||
file_path="/data/hash.me", size=10, mtime=1, is_ignored=False
|
||||
)
|
||||
db_session.add(f)
|
||||
db_session.commit()
|
||||
@@ -154,5 +154,4 @@ def test_run_hashing_mocked(db_session, mocker):
|
||||
scanner.run_hashing()
|
||||
|
||||
db_session.refresh(f)
|
||||
assert f.is_indexed is True
|
||||
assert f.sha256_hash == "mocked_hash"
|
||||
|
||||
@@ -15,7 +15,11 @@ services:
|
||||
# devices:
|
||||
# - /dev/nst0:/dev/nst0
|
||||
environment:
|
||||
- PUID=1000
|
||||
- PGID=1000
|
||||
- PUID=0
|
||||
- PGID=0
|
||||
- RUN_AS_ROOT=true
|
||||
- TZ=UTC
|
||||
user: "0:0"
|
||||
cap_add:
|
||||
- SYS_RAWIO
|
||||
restart: unless-stopped
|
||||
|
||||
@@ -270,10 +270,7 @@ export type ItemMetadataSchema = {
|
||||
* Sha256 Hash
|
||||
*/
|
||||
sha256_hash?: string | null;
|
||||
/**
|
||||
* Is Indexed
|
||||
*/
|
||||
is_indexed?: boolean;
|
||||
|
||||
/**
|
||||
* Is Ignored
|
||||
*/
|
||||
|
||||
@@ -37,7 +37,7 @@ lint:
|
||||
# Run all backend tests
|
||||
pytest:
|
||||
@echo "Running backend tests..."
|
||||
cd backend && uv run pytest
|
||||
cd backend && COVERAGE_CORE=sysmon uv run pytest
|
||||
|
||||
# Run frontend checks
|
||||
check:
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
[Unit]
|
||||
Description=TapeHoard Backup Manager
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=root
|
||||
WorkingDirectory=/opt/tapehoard
|
||||
# We use root to ensure direct /dev/nst0 and raw SCSI access
|
||||
ExecStart=/usr/bin/just dev
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
# Give the tape drive time to rewind on shutdown
|
||||
TimeoutStopSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||