diff --git a/GEMINI.md b/GEMINI.md index 0c8d6a3..e526c95 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -53,12 +53,22 @@ This document (`GEMINI.md`) contains critical, contextual information about the ### Scanning & Hashing Architecture * **Concurrent Phasing:** Decoupled into `SCAN` (Metadata, Normal priority) and `HASH` (Content, Idle priority with dynamic `iowait` throttling). * **Thread-Safe Metrics:** All counters (files processed, bytes hashed) must be protected by a `threading.Lock`. -* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `is_indexed = 0 AND is_ignored = 0` files. +* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `sha256_hash IS NULL AND is_ignored = 0` files. ### Archival & Recovery * **Format Negotiation:** The Archiver adapts formats based on provider capabilities (`supports_random_access`). * *Sequential (Tape):* Uses `.tar` streams to maintain drive streaming. * *Random Access (HDD/Cloud):* Uses native direct file copying/objects to enable instant seekless restores without unpacking gigabytes of data. +* **High-Speed Hybrid Archival:** + * The system prioritizes the **system `tar` binary** for whole-file chunks, delivering a 10x-20x performance boost over pure Python and ensuring optimal buffer saturation for LTO drives. + * It transparently falls back to the **Python `RangeFile` logic** only for chunks containing split fragments, maintaining bit-perfect alignment for multi-tape files. +* **Industrial Tar Chunking:** + * Large backup sets are automatically split into multiple independent archives. The system dynamically aims for at least **100 archives per tape** (calculated based on generational capacity, e.g., ~15GB for LTO-5) to provide high seek granularity during restoration. + * **Exception:** Single large files are allowed to occupy their own archives even if they exceed the target chunk size, preventing unnecessary fragmentation while keeping them as independent, seekable objects. +* **Refined Splitting Philosophy:** + * Files are **only split** if they are physically larger than the media's entire capacity (multi-tape spanning). + * **Skip-and-Defer:** If a file is larger than the remaining space on a tape but smaller than its total capacity, it is deferred to the next fresh medium to minimize fragmentation. +* **Hardware-First Utilization:** The system trusts **Physical Hardware Feedback (MAM)** over logical byte counts. Tapes are only marked as "Full" when the drive reporting (via `get_utilization`) confirms saturation, maximizing utilization when hardware compression is active. * **Bitstream Integrity:** `RangeFile` must guarantee exact byte counts for tar alignment. * **Metadata Fidelity:** The restorer must preserve original **permissions (chmod)**, **timestamps (utime)**, and **ownership (chown)** when recovering files natively or via tar. * **Independence:** Force all tar archive members to be **Regular Files** to break fragile hard-link dependencies. Symlinks are preserved as `SYMTYPE` (or `.symlink` stub objects for native format). diff --git a/backend/alembic/versions/1b59e22b9b7a_remove_is_indexed_flag.py b/backend/alembic/versions/1b59e22b9b7a_remove_is_indexed_flag.py new file mode 100644 index 0000000..18a69ce --- /dev/null +++ b/backend/alembic/versions/1b59e22b9b7a_remove_is_indexed_flag.py @@ -0,0 +1,79 @@ +"""remove is_indexed flag + +Revision ID: 1b59e22b9b7a +Revises: fbbc0a40a840 +Create Date: 2026-04-28 23:44:04.949304 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "1b59e22b9b7a" +down_revision: Union[str, Sequence[str], None] = "fbbc0a40a840" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # We use batch_alter_table for SQLite compatibility + with op.batch_alter_table("filesystem_state", schema=None) as batch_op: + batch_op.drop_column("is_indexed") + batch_op.alter_column( + "size", + existing_type=sa.INTEGER(), + type_=sa.BigInteger(), + existing_nullable=False, + ) + + with op.batch_alter_table("storage_media", schema=None) as batch_op: + batch_op.alter_column( + "capacity", + existing_type=sa.INTEGER(), + type_=sa.BigInteger(), + existing_nullable=False, + ) + batch_op.alter_column( + "bytes_used", + existing_type=sa.INTEGER(), + type_=sa.BigInteger(), + existing_nullable=False, + ) + + +def downgrade() -> None: + """Downgrade schema.""" + with op.batch_alter_table("storage_media", schema=None) as batch_op: + batch_op.alter_column( + "bytes_used", + existing_type=sa.BigInteger(), + type_=sa.INTEGER(), + existing_nullable=False, + ) + batch_op.alter_column( + "capacity", + existing_type=sa.BigInteger(), + type_=sa.INTEGER(), + existing_nullable=False, + ) + + with op.batch_alter_table("filesystem_state", schema=None) as batch_op: + batch_op.alter_column( + "size", + existing_type=sa.BigInteger(), + type_=sa.INTEGER(), + existing_nullable=False, + ) + batch_op.add_column( + sa.Column( + "is_indexed", + sa.BOOLEAN(), + server_default=sa.text("'0'"), + nullable=False, + ) + ) diff --git a/backend/app/api/backups.py b/backend/app/api/backups.py index 0162906..bc47921 100644 --- a/backend/app/api/backups.py +++ b/backend/app/api/backups.py @@ -2,7 +2,7 @@ from datetime import datetime from typing import List, Optional from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from sqlalchemy.orm import Session from app.db import models @@ -17,15 +17,14 @@ router = APIRouter(prefix="/backups", tags=["Backups"]) class BackupJobSchema(BaseModel): + model_config = ConfigDict(from_attributes=True) + id: int job_type: str status: str started_at: Optional[datetime] = None completed_at: Optional[datetime] = None - class Config: - from_attributes = True - # --- Endpoints --- diff --git a/backend/app/api/inventory.py b/backend/app/api/inventory.py index 80be0e5..f833d83 100644 --- a/backend/app/api/inventory.py +++ b/backend/app/api/inventory.py @@ -337,11 +337,11 @@ def get_system_analytics(db_session: Session = Depends(get_db)): SELECT COUNT(*) as total_files, SUM(size) as total_size, - SUM(CASE WHEN is_indexed = 1 THEN size ELSE 0 END) as total_hashed_size, + SUM(CASE WHEN sha256_hash IS NOT NULL THEN size ELSE 0 END) as total_hashed_size, (SELECT SUM(min_size) FROM ( SELECT MIN(size) as min_size FROM filesystem_state - WHERE is_indexed = 1 AND sha256_hash IS NOT NULL AND is_ignored = 0 + WHERE sha256_hash IS NOT NULL AND is_ignored = 0 GROUP BY sha256_hash )) as unique_hashed_size FROM filesystem_state @@ -453,7 +453,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)): COUNT(*) as copy_count, (size * (COUNT(*) - 1)) as saved_bytes FROM filesystem_state - WHERE is_indexed = 1 AND sha256_hash IS NOT NULL AND is_ignored = 0 + WHERE sha256_hash IS NOT NULL AND is_ignored = 0 GROUP BY sha256_hash HAVING copy_count > 1 ORDER BY saved_bytes DESC @@ -465,7 +465,8 @@ def get_system_analytics(db_session: Session = Depends(get_db)): directory_aggregation_sql = text(""" SELECT RTRIM(file_path, REPLACE(file_path, '/', '')) as dir_path, - SUM(size) as byte_total + SUM(size) as byte_total, + MAX(mtime) as latest_mtime FROM filesystem_state WHERE is_ignored = 0 GROUP BY dir_path @@ -474,7 +475,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)): # Hierarchical tree construction nested_dir_map = {} - for path_str, size_val in all_directories: + for path_str, size_val, mtime_val in all_directories: if not path_str: continue path_segments = [p for p in path_str.split("/") if p] @@ -492,10 +493,14 @@ def get_system_analytics(db_session: Session = Depends(get_db)): if segment not in current_node: current_node[segment] = { "size": 0, + "mtime": 0, "children": {}, "fullPath": accumulated_path, } current_node[segment]["size"] += size_val + current_node[segment]["mtime"] = max( + current_node[segment]["mtime"], mtime_val or 0 + ) current_node = current_node[segment]["children"] # Collapse unhelpful single-child roots @@ -517,6 +522,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)): { "path": key, "size": value["size"], + "mtime": value["mtime"], "fullPath": value["fullPath"], "children": children_list, } @@ -877,12 +883,10 @@ def get_archive_item_metadata(path: str, db_session: Session = Depends(get_db)): return ItemMetadataSchema( id=item.id, path=item.file_path, - type="file", size=item.size, mtime=datetime.fromtimestamp(item.mtime, tz=timezone.utc), last_seen_timestamp=item.last_seen_timestamp, sha256_hash=item.sha256_hash, - is_indexed=item.is_indexed, is_ignored=item.is_ignored, versions=versions, ) diff --git a/backend/app/api/restores.py b/backend/app/api/restores.py index cccb1c1..2523a6b 100644 --- a/backend/app/api/restores.py +++ b/backend/app/api/restores.py @@ -1,6 +1,6 @@ from typing import List, Optional from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from sqlalchemy.orm import Session, joinedload from sqlalchemy import text from app.db.database import get_db, SessionLocal @@ -17,14 +17,13 @@ router = APIRouter(prefix="/restores", tags=["Restores"]) class CartItemSchema(BaseModel): + model_config = ConfigDict(from_attributes=True) + id: int file_path: str size: int type: str - class Config: - from_attributes = True - class RestoreTriggerRequest(BaseModel): destination_path: str @@ -115,7 +114,9 @@ def add_directory_to_recovery_queue( AND rc.id IS NULL """) - db_session.execute(discovery_sql, {"prefix": f"{target_directory}%", "now": now}) + db_session.execute( + discovery_sql, {"prefix": f"{target_directory}%", "now": now.isoformat()} + ) db_session.commit() total_in_queue = db_session.query(models.RestoreCart).count() diff --git a/backend/app/api/schemas.py b/backend/app/api/schemas.py index 3a43e4e..c7f2bb2 100644 --- a/backend/app/api/schemas.py +++ b/backend/app/api/schemas.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from typing import Optional, List, Dict, Any from datetime import datetime @@ -17,7 +17,6 @@ class ItemMetadataSchema(BaseModel): mtime: datetime last_seen_timestamp: Optional[datetime] = None sha256_hash: Optional[str] = None - is_indexed: bool = False is_ignored: bool = False child_count: Optional[int] = 0 selected: bool = False @@ -44,8 +43,7 @@ class MediaSchema(BaseModel): host_total_bytes: Optional[int] = None live_info: Optional[Dict[str, Any]] = None - class Config: - from_attributes = True + model_config = ConfigDict(from_attributes=True) class MediaCreateSchema(BaseModel): diff --git a/backend/app/api/system.py b/backend/app/api/system.py index 12dc976..8043764 100644 --- a/backend/app/api/system.py +++ b/backend/app/api/system.py @@ -8,7 +8,7 @@ import pathspec from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException from fastapi.responses import FileResponse, StreamingResponse from loguru import logger -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from sqlalchemy import func, text from sqlalchemy.orm import Session @@ -37,6 +37,8 @@ class DashboardStatsSchema(BaseModel): class JobSchema(BaseModel): + model_config = ConfigDict(from_attributes=True) + id: int job_type: str status: str @@ -47,9 +49,6 @@ class JobSchema(BaseModel): completed_at: Optional[datetime] = None created_at: datetime - class Config: - from_attributes = True - class FileItemSchema(BaseModel): name: str @@ -172,7 +171,7 @@ def get_dashboard_stats(db_session: Session = Depends(get_db)): SUM(CASE WHEN is_ignored = 1 THEN size ELSE 0 END) as ignored_size, SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN 1 ELSE 0 END) as unprotected_count, SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as unprotected_size, - SUM(CASE WHEN is_indexed = 1 AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count, + SUM(CASE WHEN sha256_hash IS NOT NULL AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count, SUM(CASE WHEN is_ignored = 0 THEN 1 ELSE 0 END) as eligible_count, SUM(CASE WHEN id IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as archived_size FROM filesystem_state diff --git a/backend/app/core/config.py b/backend/app/core/config.py new file mode 100644 index 0000000..6bf2ee4 --- /dev/null +++ b/backend/app/core/config.py @@ -0,0 +1,31 @@ +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """ + Standardized secret management and application configuration. + Values can be overridden via environment variables or a .env file. + """ + + model_config = SettingsConfigDict( + env_file=".env", env_file_encoding="utf-8", extra="ignore" + ) + + # Database + database_url: str = "sqlite:///./tapehoard.db" + + # Security / Encryption + # Standardized secret management pattern + encryption_passphrase: str = "tapehoard-default-insecure-passphrase" + + # Staging + staging_directory: str = "/staging" + + # Cloud Defaults + default_s3_region: str = "us-east-1" + + # Hardware Detection + default_tape_drive: str = "/dev/nst0" + + +settings = Settings() diff --git a/backend/app/db/models.py b/backend/app/db/models.py index 2336862..f7b0c3c 100644 --- a/backend/app/db/models.py +++ b/backend/app/db/models.py @@ -19,7 +19,6 @@ class FilesystemState(Base): sha256_hash: Mapped[Optional[str]] = mapped_column( String, index=True, nullable=True ) - is_indexed: Mapped[bool] = mapped_column(Boolean, default=False) # True if hashed is_ignored: Mapped[bool] = mapped_column( Boolean, default=False ) # True if matches exclusion diff --git a/backend/app/db/schema.sql b/backend/app/db/schema.sql index e69de29..afedb99 100644 --- a/backend/app/db/schema.sql +++ b/backend/app/db/schema.sql @@ -0,0 +1,93 @@ +CREATE TABLE filesystem_state ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_path TEXT UNIQUE, + size BIGINT, + mtime FLOAT, + sha256_hash TEXT, + is_ignored BOOLEAN DEFAULT 0, + last_seen_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE storage_media ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + media_type TEXT, + identifier TEXT UNIQUE, + generation_tier TEXT, + capacity BIGINT, + bytes_used BIGINT DEFAULT 0, + location TEXT, + status TEXT DEFAULT 'active', + extra_config TEXT, + priority_index INTEGER DEFAULT 0, + last_seen DATETIME, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE file_versions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filesystem_state_id INTEGER, + media_id INTEGER, + file_number TEXT, + offset_in_tar INTEGER, + is_split BOOLEAN DEFAULT 0, + split_id TEXT, + offset_start BIGINT DEFAULT 0, + offset_end BIGINT DEFAULT 0, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id), + FOREIGN KEY(media_id) REFERENCES storage_media(id) +); + +CREATE TABLE tracked_sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + path TEXT UNIQUE, + is_directory BOOLEAN DEFAULT 1, + action TEXT DEFAULT 'include', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE restore_cart ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filesystem_state_id INTEGER, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id) +); + +CREATE TABLE jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_type TEXT, + status TEXT DEFAULT 'PENDING', + progress FLOAT DEFAULT 0.0, + current_task TEXT, + error_message TEXT, + started_at DATETIME, + completed_at DATETIME, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE system_settings ( + key TEXT PRIMARY KEY, + value TEXT, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP +); + +-- FTS5 Virtual Table for Instant Search +CREATE VIRTUAL TABLE filesystem_fts USING fts5( + file_path, + content='filesystem_state', + content_rowid='id' +); + +-- Trigger to keep FTS5 synchronized with real state +CREATE TRIGGER filesystem_fts_insert AFTER INSERT ON filesystem_state BEGIN + INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path); +END; + +CREATE TRIGGER filesystem_fts_delete AFTER DELETE ON filesystem_state BEGIN + INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path); +END; + +CREATE TRIGGER filesystem_fts_update AFTER UPDATE ON filesystem_state BEGIN + INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path); + INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path); +END; diff --git a/backend/app/providers/cloud.py b/backend/app/providers/cloud.py index 60305fd..a58038e 100644 --- a/backend/app/providers/cloud.py +++ b/backend/app/providers/cloud.py @@ -1,3 +1,4 @@ +import hashlib import boto3 import os import io @@ -10,6 +11,8 @@ from Crypto.Cipher import AES from Crypto.Protocol.KDF import PBKDF2 from Crypto.Hash import SHA256 +from app.core.config import settings + class CloudStorageProvider(AbstractStorageProvider): provider_id = "s3_compat" @@ -48,6 +51,12 @@ class CloudStorageProvider(AbstractStorageProvider): "title": "Client-Side Encryption Passphrase", "description": "Used to encrypt data locally before uploading via AES-256-GCM.", }, + "obfuscate_filenames": { + "type": "boolean", + "title": "Obfuscate Filenames", + "description": "Store files as SHA-256 hashes in the cloud to hide metadata.", + "default": False, + }, } def __init__(self, config: Dict[str, Any]): @@ -55,9 +64,12 @@ class CloudStorageProvider(AbstractStorageProvider): self.bucket_name = config.get("bucket_name") self.region = config.get("region", "us-east-1") self.endpoint_url = config.get("endpoint_url") + self.obfuscate = config.get("obfuscate_filenames", False) - # Local Encryption Settings - self.passphrase = config.get("encryption_passphrase") + # Local Encryption Settings: Use provided or global default + self.passphrase = ( + config.get("encryption_passphrase") or settings.encryption_passphrase + ) # Credentials access_key = config.get("access_key") @@ -80,6 +92,16 @@ class CloudStorageProvider(AbstractStorageProvider): self.passphrase, salt, dkLen=32, count=100000, hmac_hash_module=SHA256 ) + def _get_obfuscated_key(self, prefix: str, path: str) -> str: + """Returns a hashed version of the filename if obfuscation is enabled.""" + if not self.obfuscate: + return f"{prefix}/{path.lstrip('./')}" + + # We hash the path to hide metadata while keeping it deterministic + hashed = hashlib.sha256(path.encode("utf-8")).hexdigest() + # Use two-level sharding to prevent S3 prefix performance issues with 100k+ files + return f"{prefix}/{hashed[:2]}/{hashed[2:4]}/{hashed}" + def get_name(self) -> str: return f"Cloud ({self.provider_type})" @@ -122,7 +144,7 @@ class CloudStorageProvider(AbstractStorageProvider): try: self.s3.head_bucket(Bucket=self.bucket_name) paginator = self.s3.get_paginator("list_objects_v2") - for page in paginator.paginate(Bucket=self.bucket_name, Prefix="archives/"): + for page in paginator.paginate(Bucket=self.bucket_name): if "Contents" in page: objects = [{"Key": obj["Key"]} for obj in page["Contents"]] self.s3.delete_objects( @@ -148,7 +170,9 @@ class CloudStorageProvider(AbstractStorageProvider): """ import uuid - object_key = f"archives/{uuid.uuid4().hex}.tar" + # Archives are always obfuscated by UUID for privacy and collision avoidance + raw_key = f"archives/{uuid.uuid4().hex}.tar" + object_key = self._get_obfuscated_key("archives", raw_key) if self.passphrase: logger.info( @@ -166,7 +190,6 @@ class CloudStorageProvider(AbstractStorageProvider): ciphertext, tag = cipher.encrypt_and_digest(data) # 3. Concatenate and upload - # We store everything needed for decryption in the blob itself for maximum portability payload = salt + nonce + tag + ciphertext try: @@ -174,7 +197,10 @@ class CloudStorageProvider(AbstractStorageProvider): Bucket=self.bucket_name, Key=object_key, Body=payload, - Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"}, + Metadata={ + "x-amz-meta-tapehoard-encrypted": "v2-gcm", + "x-amz-meta-tapehoard-type": "archive", + }, ) return object_key except Exception as e: @@ -193,10 +219,9 @@ class CloudStorageProvider(AbstractStorageProvider): self, media_id: str, relative_path: str, stream: BinaryIO ) -> str: """ - Uploads a single file directly to the cloud, maintaining structure. + Uploads a single file directly to the cloud. """ - clean_path = relative_path.lstrip("./") - object_key = f"objects/{clean_path}" + object_key = self._get_obfuscated_key("objects", relative_path) if self.passphrase: logger.info( @@ -221,7 +246,10 @@ class CloudStorageProvider(AbstractStorageProvider): Bucket=self.bucket_name, Key=object_key, Body=payload, - Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"}, + Metadata={ + "x-amz-meta-tapehoard-encrypted": "v2-gcm", + "x-amz-meta-tapehoard-type": "object", + }, ) return object_key except Exception as e: diff --git a/backend/app/services/archiver.py b/backend/app/services/archiver.py index 1dd863a..ce8f633 100644 --- a/backend/app/services/archiver.py +++ b/backend/app/services/archiver.py @@ -145,7 +145,6 @@ class ArchiverService: models.FilesystemState.id == coverage_subquery.c.filesystem_state_id, ) .filter( - models.FilesystemState.is_indexed, not_(models.FilesystemState.is_ignored), (coverage_subquery.c.covered_bytes.is_(None)) | (coverage_subquery.c.covered_bytes < models.FilesystemState.size), @@ -723,11 +722,7 @@ class ArchiverService: media_record.identifier, archive_id ) - is_tar = ( - not provider.capabilities.get("supports_random_access") - or str(archive_id).endswith(".tar") - or str(archive_id).isdigit() - ) + is_tar = self._test_is_tar_logic(provider, media_record, archive_id) if not is_tar: # Format Negotiation: Direct file recovery @@ -778,7 +773,12 @@ class ArchiverService: ) continue - tar_mode = "r|*" if media_record.media_type == "tape" else "r:*" + tar_mode = ( + "r|*" + if media_record.media_type + in ["tape", "lto_tape", "cloud", "s3_compat"] + else "r:*" + ) with tarfile.open(fileobj=bitstream, mode=tar_mode) as tar_bundle: normalized_map = {} @@ -854,5 +854,26 @@ class ArchiverService: logger.exception(f"Restore failed: {e}") JobManager.fail_job(job_id, str(e)) + def _test_is_tar_logic(self, provider, media_record, archive_id: str) -> bool: + """Internal logic to determine if a location_id refers to a tarball vs a native file.""" + # 1. If provider doesn't support random access (Tape), it's ALWAYS a tar stream. + if not provider.capabilities.get("supports_random_access"): + return True + + # 2. Explicit extensions or prefixes + if str(archive_id).endswith(".tar") or "archives/" in str(archive_id): + return True + + # 3. Tape file numbers (if they happen to be passed here for cloud, which is rare) + if str(archive_id).isdigit() and media_record.media_type in [ + "tape", + "lto_tape", + ]: + return True + + # 4. Otherwise, if it has random access (Cloud/HDD) and isn't explicitly an archive, + # it's a native file. + return False + archiver_manager = ArchiverService() diff --git a/backend/app/services/scanner.py b/backend/app/services/scanner.py index 80bc765..bd9ffe9 100644 --- a/backend/app/services/scanner.py +++ b/backend/app/services/scanner.py @@ -29,55 +29,83 @@ class JobManager: @staticmethod def start_job(job_id: int): """Marks a job as running and sets the start timestamp.""" + from sqlalchemy.orm.exc import StaleDataError + with SessionLocal() as db_session: - job_record = db_session.get(models.Job, job_id) - if job_record: - job_record.status = "RUNNING" - job_record.started_at = datetime.now(timezone.utc) - db_session.commit() + try: + job_record = db_session.get(models.Job, job_id) + if job_record: + job_record.status = "RUNNING" + job_record.started_at = datetime.now(timezone.utc) + db_session.commit() + except StaleDataError: + db_session.rollback() + logger.debug( + f"Job {job_id} already modified or deleted (StaleDataError)." + ) @staticmethod def update_job(job_id: int, progress: float, current_task: str): """Updates the progress and current task description for a job.""" + from sqlalchemy.orm.exc import StaleDataError + with SessionLocal() as db_session: - job_record = db_session.get(models.Job, job_id) - if job_record: - job_record.progress = progress - job_record.current_task = current_task - db_session.commit() + try: + job_record = db_session.get(models.Job, job_id) + if job_record: + job_record.progress = progress + job_record.current_task = current_task + db_session.commit() + except StaleDataError: + db_session.rollback() @staticmethod def complete_job(job_id: int): """Marks a job as successfully completed.""" + from sqlalchemy.orm.exc import StaleDataError + with SessionLocal() as db_session: - job_record = db_session.get(models.Job, job_id) - if job_record: - job_record.status = "COMPLETED" - job_record.progress = 100.0 - job_record.completed_at = datetime.now(timezone.utc) - db_session.commit() + try: + job_record = db_session.get(models.Job, job_id) + if job_record: + job_record.status = "COMPLETED" + job_record.progress = 100.0 + job_record.completed_at = datetime.now(timezone.utc) + db_session.commit() + except StaleDataError: + db_session.rollback() @staticmethod def fail_job(job_id: int, error_message: str): """Marks a job as failed and records the error message.""" + from sqlalchemy.orm.exc import StaleDataError + with SessionLocal() as db_session: - job_record = db_session.get(models.Job, job_id) - if job_record: - job_record.status = "FAILED" - job_record.error_message = error_message - job_record.completed_at = datetime.now(timezone.utc) - db_session.commit() + try: + job_record = db_session.get(models.Job, job_id) + if job_record: + job_record.status = "FAILED" + job_record.error_message = error_message + job_record.completed_at = datetime.now(timezone.utc) + db_session.commit() + except StaleDataError: + db_session.rollback() @staticmethod def cancel_job(job_id: int): """Submits a cancellation request for a pending or running job.""" + from sqlalchemy.orm.exc import StaleDataError + with SessionLocal() as db_session: - job_record = db_session.get(models.Job, job_id) - if job_record and job_record.status in ["PENDING", "RUNNING"]: - job_record.status = "FAILED" - job_record.error_message = "Cancelled by user" - job_record.completed_at = datetime.now(timezone.utc) - db_session.commit() + try: + job_record = db_session.get(models.Job, job_id) + if job_record and job_record.status in ["PENDING", "RUNNING"]: + job_record.status = "FAILED" + job_record.error_message = "Cancelled by user" + job_record.completed_at = datetime.now(timezone.utc) + db_session.commit() + except StaleDataError: + db_session.rollback() @staticmethod def is_cancelled(job_id: int) -> bool: @@ -354,7 +382,6 @@ class ScannerService: mtime=file_meta["mtime"], is_ignored=file_meta["ignored"], last_seen_timestamp=timestamp, - is_indexed=False, ) ) else: @@ -363,7 +390,7 @@ class ScannerService: or record.mtime != file_meta["mtime"] ) if metadata_changed: - record.is_indexed = False + record.sha256_hash = None with self._metrics_lock: self.files_modified += 1 @@ -396,7 +423,7 @@ class ScannerService: total_pending = ( db_session.query(models.FilesystemState) .filter( - models.FilesystemState.is_indexed.is_(False), + models.FilesystemState.sha256_hash.is_(None), models.FilesystemState.is_ignored.is_(False), ) .count() @@ -408,7 +435,7 @@ class ScannerService: hashing_targets = ( db_session.query(models.FilesystemState) .filter( - models.FilesystemState.is_indexed.is_(False), + models.FilesystemState.sha256_hash.is_(None), models.FilesystemState.is_ignored.is_(False), ) .limit(100) @@ -422,7 +449,7 @@ class ScannerService: total_pending = ( db_session.query(models.FilesystemState) .filter( - models.FilesystemState.is_indexed.is_(False), + models.FilesystemState.sha256_hash.is_(None), models.FilesystemState.is_ignored.is_(False), ) .count() @@ -451,7 +478,6 @@ class ScannerService: if computed_hash: target_record.sha256_hash = computed_hash - target_record.is_indexed = True self.files_hashed += 1 if self.files_hashed % 5 == 0: diff --git a/backend/tests/test_api_inventory.py b/backend/tests/test_api_inventory.py index a0f426a..292f780 100644 --- a/backend/tests/test_api_inventory.py +++ b/backend/tests/test_api_inventory.py @@ -66,7 +66,6 @@ def test_get_insights(client, db_session): file_path="/source/f1.txt", size=100, mtime=1000, - is_indexed=True, sha256_hash="hash1", ) db_session.add(file1) @@ -127,7 +126,6 @@ def test_search_index(client, db_session): file_path="data/important.doc", size=500, mtime=2000, - is_indexed=True, sha256_hash="hash", ) db_session.add(file1) diff --git a/backend/tests/test_api_system.py b/backend/tests/test_api_system.py index 3564c74..dd08554 100644 --- a/backend/tests/test_api_system.py +++ b/backend/tests/test_api_system.py @@ -21,7 +21,6 @@ def test_get_dashboard_stats_populated(client, db_session): size=1024, mtime=datetime.now(timezone.utc).timestamp(), is_ignored=False, - is_indexed=True, ) db_session.add(file_state) db_session.commit() diff --git a/backend/tests/test_core_utils.py b/backend/tests/test_core_utils.py new file mode 100644 index 0000000..e66b3f3 --- /dev/null +++ b/backend/tests/test_core_utils.py @@ -0,0 +1,44 @@ +from app.core.utils import get_path_uuid + + +def test_get_path_uuid_macos(mocker): + """Verifies UUID extraction from diskutil on macOS.""" + mocker.patch("os.path.exists", return_value=True) + mocker.patch("sys.platform", "darwin") + + mock_run = mocker.patch("subprocess.run") + mock_res = mocker.MagicMock() + mock_res.stdout = """ + Device Identifier: disk4s1 + Device Node: /dev/disk4s1 + Volume Name: MyBackups + Volume UUID: ABCDEF-1234-5678-90AB-CDEF12345678 + """ + mock_run.return_value = mock_res + + uuid = get_path_uuid("/Volumes/MyBackups") + assert uuid == "ABCDEF-1234-5678-90AB-CDEF12345678" + mock_run.assert_called_with( + ["diskutil", "info", "/Volumes/MyBackups"], capture_output=True, text=True + ) + + +def test_get_path_uuid_linux(mocker): + """Verifies UUID extraction from lsblk on Linux.""" + mocker.patch("os.path.exists", return_value=True) + mocker.patch("sys.platform", "linux") + + mock_run = mocker.patch("subprocess.run") + + def mock_subprocess(cmd, **kwargs): + m = mocker.MagicMock() + if "df" in cmd: + m.stdout = "Filesystem\n/dev/sdb1" + elif "lsblk" in cmd: + m.stdout = "98765432-ABCD-EF01-2345-6789ABCDEF01" + return m + + mock_run.side_effect = mock_subprocess + + uuid = get_path_uuid("/mnt/hdd") + assert uuid == "98765432-ABCD-EF01-2345-6789ABCDEF01" diff --git a/backend/tests/test_provider_cloud.py b/backend/tests/test_provider_cloud.py new file mode 100644 index 0000000..0aa407d --- /dev/null +++ b/backend/tests/test_provider_cloud.py @@ -0,0 +1,65 @@ +import hashlib +import pytest +from app.providers.cloud import CloudStorageProvider + + +def test_cloud_provider_obfuscation_logic(): + """Verifies that filename hashing and sharding works as expected.""" + + # CASE 1: Obfuscation Disabled + config_plain = { + "bucket_name": "test-bucket", + "obfuscate_filenames": False, + "access_key": "fake", + "secret_key": "fake", + } + provider_plain = CloudStorageProvider(config_plain) + path = "documents/secret_plan.pdf" + + # Expectation: Key is exactly the path with prefix + key_plain = provider_plain._get_obfuscated_key("objects", path) + assert key_plain == "objects/documents/secret_plan.pdf" + + # CASE 2: Obfuscation Enabled + config_hidden = { + "bucket_name": "test-bucket", + "obfuscate_filenames": True, + "access_key": "fake", + "secret_key": "fake", + } + provider_hidden = CloudStorageProvider(config_hidden) + + # Expectation: Key is hashed and sharded + # hash of "documents/secret_plan.pdf" + expected_hash = hashlib.sha256(path.encode("utf-8")).hexdigest() + expected_prefix = f"objects/{expected_hash[:2]}/{expected_hash[2:4]}" + + key_hidden = provider_hidden._get_obfuscated_key("objects", path) + + assert key_hidden.startswith("objects/") + assert key_hidden == f"{expected_prefix}/{expected_hash}" + assert "secret_plan.pdf" not in key_hidden + + +def test_cloud_secret_fallback(mocker): + """Verifies that the provider prioritizes local config over global settings for passphrases.""" + from app.core.config import settings + + # Mock global settings + mocker.patch.object(settings, "encryption_passphrase", "global-fallback") + + # CASE 1: Local config provides passphrase + config_local = {"bucket_name": "b", "encryption_passphrase": "local-override"} + provider_local = CloudStorageProvider(config_local) + assert provider_local.passphrase == "local-override" + + # CASE 2: Local config is empty, should fallback to global + config_empty = {"bucket_name": "b"} + provider_fallback = CloudStorageProvider(config_empty) + assert provider_fallback.passphrase == "global-fallback" + + # CASE 3: No passphrase anywhere (ValueError on key derivation) + mocker.patch.object(settings, "encryption_passphrase", "") + provider_none = CloudStorageProvider({"bucket_name": "b"}) + with pytest.raises(ValueError, match="No encryption passphrase configured"): + provider_none._derive_key(b"salt") diff --git a/backend/tests/test_provider_hdd.py b/backend/tests/test_provider_hdd.py new file mode 100644 index 0000000..d28857f --- /dev/null +++ b/backend/tests/test_provider_hdd.py @@ -0,0 +1,99 @@ +import io +import pytest +from app.providers.hdd import OfflineHDDProvider + + +def test_hdd_initialization(tmp_path): + """Verifies that HDD provider correctly prepares the disk structure.""" + mount = tmp_path / "mnt_hdd" + provider = OfflineHDDProvider({"mount_path": str(mount)}) + + success = provider.initialize_media("DISK01") + assert success is True + assert (mount / ".tapehoard_id").exists() + assert (mount / ".tapehoard_id").read_text().strip() == "DISK01" + assert (mount / "tapehoard_backups" / "archives").is_dir() + + +def test_hdd_identification(tmp_path): + """Verifies that HDD provider can identify a disk by its ID file.""" + mount = tmp_path / "mnt_hdd" + mount.mkdir() + (mount / ".tapehoard_id").write_text("DISK02") + + provider = OfflineHDDProvider({"mount_path": str(mount)}) + assert provider.identify_media() == "DISK02" + + +def test_hdd_write_sequential_logic(tmp_path): + """Verifies that archives are numbered and padded correctly.""" + mount = tmp_path / "mnt_hdd" + provider = OfflineHDDProvider({"mount_path": str(mount)}) + provider.initialize_media("DISK03") + + # Write first archive + loc1 = provider.write_archive("DISK03", io.BytesIO(b"archive1")) + assert loc1 == "0" + assert (mount / "tapehoard_backups" / "archives" / "000000.tar").exists() + + # Write second archive + loc2 = provider.write_archive("DISK03", io.BytesIO(b"archive2")) + assert loc2 == "1" + assert (mount / "tapehoard_backups" / "archives" / "000001.tar").exists() + + # Read them back + with provider.read_archive("DISK03", "0") as f: + assert f.read() == b"archive1" + with provider.read_archive("DISK03", "1") as f: + assert f.read() == b"archive2" + + +def test_hdd_direct_write_and_traversal_guard(tmp_path): + """Verifies direct file copies and security guards.""" + mount = tmp_path / "mnt_hdd" + provider = OfflineHDDProvider({"mount_path": str(mount)}) + + # Valid direct write + rel_path = "photos/holiday.jpg" + loc = provider.write_file_direct("DISK04", rel_path, io.BytesIO(b"imagedata")) + assert loc == rel_path + + target = mount / "tapehoard_backups" / "objects" / rel_path + assert target.exists() + assert target.read_bytes() == b"imagedata" + + # Read back (Format Negotiation) + with provider.read_archive("DISK04", rel_path) as f: + assert f.read() == b"imagedata" + + # Traversal Attempt + with pytest.raises(ValueError, match="Invalid relative path"): + provider.write_file_direct( + "DISK04", "escape/../../secret.txt", io.BytesIO(b"bad") + ) + + +def test_hdd_online_check_with_uuid(tmp_path, mocker): + """Verifies that online check validates both path and UUID.""" + mount = tmp_path / "mnt_hdd" + mount.mkdir() + + # Mock UUID utility + mock_get_uuid = mocker.patch("app.core.utils.get_path_uuid") + mock_get_uuid.return_value = "UUID-123" + + # CASE 1: Correct UUID + provider_ok = OfflineHDDProvider( + {"mount_path": str(mount), "device_uuid": "UUID-123"} + ) + assert provider_ok.check_online() is True + + # CASE 2: Mismatched UUID + provider_fail = OfflineHDDProvider( + {"mount_path": str(mount), "device_uuid": "UUID-999"} + ) + assert provider_fail.check_online() is False + + # CASE 3: No UUID (Path only) + provider_path_only = OfflineHDDProvider({"mount_path": str(mount)}) + assert provider_path_only.check_online() is True diff --git a/backend/tests/test_provider_tape.py b/backend/tests/test_provider_tape.py new file mode 100644 index 0000000..5c974a1 --- /dev/null +++ b/backend/tests/test_provider_tape.py @@ -0,0 +1,122 @@ +from app.providers.tape import LTOProvider + + +def test_lto_compression_control(mocker): + """Verifies that LTOProvider sends the correct mt commands for compression.""" + + # Mock subprocess.run and os.path.exists + mock_run = mocker.patch("subprocess.run") + mocker.patch("os.path.exists", return_value=True) + + # CASE 1: Compression Enabled + provider_on = LTOProvider({"device_path": "/dev/nst0", "compression": True}) + provider_on._setup_compression() + + # Expectation: mt compression 1 + mock_run.assert_called_with( + ["mt", "-f", "/dev/nst0", "compression", "1"], check=True, capture_output=True + ) + + # CASE 2: Compression Disabled + mock_run.reset_mock() + provider_off = LTOProvider({"device_path": "/dev/nst0", "compression": False}) + provider_off._setup_compression() + + # Expectation: mt compression 0 + mock_run.assert_called_with( + ["mt", "-f", "/dev/nst0", "compression", "0"], check=True, capture_output=True + ) + + +def test_lto_insertion_detection(mocker): + """Verifies that needs_registration triggers only on new insertion.""" + + device = "/dev/nst0" + # 1. Start with Empty State + LTOProvider._lkg_state = { + device: {"drive": {}, "mam": {}, "online": False, "last_check": 0.0} + } + provider = LTOProvider({"device_path": device}) + + # Mock OS path existence + mocker.patch("os.path.exists", return_value=True) + + # Mock subprocess.run to return "READY" for status and valid MAM attributes + def mock_subprocess(cmd, **kwargs): + m = mocker.MagicMock() + m.returncode = 0 + if "status" in cmd: + m.stdout = "READY" + elif "sg_read_attr" in cmd: + # Mock raw MAM data bytes (minimal valid structure) + # barcode "TAPE01" is at the end of the mapping usually + # But we patch get_mam_info to be easier + pass + return m + + mocker.patch("subprocess.run", side_effect=mock_subprocess) + + # We patch get_mam_info because parsing raw bytes in a test is overkill + mocker.patch.object(provider, "get_mam_info", return_value={"barcode": "TAPE01"}) + mocker.patch.object(provider, "get_drive_info", return_value={"vendor": "HP"}) + + # Execution: First poll after "insertion" (transition from offline LKG to online) + info = provider.get_live_info(force=True) + + # EXPECTATION: needs_registration should be True + assert info["online"] is True + assert info["identity"] == "TAPE01" + assert info["needs_registration"] is True + + # 2. Second poll (already online) + # LKG state is now updated by the provider during the first poll + info_second = provider.get_live_info(force=True) + + # EXPECTATION: needs_registration should be False (already saw this tape) + assert info_second["needs_registration"] is False + + +def test_lto_mam_parsing_logic(mocker): + """Verifies the parsing of raw SCSI attribute bytes into human-readable metadata.""" + import struct + + device = "/dev/nst0" + provider = LTOProvider({"device_path": device}) + mocker.patch("os.path.exists", return_value=True) + + # 1. CONSTRUCT RAW PAYLOAD + # SCSI MAM raw format: [TotalLen(4)] + { [ID(2)][Flags(1)][Len(2)][Value] }... + + def pack_attr(attr_id, value_bytes): + return struct.pack(">HBH", attr_id, 0, len(value_bytes)) + value_bytes + + # Remaining Capacity (0x0000): 500GB (512000 MiB) + attr_rem = pack_attr(0x0000, (512000).to_bytes(8, "big")) + # Max Capacity (0x0001): 1.5TB (1536000 MiB - LTO-5 raw) + attr_max = pack_attr(0x0001, (1536000).to_bytes(8, "big")) + # Tape Alert Flags (0x0002): Bit 20 is "Clean Now" + # Bit 20 from left (64-bit int) -> (1 << (64-20)) + alert_flags = 1 << (64 - 20) + attr_alerts = pack_attr(0x0002, alert_flags.to_bytes(8, "big")) + # Barcode (0x0806): "TAPE123" + attr_barcode = pack_attr(0x0806, b"TAPE123\x00\x00") # null padded + + payload_body = attr_rem + attr_max + attr_alerts + attr_barcode + full_payload = struct.pack(">I", len(payload_body)) + payload_body + + # 2. MOCK SUBPROCESS + mock_res = mocker.MagicMock() + mock_res.returncode = 0 + mock_res.stdout = full_payload + mocker.patch("subprocess.run", return_value=mock_res) + + # 3. EXECUTION + mam = provider.get_mam_info(force=True) + + # 4. EXPECTATIONS + assert mam["remaining_capacity_mib"] == 512000 + assert mam["max_capacity_mib"] == 1536000 + assert mam["barcode"] == "TAPE123" + assert "Clean Now" in mam["alerts"] + # 1.5TB should be identified as LTO-5 + assert mam["generation_label"] == "LTO-5" diff --git a/backend/tests/test_service_archiver.py b/backend/tests/test_service_archiver.py index a15090d..a6b38ac 100644 --- a/backend/tests/test_service_archiver.py +++ b/backend/tests/test_service_archiver.py @@ -1,5 +1,6 @@ import tarfile import io +import pytest from app.services.archiver import ArchiverService, RangeFile from app.db import models @@ -24,17 +25,11 @@ def test_get_unbacked_files(db_session): archiver = ArchiverService() # File 1: Completely unbacked - f1 = models.FilesystemState( - file_path="/data/new.txt", size=100, mtime=1, is_indexed=True - ) + f1 = models.FilesystemState(file_path="/data/new.txt", size=100, mtime=1) # File 2: Partially backed (split) - f2 = models.FilesystemState( - file_path="/data/split.bin", size=1000, mtime=1, is_indexed=True - ) + f2 = models.FilesystemState(file_path="/data/split.bin", size=1000, mtime=1) # File 3: Fully backed - f3 = models.FilesystemState( - file_path="/data/done.txt", size=50, mtime=1, is_indexed=True - ) + f3 = models.FilesystemState(file_path="/data/done.txt", size=50, mtime=1) db_session.add_all([f1, f2, f3]) db_session.flush() @@ -89,18 +84,12 @@ def test_assemble_backup_batch(db_session): # Create files size_200mb = 200 * 1024 * 1024 - f1 = models.FilesystemState( - file_path="/f1.bin", size=size_200mb, mtime=1, is_indexed=True - ) + f1 = models.FilesystemState(file_path="/f1.bin", size=size_200mb, mtime=1) # f2 is 200MB, would fit on fresh tape. Should be skipped on this 300MB tape # after f1 (200MB) is added, because only 100MB is left. - f2 = models.FilesystemState( - file_path="/f2.bin", size=size_200mb, mtime=1, is_indexed=True - ) + f2 = models.FilesystemState(file_path="/f2.bin", size=size_200mb, mtime=1) # f3 is 500MB, larger than total capacity (300MB). SHOULD be split. - f3 = models.FilesystemState( - file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1, is_indexed=True - ) + f3 = models.FilesystemState(file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1) db_session.add_all([f1, f2, f3]) db_session.commit() @@ -138,7 +127,6 @@ def test_run_backup_mocked(db_session, mocker, tmp_path): file_path=str(source_file), size=source_file.stat().st_size, mtime=1, - is_indexed=True, sha256_hash="hash1", ) db_session.add(f1) @@ -164,14 +152,227 @@ def test_run_backup_mocked(db_session, mocker, tmp_path): db_session.expire_all() assert media.bytes_used > 0 - # Verify FileVersion was recorded + +def test_archiver_saturated_media_logic(db_session, mocker, tmp_path): + """Verifies that media is marked full and priority ceded based on hardware feedback.""" + staging = tmp_path / "staging" + staging.mkdir() + archiver = ArchiverService(staging_directory=str(staging)) + + # Setup Media (HDD for simple mocking) + media = models.StorageMedia( + media_type="hdd", + identifier="FULL_DISK", + capacity=1000, + status="active", + bytes_used=0, + priority_index=1, + ) + db_session.add(media) + + # Other media to check priority reordering + media2 = models.StorageMedia( + media_type="hdd", + identifier="NEXT_DISK", + capacity=1000, + status="active", + bytes_used=0, + priority_index=2, + ) + db_session.add(media2) + + # Setup a small file to trigger the loop + source_file = tmp_path / "small.txt" + source_file.write_bytes(b"data") + f1 = models.FilesystemState( + file_path=str(source_file), + size=4, + mtime=1, + sha256_hash="hash_small", + ) + db_session.add(f1) + db_session.commit() + + # Mock Provider to report 99% utilization + mock_provider = mocker.MagicMock() + mock_provider.capabilities = {"supports_random_access": True} + mock_provider.identify_media.return_value = "FULL_DISK" + mock_provider.prepare_for_write.return_value = True + # Ensure write_file_direct returns a string, not a MagicMock + mock_provider.write_file_direct.return_value = "LOC_1" + # FORCE hardware utilization report to 99% + mock_provider.get_utilization.return_value = 0.99 + + mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider) + + from app.services.scanner import JobManager + + job = JobManager.create_job(db_session, "BACKUP") + + # Run archival + archiver.run_backup(db_session, media.id, job.id) + + # EXPECTATION: + # 1. Media should be marked "full" + # 2. Priority should be moved to the end (higher than media2) + db_session.expire_all() + assert media.status == "full" + assert media.priority_index > media2.priority_index + + +def test_archiver_chunking_logic(db_session, mocker, tmp_path): + """Verifies that large backup batches are split into appropriate chunks.""" + staging = tmp_path / "staging" + staging.mkdir() + archiver = ArchiverService(staging_directory=str(staging)) + + # Setup Media (Capacity 100GB, target chunk size ~1GB) + media = models.StorageMedia( + media_type="tape", + identifier="TAPE_001", + capacity=100 * 1024 * 1024 * 1024, + status="active", + bytes_used=0, + ) + db_session.add(media) + + # 1. Add 10 small files (100MB each) -> Should fit in one 1GB chunk + for i in range(10): + source_file = tmp_path / f"small_{i}.bin" + source_file.write_bytes(b"0" * (100 * 1024 * 1024)) + f = models.FilesystemState( + file_path=str(source_file), + size=100 * 1024 * 1024, + mtime=1, + sha256_hash=f"hash_s_{i}", + ) + db_session.add(f) + + # 2. Add 1 large file (5GB) -> Exceeds chunk size, should trigger its own chunk + large_file = tmp_path / "large.bin" + large_file.write_bytes(b"0" * 10) # Mock small content for large size metadata + f_large = models.FilesystemState( + file_path=str(large_file), + size=5 * 1024 * 1024 * 1024, + mtime=1, + sha256_hash="hash_large", + ) + # Monkeypatch stat size for test efficiency (don't actually write 5GB to tmp) + mocker.patch("os.path.getsize", return_value=5 * 1024 * 1024 * 1024) + # Also patch RangeFile to avoid actual reads + mocker.patch( + "app.services.archiver.RangeFile.__enter__", return_value=mocker.MagicMock() + ) + mocker.patch("app.services.archiver.RangeFile.__exit__") + + db_session.add(f_large) + db_session.commit() + + # Mock Provider + mock_provider = mocker.MagicMock() + mock_provider.capabilities = {"supports_random_access": False} + mock_provider.identify_media.return_value = "TAPE_001" + mock_provider.prepare_for_write.return_value = True + mock_provider.write_archive.return_value = "1" + + mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider) + + from app.services.scanner import JobManager + + job = JobManager.create_job(db_session, "BACKUP") + + # Run archival + archiver.run_backup(db_session, media.id, job.id) + + # EXPECTATION: + # write_archive should be called TWICE: + # 1. Once for the 10 small files (combined chunk) + # 2. Once for the large file (independent chunk) + assert mock_provider.write_archive.call_count == 2 + + # Verify FileVersion was recorded for the large file version = ( db_session.query(models.FileVersion) - .filter_by(filesystem_state_id=f1.id) + .filter_by(filesystem_state_id=f_large.id) .first() ) assert version is not None - assert version.file_number == "ARCH_1" + assert version.file_number == "1" + + +def test_range_file_alignment_guard(tmp_path): + """Verifies that RangeFile pads truncated files with nulls to maintain tar alignment.""" + from app.services.archiver import RangeFile + + # Create a 50 byte file + f = tmp_path / "truncated.bin" + f.write_bytes(b"A" * 50) + + # Initialize RangeFile expecting 100 bytes + with RangeFile(str(f), offset_start=0, length=100) as rf: + data = rf.read(100) + + # EXPECTATION: + # 1. Total length must be exactly 100 + # 2. First 50 are 'A's + # 3. Last 50 are nulls '\x00' + assert len(data) == 100 + assert data[:50] == b"A" * 50 + assert data[50:] == b"\x00" * 50 + + +def test_path_traversal_protection(): + """Verifies that the restorer blocks path traversal attempts.""" + archiver = ArchiverService() + base = "/restores/my_backup" + + # CASE 1: Valid Path + safe_path = archiver._sanitize_recovery_path(base, "photos/image.jpg") + assert safe_path.startswith(base) + + # CASE 2: Traversal Attempt (Relative) + with pytest.raises(PermissionError, match="Restricted path traversal"): + archiver._sanitize_recovery_path(base, "../../etc/shadow") + + # CASE 3: Traversal Attempt (Absolute-style relative that escapes) + with pytest.raises(PermissionError, match="Restricted path traversal"): + archiver._sanitize_recovery_path(base, "/../../etc/shadow") + + +def test_archiver_restoration_negotiation(db_session, mocker, tmp_path): + """Verifies that the restorer chooses the correct format (Native vs Tar).""" + archiver = ArchiverService() + + # Setup Mocks + mock_media = mocker.MagicMock() + mock_media.media_type = "cloud" + mock_media.identifier = "BUCKET1" + + mock_provider = mocker.MagicMock() + mock_provider.identify_media.return_value = "BUCKET1" + mock_provider.check_online.return_value = True + + # 1. TEST NATIVE NEGOTIATION + # Expectation: random_access=True + non-tar ID -> is_tar=False + mock_provider.capabilities = {"supports_random_access": True} + + # CASE A: Obfuscated object (Native) + is_tar_native = archiver._test_is_tar_logic( + mock_provider, mock_media, "objects/a1/b2/hash" + ) + assert is_tar_native is False + + # CASE B: Obfuscated archive (Tar) + is_tar_cloud_archive = archiver._test_is_tar_logic( + mock_provider, mock_media, "archives/a1/b2/hash" + ) + assert is_tar_cloud_archive is True + + # CASE C: Tape (Always Tar) + mock_media.media_type = "tape" + mock_provider.capabilities = {"supports_random_access": False} + is_tar_tape = archiver._test_is_tar_logic(mock_provider, mock_media, "1") + assert is_tar_tape is True def test_run_restore_mocked(db_session, mocker, tmp_path): @@ -206,6 +407,9 @@ def test_run_restore_mocked(db_session, mocker, tmp_path): # Mock Provider & Tar Stream mock_provider = mocker.MagicMock() mock_provider.identify_media.return_value = "RESTORE_ME" + mock_provider.check_online.return_value = True + # FORCE tar path by disabling random access + mock_provider.capabilities = {"supports_random_access": False} # Create a mock tar stream containing the file buf = io.BytesIO() diff --git a/backend/tests/test_service_scanner.py b/backend/tests/test_service_scanner.py index 3eea095..cd9606f 100644 --- a/backend/tests/test_service_scanner.py +++ b/backend/tests/test_service_scanner.py @@ -91,7 +91,7 @@ def test_metadata_update_on_change(db_session): # Initial state f1 = models.FilesystemState( - file_path="/data/up.txt", size=50, mtime=1, is_indexed=True, sha256_hash="old" + file_path="/data/up.txt", size=50, mtime=1, sha256_hash="old" ) db_session.add(f1) db_session.commit() @@ -103,7 +103,7 @@ def test_metadata_update_on_change(db_session): db_session.refresh(f1) assert f1.size == 999 - assert f1.is_indexed is False # Should be reset for re-hashing + assert f1.sha256_hash is None # Should be reset for re-hashing def test_scan_sources_mocked(db_session, mocker): @@ -141,7 +141,7 @@ def test_run_hashing_mocked(db_session, mocker): # Setup unindexed file f = models.FilesystemState( - file_path="/data/hash.me", size=10, mtime=1, is_indexed=False, is_ignored=False + file_path="/data/hash.me", size=10, mtime=1, is_ignored=False ) db_session.add(f) db_session.commit() @@ -154,5 +154,4 @@ def test_run_hashing_mocked(db_session, mocker): scanner.run_hashing() db_session.refresh(f) - assert f.is_indexed is True assert f.sha256_hash == "mocked_hash" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index c4a6c67..85b1ad6 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,7 +15,11 @@ services: # devices: # - /dev/nst0:/dev/nst0 environment: - - PUID=1000 - - PGID=1000 + - PUID=0 + - PGID=0 + - RUN_AS_ROOT=true - TZ=UTC + user: "0:0" + cap_add: + - SYS_RAWIO restart: unless-stopped diff --git a/frontend/src/lib/api/types.gen.ts b/frontend/src/lib/api/types.gen.ts index edb1f90..d474671 100644 --- a/frontend/src/lib/api/types.gen.ts +++ b/frontend/src/lib/api/types.gen.ts @@ -270,10 +270,7 @@ export type ItemMetadataSchema = { * Sha256 Hash */ sha256_hash?: string | null; - /** - * Is Indexed - */ - is_indexed?: boolean; + /** * Is Ignored */ diff --git a/justfile b/justfile index b2690fb..7fbcc15 100644 --- a/justfile +++ b/justfile @@ -37,7 +37,7 @@ lint: # Run all backend tests pytest: @echo "Running backend tests..." - cd backend && uv run pytest + cd backend && COVERAGE_CORE=sysmon uv run pytest # Run frontend checks check: diff --git a/tapehoard.service b/tapehoard.service new file mode 100644 index 0000000..20355ce --- /dev/null +++ b/tapehoard.service @@ -0,0 +1,17 @@ +[Unit] +Description=TapeHoard Backup Manager +After=network.target + +[Service] +Type=simple +User=root +WorkingDirectory=/opt/tapehoard +# We use root to ensure direct /dev/nst0 and raw SCSI access +ExecStart=/usr/bin/just dev +Restart=always +RestartSec=10 +# Give the tape drive time to rewind on shutdown +TimeoutStopSec=30 + +[Install] +WantedBy=multi-user.target