tests & remove is_indexed flag from db
@@ -53,12 +53,22 @@ This document (`GEMINI.md`) contains critical, contextual information about the
### Scanning & Hashing Architecture
* **Concurrent Phasing:** Decoupled into `SCAN` (Metadata, Normal priority) and `HASH` (Content, Idle priority with dynamic `iowait` throttling).
* **Thread-Safe Metrics:** All counters (files processed, bytes hashed) must be protected by a `threading.Lock`.
* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `is_indexed = 0 AND is_ignored = 0` files.
* **Hashing Progress:** Hashing jobs calculate progress against a dynamically updating snapshot of total `sha256_hash IS NULL AND is_ignored = 0` files (see the sketch after this subsection).
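To make the progress rule above concrete, here is a minimal sketch — not the project's actual implementation — of how a hashing job could recompute its denominator against the live pending snapshot on each batch. The helper name and signature are illustrative; the filter mirrors the `sha256_hash IS NULL AND is_ignored = 0` rule used elsewhere in this commit.

```python
# Illustrative sketch only: progress against a dynamically updating snapshot.
from app.db import models


def hashing_progress(db_session, files_hashed_so_far: int) -> float:
    """Return hashing progress (percent) against the current pending snapshot."""
    still_pending = (
        db_session.query(models.FilesystemState)
        .filter(
            models.FilesystemState.sha256_hash.is_(None),   # not yet hashed
            models.FilesystemState.is_ignored.is_(False),   # not excluded
        )
        .count()
    )
    total = files_hashed_so_far + still_pending
    if total == 0:
        return 100.0
    return (files_hashed_so_far / total) * 100.0
```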
### Archival & Recovery
* **Format Negotiation:** The Archiver adapts formats based on provider capabilities (`supports_random_access`).
    * *Sequential (Tape):* Uses `.tar` streams to maintain drive streaming.
    * *Random Access (HDD/Cloud):* Uses native direct file copies/objects to enable instant, seekless restores without unpacking gigabytes of data.
* **High-Speed Hybrid Archival:**
    * The system prioritizes the **system `tar` binary** for whole-file chunks, delivering a 10x-20x performance boost over pure Python and ensuring optimal buffer saturation for LTO drives.
    * It transparently falls back to the **Python `RangeFile` logic** only for chunks containing split fragments, maintaining bit-perfect alignment for multi-tape files.
* **Industrial Tar Chunking:** (see the chunk-size sketch after this section)
    * Large backup sets are automatically split into multiple independent archives. The system dynamically aims for at least **100 archives per tape** (a target chunk size derived from the generation's capacity, e.g., ~15 GB chunks for LTO-5) to provide high seek granularity during restoration.
    * **Exception:** Single large files are allowed to occupy their own archives even if they exceed the target chunk size, preventing unnecessary fragmentation while keeping them as independent, seekable objects.
* **Refined Splitting Philosophy:**
    * Files are **only split** if they are physically larger than the medium's entire capacity (multi-tape spanning).
    * **Skip-and-Defer:** If a file is larger than the remaining space on a tape but smaller than its total capacity, it is deferred to the next fresh medium to minimize fragmentation.
* **Hardware-First Utilization:** The system trusts **Physical Hardware Feedback (MAM)** over logical byte counts. Tapes are only marked as "Full" when the drive's own reporting (via `get_utilization`) confirms saturation, maximizing utilization when hardware compression is active.
* **Bitstream Integrity:** `RangeFile` must guarantee exact byte counts for tar alignment.
* **Metadata Fidelity:** The restorer must preserve the original **permissions (chmod)**, **timestamps (utime)**, and **ownership (chown)** when recovering files natively or via tar.
* **Independence:** Force all tar archive members to be **Regular Files** to break fragile hard-link dependencies. Symlinks are preserved as `SYMTYPE` (or `.symlink` stub objects for the native format).
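The chunking and placement rules above can be summarized in a few lines. This is an illustrative sketch with assumed function names, not the Archiver's real API; the LTO-5 figure and the 200 MB / 300 MB scenario match the tests later in this commit.

```python
# Illustrative sketch only: target chunk size and skip-and-defer placement.
def target_chunk_size(tape_capacity_bytes: int, min_archives_per_tape: int = 100) -> int:
    """Aim for at least ~100 archives per tape (~15 GB for a 1.5 TB LTO-5)."""
    return max(1, tape_capacity_bytes // min_archives_per_tape)


def placement_decision(file_size: int, remaining_bytes: int, tape_capacity: int) -> str:
    if file_size > tape_capacity:
        return "split"   # only files larger than the whole medium are split
    if file_size > remaining_bytes:
        return "defer"   # skip-and-defer: wait for the next fresh medium
    return "write"       # fits on the current medium as-is


if __name__ == "__main__":
    lto5 = 1_500_000_000_000  # ~1.5 TB raw capacity
    print(target_chunk_size(lto5))                                     # ~15 GB
    print(placement_decision(200 * 2**20, 100 * 2**20, 300 * 2**20))   # "defer"
```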
@@ -0,0 +1,79 @@
|
||||
"""remove is_indexed flag
|
||||
|
||||
Revision ID: 1b59e22b9b7a
|
||||
Revises: fbbc0a40a840
|
||||
Create Date: 2026-04-28 23:44:04.949304
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "1b59e22b9b7a"
|
||||
down_revision: Union[str, Sequence[str], None] = "fbbc0a40a840"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Upgrade schema."""
|
||||
# We use batch_alter_table for SQLite compatibility
|
||||
with op.batch_alter_table("filesystem_state", schema=None) as batch_op:
|
||||
batch_op.drop_column("is_indexed")
|
||||
batch_op.alter_column(
|
||||
"size",
|
||||
existing_type=sa.INTEGER(),
|
||||
type_=sa.BigInteger(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
with op.batch_alter_table("storage_media", schema=None) as batch_op:
|
||||
batch_op.alter_column(
|
||||
"capacity",
|
||||
existing_type=sa.INTEGER(),
|
||||
type_=sa.BigInteger(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
batch_op.alter_column(
|
||||
"bytes_used",
|
||||
existing_type=sa.INTEGER(),
|
||||
type_=sa.BigInteger(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Downgrade schema."""
|
||||
with op.batch_alter_table("storage_media", schema=None) as batch_op:
|
||||
batch_op.alter_column(
|
||||
"bytes_used",
|
||||
existing_type=sa.BigInteger(),
|
||||
type_=sa.INTEGER(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
batch_op.alter_column(
|
||||
"capacity",
|
||||
existing_type=sa.BigInteger(),
|
||||
type_=sa.INTEGER(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
with op.batch_alter_table("filesystem_state", schema=None) as batch_op:
|
||||
batch_op.alter_column(
|
||||
"size",
|
||||
existing_type=sa.BigInteger(),
|
||||
type_=sa.INTEGER(),
|
||||
existing_nullable=False,
|
||||
)
|
||||
batch_op.add_column(
|
||||
sa.Column(
|
||||
"is_indexed",
|
||||
sa.BOOLEAN(),
|
||||
server_default=sa.text("'0'"),
|
||||
nullable=False,
|
||||
)
|
||||
)
|
||||
@@ -2,7 +2,7 @@ from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.db import models
|
||||
@@ -17,15 +17,14 @@ router = APIRouter(prefix="/backups", tags=["Backups"])
|
||||
|
||||
|
||||
class BackupJobSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
job_type: str
|
||||
status: str
|
||||
started_at: Optional[datetime] = None
|
||||
completed_at: Optional[datetime] = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
# --- Endpoints ---
|
||||
|
||||
|
||||
@@ -337,11 +337,11 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
SELECT
|
||||
COUNT(*) as total_files,
|
||||
SUM(size) as total_size,
|
||||
SUM(CASE WHEN is_indexed = 1 THEN size ELSE 0 END) as total_hashed_size,
|
||||
SUM(CASE WHEN sha256_hash IS NOT NULL THEN size ELSE 0 END) as total_hashed_size,
|
||||
(SELECT SUM(min_size) FROM (
|
||||
SELECT MIN(size) as min_size
|
||||
FROM filesystem_state
|
||||
WHERE is_indexed = 1 AND sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
WHERE sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
GROUP BY sha256_hash
|
||||
)) as unique_hashed_size
|
||||
FROM filesystem_state
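As a quick sanity check on the two aggregates above (`total_hashed_size` vs `unique_hashed_size`, and the per-hash `saved_bytes` report), here is a tiny worked example in Python with made-up data; it mirrors the SQL but is not part of the codebase.

```python
# Worked example of the dedup accounting: every hashed copy counts toward
# total_hashed_size, each distinct sha256 group counts once (MIN(size) per
# group) toward unique_hashed_size, and saved_bytes = size * (copies - 1).
files = [  # (sha256_hash, size)
    ("aaa", 100), ("aaa", 100), ("aaa", 100),  # three identical copies
    ("bbb", 50),                               # one unique file
]

total_hashed_size = sum(size for _, size in files)                 # 350
groups: dict[str, list[int]] = {}
for h, size in files:
    groups.setdefault(h, []).append(size)
unique_hashed_size = sum(min(sizes) for sizes in groups.values())  # 150
saved_bytes = {
    h: sizes[0] * (len(sizes) - 1)
    for h, sizes in groups.items()
    if len(sizes) > 1
}
print(total_hashed_size, unique_hashed_size, saved_bytes)  # 350 150 {'aaa': 200}
```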
@@ -453,7 +453,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
COUNT(*) as copy_count,
|
||||
(size * (COUNT(*) - 1)) as saved_bytes
|
||||
FROM filesystem_state
|
||||
WHERE is_indexed = 1 AND sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
WHERE sha256_hash IS NOT NULL AND is_ignored = 0
|
||||
GROUP BY sha256_hash
|
||||
HAVING copy_count > 1
|
||||
ORDER BY saved_bytes DESC
|
||||
@@ -465,7 +465,8 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
directory_aggregation_sql = text("""
|
||||
SELECT
|
||||
RTRIM(file_path, REPLACE(file_path, '/', '')) as dir_path,
|
||||
SUM(size) as byte_total
|
||||
SUM(size) as byte_total,
|
||||
MAX(mtime) as latest_mtime
|
||||
FROM filesystem_state
|
||||
WHERE is_ignored = 0
|
||||
GROUP BY dir_path
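The `RTRIM(file_path, REPLACE(file_path, '/', ''))` expression in the directory aggregation above is a SQLite idiom for "parent directory with trailing slash": `REPLACE` yields every non-slash character of the path, and `RTRIM` then strips those characters from the right until it reaches the last `/`. A standalone check, purely illustrative:

```python
# Verify the SQLite dirname idiom used by directory_aggregation_sql.
import sqlite3

con = sqlite3.connect(":memory:")
path = "/home/user/photos/holiday.jpg"
row = con.execute(
    "SELECT RTRIM(?, REPLACE(?, '/', ''))",
    (path, path),
).fetchone()
print(row[0])  # -> /home/user/photos/
```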
@@ -474,7 +475,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
|
||||
# Hierarchical tree construction
|
||||
nested_dir_map = {}
|
||||
for path_str, size_val in all_directories:
|
||||
for path_str, size_val, mtime_val in all_directories:
|
||||
if not path_str:
|
||||
continue
|
||||
path_segments = [p for p in path_str.split("/") if p]
|
||||
@@ -492,10 +493,14 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
if segment not in current_node:
|
||||
current_node[segment] = {
|
||||
"size": 0,
|
||||
"mtime": 0,
|
||||
"children": {},
|
||||
"fullPath": accumulated_path,
|
||||
}
|
||||
current_node[segment]["size"] += size_val
|
||||
current_node[segment]["mtime"] = max(
|
||||
current_node[segment]["mtime"], mtime_val or 0
|
||||
)
|
||||
current_node = current_node[segment]["children"]
|
||||
|
||||
# Collapse unhelpful single-child roots
|
||||
@@ -517,6 +522,7 @@ def get_system_analytics(db_session: Session = Depends(get_db)):
|
||||
{
|
||||
"path": key,
|
||||
"size": value["size"],
|
||||
"mtime": value["mtime"],
|
||||
"fullPath": value["fullPath"],
|
||||
"children": children_list,
|
||||
}
|
||||
@@ -877,12 +883,10 @@ def get_archive_item_metadata(path: str, db_session: Session = Depends(get_db)):
|
||||
return ItemMetadataSchema(
|
||||
id=item.id,
|
||||
path=item.file_path,
|
||||
type="file",
|
||||
size=item.size,
|
||||
mtime=datetime.fromtimestamp(item.mtime, tz=timezone.utc),
|
||||
last_seen_timestamp=item.last_seen_timestamp,
|
||||
sha256_hash=item.sha256_hash,
|
||||
is_indexed=item.is_indexed,
|
||||
is_ignored=item.is_ignored,
|
||||
versions=versions,
|
||||
)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import List, Optional
|
||||
from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy.orm import Session, joinedload
|
||||
from sqlalchemy import text
|
||||
from app.db.database import get_db, SessionLocal
|
||||
@@ -17,14 +17,13 @@ router = APIRouter(prefix="/restores", tags=["Restores"])
|
||||
|
||||
|
||||
class CartItemSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
file_path: str
|
||||
size: int
|
||||
type: str
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class RestoreTriggerRequest(BaseModel):
|
||||
destination_path: str
|
||||
@@ -115,7 +114,9 @@ def add_directory_to_recovery_queue(
|
||||
AND rc.id IS NULL
|
||||
""")
|
||||
|
||||
db_session.execute(discovery_sql, {"prefix": f"{target_directory}%", "now": now})
|
||||
db_session.execute(
|
||||
discovery_sql, {"prefix": f"{target_directory}%", "now": now.isoformat()}
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
total_in_queue = db_session.query(models.RestoreCart).count()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from typing import Optional, List, Dict, Any
|
||||
from datetime import datetime
|
||||
|
||||
@@ -17,7 +17,6 @@ class ItemMetadataSchema(BaseModel):
|
||||
mtime: datetime
|
||||
last_seen_timestamp: Optional[datetime] = None
|
||||
sha256_hash: Optional[str] = None
|
||||
is_indexed: bool = False
|
||||
is_ignored: bool = False
|
||||
child_count: Optional[int] = 0
|
||||
selected: bool = False
|
||||
@@ -44,8 +43,7 @@ class MediaSchema(BaseModel):
|
||||
host_total_bytes: Optional[int] = None
|
||||
live_info: Optional[Dict[str, Any]] = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
class MediaCreateSchema(BaseModel):
|
||||
|
||||
@@ -8,7 +8,7 @@ import pathspec
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||
from fastapi.responses import FileResponse, StreamingResponse
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy import func, text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -37,6 +37,8 @@ class DashboardStatsSchema(BaseModel):
|
||||
|
||||
|
||||
class JobSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
job_type: str
|
||||
status: str
|
||||
@@ -47,9 +49,6 @@ class JobSchema(BaseModel):
|
||||
completed_at: Optional[datetime] = None
|
||||
created_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class FileItemSchema(BaseModel):
|
||||
name: str
|
||||
@@ -172,7 +171,7 @@ def get_dashboard_stats(db_session: Session = Depends(get_db)):
|
||||
SUM(CASE WHEN is_ignored = 1 THEN size ELSE 0 END) as ignored_size,
|
||||
SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN 1 ELSE 0 END) as unprotected_count,
|
||||
SUM(CASE WHEN is_ignored = 0 AND id NOT IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as unprotected_size,
|
||||
SUM(CASE WHEN is_indexed = 1 AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count,
|
||||
SUM(CASE WHEN sha256_hash IS NOT NULL AND is_ignored = 0 THEN 1 ELSE 0 END) as hashed_count,
|
||||
SUM(CASE WHEN is_ignored = 0 THEN 1 ELSE 0 END) as eligible_count,
|
||||
SUM(CASE WHEN id IN (SELECT filesystem_state_id FROM file_versions) THEN size ELSE 0 END) as archived_size
|
||||
FROM filesystem_state
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
"""
|
||||
Standardized secret management and application configuration.
|
||||
Values can be overridden via environment variables or a .env file.
|
||||
"""
|
||||
|
||||
model_config = SettingsConfigDict(
|
||||
env_file=".env", env_file_encoding="utf-8", extra="ignore"
|
||||
)
|
||||
|
||||
# Database
|
||||
database_url: str = "sqlite:///./tapehoard.db"
|
||||
|
||||
# Security / Encryption
|
||||
# Standardized secret management pattern
|
||||
encryption_passphrase: str = "tapehoard-default-insecure-passphrase"
|
||||
|
||||
# Staging
|
||||
staging_directory: str = "/staging"
|
||||
|
||||
# Cloud Defaults
|
||||
default_s3_region: str = "us-east-1"
|
||||
|
||||
# Hardware Detection
|
||||
default_tape_drive: str = "/dev/nst0"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
@@ -19,7 +19,6 @@ class FilesystemState(Base):
|
||||
sha256_hash: Mapped[Optional[str]] = mapped_column(
|
||||
String, index=True, nullable=True
|
||||
)
|
||||
is_indexed: Mapped[bool] = mapped_column(Boolean, default=False) # True if hashed
|
||||
is_ignored: Mapped[bool] = mapped_column(
|
||||
Boolean, default=False
|
||||
) # True if matches exclusion
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
CREATE TABLE filesystem_state (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file_path TEXT UNIQUE,
|
||||
size BIGINT,
|
||||
mtime FLOAT,
|
||||
sha256_hash TEXT,
|
||||
is_ignored BOOLEAN DEFAULT 0,
|
||||
last_seen_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE storage_media (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
media_type TEXT,
|
||||
identifier TEXT UNIQUE,
|
||||
generation_tier TEXT,
|
||||
capacity BIGINT,
|
||||
bytes_used BIGINT DEFAULT 0,
|
||||
location TEXT,
|
||||
status TEXT DEFAULT 'active',
|
||||
extra_config TEXT,
|
||||
priority_index INTEGER DEFAULT 0,
|
||||
last_seen DATETIME,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE file_versions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
filesystem_state_id INTEGER,
|
||||
media_id INTEGER,
|
||||
file_number TEXT,
|
||||
offset_in_tar INTEGER,
|
||||
is_split BOOLEAN DEFAULT 0,
|
||||
split_id TEXT,
|
||||
offset_start BIGINT DEFAULT 0,
|
||||
offset_end BIGINT DEFAULT 0,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id),
|
||||
FOREIGN KEY(media_id) REFERENCES storage_media(id)
|
||||
);
|
||||
|
||||
CREATE TABLE tracked_sources (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
path TEXT UNIQUE,
|
||||
is_directory BOOLEAN DEFAULT 1,
|
||||
action TEXT DEFAULT 'include',
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE restore_cart (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
filesystem_state_id INTEGER,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY(filesystem_state_id) REFERENCES filesystem_state(id)
|
||||
);
|
||||
|
||||
CREATE TABLE jobs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
job_type TEXT,
|
||||
status TEXT DEFAULT 'PENDING',
|
||||
progress FLOAT DEFAULT 0.0,
|
||||
current_task TEXT,
|
||||
error_message TEXT,
|
||||
started_at DATETIME,
|
||||
completed_at DATETIME,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE system_settings (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT,
|
||||
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- FTS5 Virtual Table for Instant Search
|
||||
CREATE VIRTUAL TABLE filesystem_fts USING fts5(
|
||||
file_path,
|
||||
content='filesystem_state',
|
||||
content_rowid='id'
|
||||
);
|
||||
|
||||
-- Trigger to keep FTS5 synchronized with real state
|
||||
CREATE TRIGGER filesystem_fts_insert AFTER INSERT ON filesystem_state BEGIN
|
||||
INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER filesystem_fts_delete AFTER DELETE ON filesystem_state BEGIN
|
||||
INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER filesystem_fts_update AFTER UPDATE ON filesystem_state BEGIN
|
||||
INSERT INTO filesystem_fts(filesystem_fts, rowid, file_path) VALUES('delete', old.id, old.file_path);
|
||||
INSERT INTO filesystem_fts(rowid, file_path) VALUES (new.id, new.file_path);
|
||||
END;
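With the triggers above keeping `filesystem_fts` in sync with `filesystem_state`, instant path search is a plain FTS5 `MATCH` query. A minimal, illustrative snippet (the table and column names come from the schema dump; the database file name follows the default `sqlite:///./tapehoard.db` setting, and the query itself is only an example):

```python
# Illustrative FTS5 search against the external-content table defined above.
import sqlite3

con = sqlite3.connect("tapehoard.db")  # assumed default database file
rows = con.execute(
    "SELECT rowid, file_path FROM filesystem_fts WHERE filesystem_fts MATCH ? LIMIT 20",
    ("holiday",),
).fetchall()
for rowid, file_path in rows:
    print(rowid, file_path)
```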
@@ -1,3 +1,4 @@
|
||||
import hashlib
|
||||
import boto3
|
||||
import os
|
||||
import io
|
||||
@@ -10,6 +11,8 @@ from Crypto.Cipher import AES
|
||||
from Crypto.Protocol.KDF import PBKDF2
|
||||
from Crypto.Hash import SHA256
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
|
||||
class CloudStorageProvider(AbstractStorageProvider):
|
||||
provider_id = "s3_compat"
|
||||
@@ -48,6 +51,12 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
"title": "Client-Side Encryption Passphrase",
|
||||
"description": "Used to encrypt data locally before uploading via AES-256-GCM.",
|
||||
},
|
||||
"obfuscate_filenames": {
|
||||
"type": "boolean",
|
||||
"title": "Obfuscate Filenames",
|
||||
"description": "Store files as SHA-256 hashes in the cloud to hide metadata.",
|
||||
"default": False,
|
||||
},
|
||||
}
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
@@ -55,9 +64,12 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
self.bucket_name = config.get("bucket_name")
|
||||
self.region = config.get("region", "us-east-1")
|
||||
self.endpoint_url = config.get("endpoint_url")
|
||||
self.obfuscate = config.get("obfuscate_filenames", False)
|
||||
|
||||
# Local Encryption Settings
|
||||
self.passphrase = config.get("encryption_passphrase")
|
||||
# Local Encryption Settings: Use provided or global default
|
||||
self.passphrase = (
|
||||
config.get("encryption_passphrase") or settings.encryption_passphrase
|
||||
)
|
||||
|
||||
# Credentials
|
||||
access_key = config.get("access_key")
|
||||
@@ -80,6 +92,16 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
self.passphrase, salt, dkLen=32, count=100000, hmac_hash_module=SHA256
|
||||
)
|
||||
|
||||
def _get_obfuscated_key(self, prefix: str, path: str) -> str:
|
||||
"""Returns a hashed version of the filename if obfuscation is enabled."""
|
||||
if not self.obfuscate:
|
||||
return f"{prefix}/{path.lstrip('./')}"
|
||||
|
||||
# We hash the path to hide metadata while keeping it deterministic
|
||||
hashed = hashlib.sha256(path.encode("utf-8")).hexdigest()
|
||||
# Use two-level sharding to prevent S3 prefix performance issues with 100k+ files
|
||||
return f"{prefix}/{hashed[:2]}/{hashed[2:4]}/{hashed}"
|
||||
|
||||
def get_name(self) -> str:
|
||||
return f"Cloud ({self.provider_type})"
|
||||
|
||||
@@ -122,7 +144,7 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
try:
|
||||
self.s3.head_bucket(Bucket=self.bucket_name)
|
||||
paginator = self.s3.get_paginator("list_objects_v2")
|
||||
for page in paginator.paginate(Bucket=self.bucket_name, Prefix="archives/"):
|
||||
for page in paginator.paginate(Bucket=self.bucket_name):
|
||||
if "Contents" in page:
|
||||
objects = [{"Key": obj["Key"]} for obj in page["Contents"]]
|
||||
self.s3.delete_objects(
|
||||
@@ -148,7 +170,9 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
"""
|
||||
import uuid
|
||||
|
||||
object_key = f"archives/{uuid.uuid4().hex}.tar"
|
||||
# Archives are always obfuscated by UUID for privacy and collision avoidance
|
||||
raw_key = f"archives/{uuid.uuid4().hex}.tar"
|
||||
object_key = self._get_obfuscated_key("archives", raw_key)
|
||||
|
||||
if self.passphrase:
|
||||
logger.info(
|
||||
@@ -166,7 +190,6 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
ciphertext, tag = cipher.encrypt_and_digest(data)
|
||||
|
||||
# 3. Concatenate and upload
|
||||
# We store everything needed for decryption in the blob itself for maximum portability
|
||||
payload = salt + nonce + tag + ciphertext
|
||||
|
||||
try:
|
||||
@@ -174,7 +197,10 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
Bucket=self.bucket_name,
|
||||
Key=object_key,
|
||||
Body=payload,
|
||||
Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"},
|
||||
Metadata={
|
||||
"x-amz-meta-tapehoard-encrypted": "v2-gcm",
|
||||
"x-amz-meta-tapehoard-type": "archive",
|
||||
},
|
||||
)
|
||||
return object_key
|
||||
except Exception as e:
|
||||
@@ -193,10 +219,9 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
self, media_id: str, relative_path: str, stream: BinaryIO
|
||||
) -> str:
|
||||
"""
|
||||
Uploads a single file directly to the cloud, maintaining structure.
|
||||
Uploads a single file directly to the cloud.
|
||||
"""
|
||||
clean_path = relative_path.lstrip("./")
|
||||
object_key = f"objects/{clean_path}"
|
||||
object_key = self._get_obfuscated_key("objects", relative_path)
|
||||
|
||||
if self.passphrase:
|
||||
logger.info(
|
||||
@@ -221,7 +246,10 @@ class CloudStorageProvider(AbstractStorageProvider):
|
||||
Bucket=self.bucket_name,
|
||||
Key=object_key,
|
||||
Body=payload,
|
||||
Metadata={"x-amz-meta-tapehoard-encrypted": "v2-gcm"},
|
||||
Metadata={
|
||||
"x-amz-meta-tapehoard-encrypted": "v2-gcm",
|
||||
"x-amz-meta-tapehoard-type": "object",
|
||||
},
|
||||
)
|
||||
return object_key
|
||||
except Exception as e:
|
||||
|
||||
@@ -145,7 +145,6 @@ class ArchiverService:
|
||||
models.FilesystemState.id == coverage_subquery.c.filesystem_state_id,
|
||||
)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed,
|
||||
not_(models.FilesystemState.is_ignored),
|
||||
(coverage_subquery.c.covered_bytes.is_(None))
|
||||
| (coverage_subquery.c.covered_bytes < models.FilesystemState.size),
|
||||
@@ -723,11 +722,7 @@ class ArchiverService:
|
||||
media_record.identifier, archive_id
|
||||
)
|
||||
|
||||
is_tar = (
|
||||
not provider.capabilities.get("supports_random_access")
|
||||
or str(archive_id).endswith(".tar")
|
||||
or str(archive_id).isdigit()
|
||||
)
|
||||
is_tar = self._test_is_tar_logic(provider, media_record, archive_id)
|
||||
|
||||
if not is_tar:
|
||||
# Format Negotiation: Direct file recovery
|
||||
@@ -778,7 +773,12 @@ class ArchiverService:
|
||||
)
|
||||
continue
|
||||
|
||||
tar_mode = "r|*" if media_record.media_type == "tape" else "r:*"
|
||||
tar_mode = (
|
||||
"r|*"
|
||||
if media_record.media_type
|
||||
in ["tape", "lto_tape", "cloud", "s3_compat"]
|
||||
else "r:*"
|
||||
)
|
||||
|
||||
with tarfile.open(fileobj=bitstream, mode=tar_mode) as tar_bundle:
|
||||
normalized_map = {}
|
||||
@@ -854,5 +854,26 @@ class ArchiverService:
|
||||
logger.exception(f"Restore failed: {e}")
|
||||
JobManager.fail_job(job_id, str(e))
|
||||
|
||||
def _test_is_tar_logic(self, provider, media_record, archive_id: str) -> bool:
|
||||
"""Internal logic to determine if a location_id refers to a tarball vs a native file."""
|
||||
# 1. If provider doesn't support random access (Tape), it's ALWAYS a tar stream.
|
||||
if not provider.capabilities.get("supports_random_access"):
|
||||
return True
|
||||
|
||||
# 2. Explicit extensions or prefixes
|
||||
if str(archive_id).endswith(".tar") or "archives/" in str(archive_id):
|
||||
return True
|
||||
|
||||
# 3. Tape file numbers (if they happen to be passed here for cloud, which is rare)
|
||||
if str(archive_id).isdigit() and media_record.media_type in [
|
||||
"tape",
|
||||
"lto_tape",
|
||||
]:
|
||||
return True
|
||||
|
||||
# 4. Otherwise, if it has random access (Cloud/HDD) and isn't explicitly an archive,
|
||||
# it's a native file.
|
||||
return False
|
||||
|
||||
|
||||
archiver_manager = ArchiverService()
|
||||
|
||||
@@ -29,55 +29,83 @@ class JobManager:
|
||||
@staticmethod
|
||||
def start_job(job_id: int):
|
||||
"""Marks a job as running and sets the start timestamp."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "RUNNING"
|
||||
job_record.started_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "RUNNING"
|
||||
job_record.started_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
logger.debug(
|
||||
f"Job {job_id} already modified or deleted (StaleDataError)."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def update_job(job_id: int, progress: float, current_task: str):
|
||||
"""Updates the progress and current task description for a job."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.progress = progress
|
||||
job_record.current_task = current_task
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.progress = progress
|
||||
job_record.current_task = current_task
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def complete_job(job_id: int):
|
||||
"""Marks a job as successfully completed."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "COMPLETED"
|
||||
job_record.progress = 100.0
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "COMPLETED"
|
||||
job_record.progress = 100.0
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def fail_job(job_id: int, error_message: str):
|
||||
"""Marks a job as failed and records the error message."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = error_message
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = error_message
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def cancel_job(job_id: int):
|
||||
"""Submits a cancellation request for a pending or running job."""
|
||||
from sqlalchemy.orm.exc import StaleDataError
|
||||
|
||||
with SessionLocal() as db_session:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record and job_record.status in ["PENDING", "RUNNING"]:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = "Cancelled by user"
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
try:
|
||||
job_record = db_session.get(models.Job, job_id)
|
||||
if job_record and job_record.status in ["PENDING", "RUNNING"]:
|
||||
job_record.status = "FAILED"
|
||||
job_record.error_message = "Cancelled by user"
|
||||
job_record.completed_at = datetime.now(timezone.utc)
|
||||
db_session.commit()
|
||||
except StaleDataError:
|
||||
db_session.rollback()
|
||||
|
||||
@staticmethod
|
||||
def is_cancelled(job_id: int) -> bool:
|
||||
@@ -354,7 +382,6 @@ class ScannerService:
|
||||
mtime=file_meta["mtime"],
|
||||
is_ignored=file_meta["ignored"],
|
||||
last_seen_timestamp=timestamp,
|
||||
is_indexed=False,
|
||||
)
|
||||
)
|
||||
else:
|
||||
@@ -363,7 +390,7 @@ class ScannerService:
|
||||
or record.mtime != file_meta["mtime"]
|
||||
)
|
||||
if metadata_changed:
|
||||
record.is_indexed = False
|
||||
record.sha256_hash = None
|
||||
with self._metrics_lock:
|
||||
self.files_modified += 1
|
||||
|
||||
@@ -396,7 +423,7 @@ class ScannerService:
|
||||
total_pending = (
|
||||
db_session.query(models.FilesystemState)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed.is_(False),
|
||||
models.FilesystemState.sha256_hash.is_(None),
|
||||
models.FilesystemState.is_ignored.is_(False),
|
||||
)
|
||||
.count()
|
||||
@@ -408,7 +435,7 @@ class ScannerService:
|
||||
hashing_targets = (
|
||||
db_session.query(models.FilesystemState)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed.is_(False),
|
||||
models.FilesystemState.sha256_hash.is_(None),
|
||||
models.FilesystemState.is_ignored.is_(False),
|
||||
)
|
||||
.limit(100)
|
||||
@@ -422,7 +449,7 @@ class ScannerService:
|
||||
total_pending = (
|
||||
db_session.query(models.FilesystemState)
|
||||
.filter(
|
||||
models.FilesystemState.is_indexed.is_(False),
|
||||
models.FilesystemState.sha256_hash.is_(None),
|
||||
models.FilesystemState.is_ignored.is_(False),
|
||||
)
|
||||
.count()
|
||||
@@ -451,7 +478,6 @@ class ScannerService:
|
||||
|
||||
if computed_hash:
|
||||
target_record.sha256_hash = computed_hash
|
||||
target_record.is_indexed = True
|
||||
self.files_hashed += 1
|
||||
|
||||
if self.files_hashed % 5 == 0:
|
||||
|
||||
@@ -66,7 +66,6 @@ def test_get_insights(client, db_session):
|
||||
file_path="/source/f1.txt",
|
||||
size=100,
|
||||
mtime=1000,
|
||||
is_indexed=True,
|
||||
sha256_hash="hash1",
|
||||
)
|
||||
db_session.add(file1)
|
||||
@@ -127,7 +126,6 @@ def test_search_index(client, db_session):
|
||||
file_path="data/important.doc",
|
||||
size=500,
|
||||
mtime=2000,
|
||||
is_indexed=True,
|
||||
sha256_hash="hash",
|
||||
)
|
||||
db_session.add(file1)
|
||||
|
||||
@@ -21,7 +21,6 @@ def test_get_dashboard_stats_populated(client, db_session):
|
||||
size=1024,
|
||||
mtime=datetime.now(timezone.utc).timestamp(),
|
||||
is_ignored=False,
|
||||
is_indexed=True,
|
||||
)
|
||||
db_session.add(file_state)
|
||||
db_session.commit()
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
from app.core.utils import get_path_uuid
|
||||
|
||||
|
||||
def test_get_path_uuid_macos(mocker):
|
||||
"""Verifies UUID extraction from diskutil on macOS."""
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
mocker.patch("sys.platform", "darwin")
|
||||
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
mock_res = mocker.MagicMock()
|
||||
mock_res.stdout = """
|
||||
Device Identifier: disk4s1
|
||||
Device Node: /dev/disk4s1
|
||||
Volume Name: MyBackups
|
||||
Volume UUID: ABCDEF-1234-5678-90AB-CDEF12345678
|
||||
"""
|
||||
mock_run.return_value = mock_res
|
||||
|
||||
uuid = get_path_uuid("/Volumes/MyBackups")
|
||||
assert uuid == "ABCDEF-1234-5678-90AB-CDEF12345678"
|
||||
mock_run.assert_called_with(
|
||||
["diskutil", "info", "/Volumes/MyBackups"], capture_output=True, text=True
|
||||
)
|
||||
|
||||
|
||||
def test_get_path_uuid_linux(mocker):
|
||||
"""Verifies UUID extraction from lsblk on Linux."""
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
mocker.patch("sys.platform", "linux")
|
||||
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
|
||||
def mock_subprocess(cmd, **kwargs):
|
||||
m = mocker.MagicMock()
|
||||
if "df" in cmd:
|
||||
m.stdout = "Filesystem\n/dev/sdb1"
|
||||
elif "lsblk" in cmd:
|
||||
m.stdout = "98765432-ABCD-EF01-2345-6789ABCDEF01"
|
||||
return m
|
||||
|
||||
mock_run.side_effect = mock_subprocess
|
||||
|
||||
uuid = get_path_uuid("/mnt/hdd")
|
||||
assert uuid == "98765432-ABCD-EF01-2345-6789ABCDEF01"
|
||||
@@ -0,0 +1,65 @@
|
||||
import hashlib
|
||||
import pytest
|
||||
from app.providers.cloud import CloudStorageProvider
|
||||
|
||||
|
||||
def test_cloud_provider_obfuscation_logic():
|
||||
"""Verifies that filename hashing and sharding works as expected."""
|
||||
|
||||
# CASE 1: Obfuscation Disabled
|
||||
config_plain = {
|
||||
"bucket_name": "test-bucket",
|
||||
"obfuscate_filenames": False,
|
||||
"access_key": "fake",
|
||||
"secret_key": "fake",
|
||||
}
|
||||
provider_plain = CloudStorageProvider(config_plain)
|
||||
path = "documents/secret_plan.pdf"
|
||||
|
||||
# Expectation: Key is exactly the path with prefix
|
||||
key_plain = provider_plain._get_obfuscated_key("objects", path)
|
||||
assert key_plain == "objects/documents/secret_plan.pdf"
|
||||
|
||||
# CASE 2: Obfuscation Enabled
|
||||
config_hidden = {
|
||||
"bucket_name": "test-bucket",
|
||||
"obfuscate_filenames": True,
|
||||
"access_key": "fake",
|
||||
"secret_key": "fake",
|
||||
}
|
||||
provider_hidden = CloudStorageProvider(config_hidden)
|
||||
|
||||
# Expectation: Key is hashed and sharded
|
||||
# hash of "documents/secret_plan.pdf"
|
||||
expected_hash = hashlib.sha256(path.encode("utf-8")).hexdigest()
|
||||
expected_prefix = f"objects/{expected_hash[:2]}/{expected_hash[2:4]}"
|
||||
|
||||
key_hidden = provider_hidden._get_obfuscated_key("objects", path)
|
||||
|
||||
assert key_hidden.startswith("objects/")
|
||||
assert key_hidden == f"{expected_prefix}/{expected_hash}"
|
||||
assert "secret_plan.pdf" not in key_hidden
|
||||
|
||||
|
||||
def test_cloud_secret_fallback(mocker):
|
||||
"""Verifies that the provider prioritizes local config over global settings for passphrases."""
|
||||
from app.core.config import settings
|
||||
|
||||
# Mock global settings
|
||||
mocker.patch.object(settings, "encryption_passphrase", "global-fallback")
|
||||
|
||||
# CASE 1: Local config provides passphrase
|
||||
config_local = {"bucket_name": "b", "encryption_passphrase": "local-override"}
|
||||
provider_local = CloudStorageProvider(config_local)
|
||||
assert provider_local.passphrase == "local-override"
|
||||
|
||||
# CASE 2: Local config is empty, should fallback to global
|
||||
config_empty = {"bucket_name": "b"}
|
||||
provider_fallback = CloudStorageProvider(config_empty)
|
||||
assert provider_fallback.passphrase == "global-fallback"
|
||||
|
||||
# CASE 3: No passphrase anywhere (ValueError on key derivation)
|
||||
mocker.patch.object(settings, "encryption_passphrase", "")
|
||||
provider_none = CloudStorageProvider({"bucket_name": "b"})
|
||||
with pytest.raises(ValueError, match="No encryption passphrase configured"):
|
||||
provider_none._derive_key(b"salt")
|
||||
@@ -0,0 +1,99 @@
|
||||
import io
|
||||
import pytest
|
||||
from app.providers.hdd import OfflineHDDProvider
|
||||
|
||||
|
||||
def test_hdd_initialization(tmp_path):
|
||||
"""Verifies that HDD provider correctly prepares the disk structure."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
|
||||
success = provider.initialize_media("DISK01")
|
||||
assert success is True
|
||||
assert (mount / ".tapehoard_id").exists()
|
||||
assert (mount / ".tapehoard_id").read_text().strip() == "DISK01"
|
||||
assert (mount / "tapehoard_backups" / "archives").is_dir()
|
||||
|
||||
|
||||
def test_hdd_identification(tmp_path):
|
||||
"""Verifies that HDD provider can identify a disk by its ID file."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
mount.mkdir()
|
||||
(mount / ".tapehoard_id").write_text("DISK02")
|
||||
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
assert provider.identify_media() == "DISK02"
|
||||
|
||||
|
||||
def test_hdd_write_sequential_logic(tmp_path):
|
||||
"""Verifies that archives are numbered and padded correctly."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
provider.initialize_media("DISK03")
|
||||
|
||||
# Write first archive
|
||||
loc1 = provider.write_archive("DISK03", io.BytesIO(b"archive1"))
|
||||
assert loc1 == "0"
|
||||
assert (mount / "tapehoard_backups" / "archives" / "000000.tar").exists()
|
||||
|
||||
# Write second archive
|
||||
loc2 = provider.write_archive("DISK03", io.BytesIO(b"archive2"))
|
||||
assert loc2 == "1"
|
||||
assert (mount / "tapehoard_backups" / "archives" / "000001.tar").exists()
|
||||
|
||||
# Read them back
|
||||
with provider.read_archive("DISK03", "0") as f:
|
||||
assert f.read() == b"archive1"
|
||||
with provider.read_archive("DISK03", "1") as f:
|
||||
assert f.read() == b"archive2"
|
||||
|
||||
|
||||
def test_hdd_direct_write_and_traversal_guard(tmp_path):
|
||||
"""Verifies direct file copies and security guards."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
provider = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
|
||||
# Valid direct write
|
||||
rel_path = "photos/holiday.jpg"
|
||||
loc = provider.write_file_direct("DISK04", rel_path, io.BytesIO(b"imagedata"))
|
||||
assert loc == rel_path
|
||||
|
||||
target = mount / "tapehoard_backups" / "objects" / rel_path
|
||||
assert target.exists()
|
||||
assert target.read_bytes() == b"imagedata"
|
||||
|
||||
# Read back (Format Negotiation)
|
||||
with provider.read_archive("DISK04", rel_path) as f:
|
||||
assert f.read() == b"imagedata"
|
||||
|
||||
# Traversal Attempt
|
||||
with pytest.raises(ValueError, match="Invalid relative path"):
|
||||
provider.write_file_direct(
|
||||
"DISK04", "escape/../../secret.txt", io.BytesIO(b"bad")
|
||||
)
|
||||
|
||||
|
||||
def test_hdd_online_check_with_uuid(tmp_path, mocker):
|
||||
"""Verifies that online check validates both path and UUID."""
|
||||
mount = tmp_path / "mnt_hdd"
|
||||
mount.mkdir()
|
||||
|
||||
# Mock UUID utility
|
||||
mock_get_uuid = mocker.patch("app.core.utils.get_path_uuid")
|
||||
mock_get_uuid.return_value = "UUID-123"
|
||||
|
||||
# CASE 1: Correct UUID
|
||||
provider_ok = OfflineHDDProvider(
|
||||
{"mount_path": str(mount), "device_uuid": "UUID-123"}
|
||||
)
|
||||
assert provider_ok.check_online() is True
|
||||
|
||||
# CASE 2: Mismatched UUID
|
||||
provider_fail = OfflineHDDProvider(
|
||||
{"mount_path": str(mount), "device_uuid": "UUID-999"}
|
||||
)
|
||||
assert provider_fail.check_online() is False
|
||||
|
||||
# CASE 3: No UUID (Path only)
|
||||
provider_path_only = OfflineHDDProvider({"mount_path": str(mount)})
|
||||
assert provider_path_only.check_online() is True
|
||||
@@ -0,0 +1,122 @@
|
||||
from app.providers.tape import LTOProvider
|
||||
|
||||
|
||||
def test_lto_compression_control(mocker):
|
||||
"""Verifies that LTOProvider sends the correct mt commands for compression."""
|
||||
|
||||
# Mock subprocess.run and os.path.exists
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
|
||||
# CASE 1: Compression Enabled
|
||||
provider_on = LTOProvider({"device_path": "/dev/nst0", "compression": True})
|
||||
provider_on._setup_compression()
|
||||
|
||||
# Expectation: mt compression 1
|
||||
mock_run.assert_called_with(
|
||||
["mt", "-f", "/dev/nst0", "compression", "1"], check=True, capture_output=True
|
||||
)
|
||||
|
||||
# CASE 2: Compression Disabled
|
||||
mock_run.reset_mock()
|
||||
provider_off = LTOProvider({"device_path": "/dev/nst0", "compression": False})
|
||||
provider_off._setup_compression()
|
||||
|
||||
# Expectation: mt compression 0
|
||||
mock_run.assert_called_with(
|
||||
["mt", "-f", "/dev/nst0", "compression", "0"], check=True, capture_output=True
|
||||
)
|
||||
|
||||
|
||||
def test_lto_insertion_detection(mocker):
|
||||
"""Verifies that needs_registration triggers only on new insertion."""
|
||||
|
||||
device = "/dev/nst0"
|
||||
# 1. Start with Empty State
|
||||
LTOProvider._lkg_state = {
|
||||
device: {"drive": {}, "mam": {}, "online": False, "last_check": 0.0}
|
||||
}
|
||||
provider = LTOProvider({"device_path": device})
|
||||
|
||||
# Mock OS path existence
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
|
||||
# Mock subprocess.run to return "READY" for status and valid MAM attributes
|
||||
def mock_subprocess(cmd, **kwargs):
|
||||
m = mocker.MagicMock()
|
||||
m.returncode = 0
|
||||
if "status" in cmd:
|
||||
m.stdout = "READY"
|
||||
elif "sg_read_attr" in cmd:
|
||||
# Mock raw MAM data bytes (minimal valid structure)
|
||||
# barcode "TAPE01" is at the end of the mapping usually
|
||||
# But we patch get_mam_info to be easier
|
||||
pass
|
||||
return m
|
||||
|
||||
mocker.patch("subprocess.run", side_effect=mock_subprocess)
|
||||
|
||||
# We patch get_mam_info because parsing raw bytes in a test is overkill
|
||||
mocker.patch.object(provider, "get_mam_info", return_value={"barcode": "TAPE01"})
|
||||
mocker.patch.object(provider, "get_drive_info", return_value={"vendor": "HP"})
|
||||
|
||||
# Execution: First poll after "insertion" (transition from offline LKG to online)
|
||||
info = provider.get_live_info(force=True)
|
||||
|
||||
# EXPECTATION: needs_registration should be True
|
||||
assert info["online"] is True
|
||||
assert info["identity"] == "TAPE01"
|
||||
assert info["needs_registration"] is True
|
||||
|
||||
# 2. Second poll (already online)
|
||||
# LKG state is now updated by the provider during the first poll
|
||||
info_second = provider.get_live_info(force=True)
|
||||
|
||||
# EXPECTATION: needs_registration should be False (already saw this tape)
|
||||
assert info_second["needs_registration"] is False
|
||||
|
||||
|
||||
def test_lto_mam_parsing_logic(mocker):
|
||||
"""Verifies the parsing of raw SCSI attribute bytes into human-readable metadata."""
|
||||
import struct
|
||||
|
||||
device = "/dev/nst0"
|
||||
provider = LTOProvider({"device_path": device})
|
||||
mocker.patch("os.path.exists", return_value=True)
|
||||
|
||||
# 1. CONSTRUCT RAW PAYLOAD
|
||||
# SCSI MAM raw format: [TotalLen(4)] + { [ID(2)][Flags(1)][Len(2)][Value] }...
|
||||
|
||||
def pack_attr(attr_id, value_bytes):
|
||||
return struct.pack(">HBH", attr_id, 0, len(value_bytes)) + value_bytes
|
||||
|
||||
# Remaining Capacity (0x0000): 500GB (512000 MiB)
|
||||
attr_rem = pack_attr(0x0000, (512000).to_bytes(8, "big"))
|
||||
# Max Capacity (0x0001): 1.5TB (1536000 MiB - LTO-5 raw)
|
||||
attr_max = pack_attr(0x0001, (1536000).to_bytes(8, "big"))
|
||||
# Tape Alert Flags (0x0002): Bit 20 is "Clean Now"
|
||||
# Bit 20 from left (64-bit int) -> (1 << (64-20))
|
||||
alert_flags = 1 << (64 - 20)
|
||||
attr_alerts = pack_attr(0x0002, alert_flags.to_bytes(8, "big"))
|
||||
# Barcode (0x0806): "TAPE123"
|
||||
attr_barcode = pack_attr(0x0806, b"TAPE123\x00\x00") # null padded
|
||||
|
||||
payload_body = attr_rem + attr_max + attr_alerts + attr_barcode
|
||||
full_payload = struct.pack(">I", len(payload_body)) + payload_body
|
||||
|
||||
# 2. MOCK SUBPROCESS
|
||||
mock_res = mocker.MagicMock()
|
||||
mock_res.returncode = 0
|
||||
mock_res.stdout = full_payload
|
||||
mocker.patch("subprocess.run", return_value=mock_res)
|
||||
|
||||
# 3. EXECUTION
|
||||
mam = provider.get_mam_info(force=True)
|
||||
|
||||
# 4. EXPECTATIONS
|
||||
assert mam["remaining_capacity_mib"] == 512000
|
||||
assert mam["max_capacity_mib"] == 1536000
|
||||
assert mam["barcode"] == "TAPE123"
|
||||
assert "Clean Now" in mam["alerts"]
|
||||
# 1.5TB should be identified as LTO-5
|
||||
assert mam["generation_label"] == "LTO-5"
|
||||
@@ -1,5 +1,6 @@
|
||||
import tarfile
|
||||
import io
|
||||
import pytest
|
||||
from app.services.archiver import ArchiverService, RangeFile
|
||||
from app.db import models
|
||||
|
||||
@@ -24,17 +25,11 @@ def test_get_unbacked_files(db_session):
|
||||
archiver = ArchiverService()
|
||||
|
||||
# File 1: Completely unbacked
|
||||
f1 = models.FilesystemState(
|
||||
file_path="/data/new.txt", size=100, mtime=1, is_indexed=True
|
||||
)
|
||||
f1 = models.FilesystemState(file_path="/data/new.txt", size=100, mtime=1)
|
||||
# File 2: Partially backed (split)
|
||||
f2 = models.FilesystemState(
|
||||
file_path="/data/split.bin", size=1000, mtime=1, is_indexed=True
|
||||
)
|
||||
f2 = models.FilesystemState(file_path="/data/split.bin", size=1000, mtime=1)
|
||||
# File 3: Fully backed
|
||||
f3 = models.FilesystemState(
|
||||
file_path="/data/done.txt", size=50, mtime=1, is_indexed=True
|
||||
)
|
||||
f3 = models.FilesystemState(file_path="/data/done.txt", size=50, mtime=1)
|
||||
db_session.add_all([f1, f2, f3])
|
||||
db_session.flush()
|
||||
|
||||
@@ -89,18 +84,12 @@ def test_assemble_backup_batch(db_session):
|
||||
|
||||
# Create files
|
||||
size_200mb = 200 * 1024 * 1024
|
||||
f1 = models.FilesystemState(
|
||||
file_path="/f1.bin", size=size_200mb, mtime=1, is_indexed=True
|
||||
)
|
||||
f1 = models.FilesystemState(file_path="/f1.bin", size=size_200mb, mtime=1)
|
||||
# f2 is 200MB, would fit on fresh tape. Should be skipped on this 300MB tape
|
||||
# after f1 (200MB) is added, because only 100MB is left.
|
||||
f2 = models.FilesystemState(
|
||||
file_path="/f2.bin", size=size_200mb, mtime=1, is_indexed=True
|
||||
)
|
||||
f2 = models.FilesystemState(file_path="/f2.bin", size=size_200mb, mtime=1)
|
||||
# f3 is 500MB, larger than total capacity (300MB). SHOULD be split.
|
||||
f3 = models.FilesystemState(
|
||||
file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1, is_indexed=True
|
||||
)
|
||||
f3 = models.FilesystemState(file_path="/f3.bin", size=500 * 1024 * 1024, mtime=1)
|
||||
db_session.add_all([f1, f2, f3])
|
||||
db_session.commit()
|
||||
|
||||
@@ -138,7 +127,6 @@ def test_run_backup_mocked(db_session, mocker, tmp_path):
|
||||
file_path=str(source_file),
|
||||
size=source_file.stat().st_size,
|
||||
mtime=1,
|
||||
is_indexed=True,
|
||||
sha256_hash="hash1",
|
||||
)
|
||||
db_session.add(f1)
|
||||
@@ -164,14 +152,227 @@ def test_run_backup_mocked(db_session, mocker, tmp_path):
|
||||
db_session.expire_all()
|
||||
assert media.bytes_used > 0
|
||||
|
||||
# Verify FileVersion was recorded
|
||||
|
||||
def test_archiver_saturated_media_logic(db_session, mocker, tmp_path):
|
||||
"""Verifies that media is marked full and priority ceded based on hardware feedback."""
|
||||
staging = tmp_path / "staging"
|
||||
staging.mkdir()
|
||||
archiver = ArchiverService(staging_directory=str(staging))
|
||||
|
||||
# Setup Media (HDD for simple mocking)
|
||||
media = models.StorageMedia(
|
||||
media_type="hdd",
|
||||
identifier="FULL_DISK",
|
||||
capacity=1000,
|
||||
status="active",
|
||||
bytes_used=0,
|
||||
priority_index=1,
|
||||
)
|
||||
db_session.add(media)
|
||||
|
||||
# Other media to check priority reordering
|
||||
media2 = models.StorageMedia(
|
||||
media_type="hdd",
|
||||
identifier="NEXT_DISK",
|
||||
capacity=1000,
|
||||
status="active",
|
||||
bytes_used=0,
|
||||
priority_index=2,
|
||||
)
|
||||
db_session.add(media2)
|
||||
|
||||
# Setup a small file to trigger the loop
|
||||
source_file = tmp_path / "small.txt"
|
||||
source_file.write_bytes(b"data")
|
||||
f1 = models.FilesystemState(
|
||||
file_path=str(source_file),
|
||||
size=4,
|
||||
mtime=1,
|
||||
sha256_hash="hash_small",
|
||||
)
|
||||
db_session.add(f1)
|
||||
db_session.commit()
|
||||
|
||||
# Mock Provider to report 99% utilization
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.capabilities = {"supports_random_access": True}
|
||||
mock_provider.identify_media.return_value = "FULL_DISK"
|
||||
mock_provider.prepare_for_write.return_value = True
|
||||
# Ensure write_file_direct returns a string, not a MagicMock
|
||||
mock_provider.write_file_direct.return_value = "LOC_1"
|
||||
# FORCE hardware utilization report to 99%
|
||||
mock_provider.get_utilization.return_value = 0.99
|
||||
|
||||
mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider)
|
||||
|
||||
from app.services.scanner import JobManager
|
||||
|
||||
job = JobManager.create_job(db_session, "BACKUP")
|
||||
|
||||
# Run archival
|
||||
archiver.run_backup(db_session, media.id, job.id)
|
||||
|
||||
# EXPECTATION:
|
||||
# 1. Media should be marked "full"
|
||||
# 2. Priority should be moved to the end (higher than media2)
|
||||
db_session.expire_all()
|
||||
assert media.status == "full"
|
||||
assert media.priority_index > media2.priority_index
|
||||
|
||||
|
||||
def test_archiver_chunking_logic(db_session, mocker, tmp_path):
|
||||
"""Verifies that large backup batches are split into appropriate chunks."""
|
||||
staging = tmp_path / "staging"
|
||||
staging.mkdir()
|
||||
archiver = ArchiverService(staging_directory=str(staging))
|
||||
|
||||
# Setup Media (Capacity 100GB, target chunk size ~1GB)
|
||||
media = models.StorageMedia(
|
||||
media_type="tape",
|
||||
identifier="TAPE_001",
|
||||
capacity=100 * 1024 * 1024 * 1024,
|
||||
status="active",
|
||||
bytes_used=0,
|
||||
)
|
||||
db_session.add(media)
|
||||
|
||||
# 1. Add 10 small files (100MB each) -> Should fit in one 1GB chunk
|
||||
for i in range(10):
|
||||
source_file = tmp_path / f"small_{i}.bin"
|
||||
source_file.write_bytes(b"0" * (100 * 1024 * 1024))
|
||||
f = models.FilesystemState(
|
||||
file_path=str(source_file),
|
||||
size=100 * 1024 * 1024,
|
||||
mtime=1,
|
||||
sha256_hash=f"hash_s_{i}",
|
||||
)
|
||||
db_session.add(f)
|
||||
|
||||
# 2. Add 1 large file (5GB) -> Exceeds chunk size, should trigger its own chunk
|
||||
large_file = tmp_path / "large.bin"
|
||||
large_file.write_bytes(b"0" * 10) # Mock small content for large size metadata
|
||||
f_large = models.FilesystemState(
|
||||
file_path=str(large_file),
|
||||
size=5 * 1024 * 1024 * 1024,
|
||||
mtime=1,
|
||||
sha256_hash="hash_large",
|
||||
)
|
||||
# Monkeypatch stat size for test efficiency (don't actually write 5GB to tmp)
|
||||
mocker.patch("os.path.getsize", return_value=5 * 1024 * 1024 * 1024)
|
||||
# Also patch RangeFile to avoid actual reads
|
||||
mocker.patch(
|
||||
"app.services.archiver.RangeFile.__enter__", return_value=mocker.MagicMock()
|
||||
)
|
||||
mocker.patch("app.services.archiver.RangeFile.__exit__")
|
||||
|
||||
db_session.add(f_large)
|
||||
db_session.commit()
|
||||
|
||||
# Mock Provider
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.capabilities = {"supports_random_access": False}
|
||||
mock_provider.identify_media.return_value = "TAPE_001"
|
||||
mock_provider.prepare_for_write.return_value = True
|
||||
mock_provider.write_archive.return_value = "1"
|
||||
|
||||
mocker.patch.object(archiver, "_get_storage_provider", return_value=mock_provider)
|
||||
|
||||
from app.services.scanner import JobManager
|
||||
|
||||
job = JobManager.create_job(db_session, "BACKUP")
|
||||
|
||||
# Run archival
|
||||
archiver.run_backup(db_session, media.id, job.id)
|
||||
|
||||
# EXPECTATION:
|
||||
# write_archive should be called TWICE:
|
||||
# 1. Once for the 10 small files (combined chunk)
|
||||
# 2. Once for the large file (independent chunk)
|
||||
assert mock_provider.write_archive.call_count == 2
|
||||
|
||||
# Verify FileVersion was recorded for the large file
|
||||
version = (
|
||||
db_session.query(models.FileVersion)
|
||||
.filter_by(filesystem_state_id=f1.id)
|
||||
.filter_by(filesystem_state_id=f_large.id)
|
||||
.first()
|
||||
)
|
||||
assert version is not None
|
||||
assert version.file_number == "ARCH_1"
|
||||
assert version.file_number == "1"
|
||||
|
||||
|
||||
def test_range_file_alignment_guard(tmp_path):
|
||||
"""Verifies that RangeFile pads truncated files with nulls to maintain tar alignment."""
|
||||
from app.services.archiver import RangeFile
|
||||
|
||||
# Create a 50 byte file
|
||||
f = tmp_path / "truncated.bin"
|
||||
f.write_bytes(b"A" * 50)
|
||||
|
||||
# Initialize RangeFile expecting 100 bytes
|
||||
with RangeFile(str(f), offset_start=0, length=100) as rf:
|
||||
data = rf.read(100)
|
||||
|
||||
# EXPECTATION:
|
||||
# 1. Total length must be exactly 100
|
||||
# 2. First 50 are 'A's
|
||||
# 3. Last 50 are nulls '\x00'
|
||||
assert len(data) == 100
|
||||
assert data[:50] == b"A" * 50
|
||||
assert data[50:] == b"\x00" * 50
|
||||
|
||||
|
||||
def test_path_traversal_protection():
|
||||
"""Verifies that the restorer blocks path traversal attempts."""
|
||||
archiver = ArchiverService()
|
||||
base = "/restores/my_backup"
|
||||
|
||||
# CASE 1: Valid Path
|
||||
safe_path = archiver._sanitize_recovery_path(base, "photos/image.jpg")
|
||||
assert safe_path.startswith(base)
|
||||
|
||||
# CASE 2: Traversal Attempt (Relative)
|
||||
with pytest.raises(PermissionError, match="Restricted path traversal"):
|
||||
archiver._sanitize_recovery_path(base, "../../etc/shadow")
|
||||
|
||||
# CASE 3: Traversal Attempt (Absolute-style relative that escapes)
|
||||
with pytest.raises(PermissionError, match="Restricted path traversal"):
|
||||
archiver._sanitize_recovery_path(base, "/../../etc/shadow")
|
||||
|
||||
|
||||
def test_archiver_restoration_negotiation(db_session, mocker, tmp_path):
|
||||
"""Verifies that the restorer chooses the correct format (Native vs Tar)."""
|
||||
archiver = ArchiverService()
|
||||
|
||||
# Setup Mocks
|
||||
mock_media = mocker.MagicMock()
|
||||
mock_media.media_type = "cloud"
|
||||
mock_media.identifier = "BUCKET1"
|
||||
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.identify_media.return_value = "BUCKET1"
|
||||
mock_provider.check_online.return_value = True
|
||||
|
||||
# 1. TEST NATIVE NEGOTIATION
|
||||
# Expectation: random_access=True + non-tar ID -> is_tar=False
|
||||
mock_provider.capabilities = {"supports_random_access": True}
|
||||
|
||||
# CASE A: Obfuscated object (Native)
|
||||
is_tar_native = archiver._test_is_tar_logic(
|
||||
mock_provider, mock_media, "objects/a1/b2/hash"
|
||||
)
|
||||
assert is_tar_native is False
|
||||
|
||||
# CASE B: Obfuscated archive (Tar)
|
||||
is_tar_cloud_archive = archiver._test_is_tar_logic(
|
||||
mock_provider, mock_media, "archives/a1/b2/hash"
|
||||
)
|
||||
assert is_tar_cloud_archive is True
|
||||
|
||||
# CASE C: Tape (Always Tar)
|
||||
mock_media.media_type = "tape"
|
||||
mock_provider.capabilities = {"supports_random_access": False}
|
||||
is_tar_tape = archiver._test_is_tar_logic(mock_provider, mock_media, "1")
|
||||
assert is_tar_tape is True
|
||||
|
||||
|
||||
def test_run_restore_mocked(db_session, mocker, tmp_path):
|
||||
@@ -206,6 +407,9 @@ def test_run_restore_mocked(db_session, mocker, tmp_path):
|
||||
# Mock Provider & Tar Stream
|
||||
mock_provider = mocker.MagicMock()
|
||||
mock_provider.identify_media.return_value = "RESTORE_ME"
|
||||
mock_provider.check_online.return_value = True
|
||||
# FORCE tar path by disabling random access
|
||||
mock_provider.capabilities = {"supports_random_access": False}
|
||||
|
||||
# Create a mock tar stream containing the file
|
||||
buf = io.BytesIO()
|
||||
|
||||
@@ -91,7 +91,7 @@ def test_metadata_update_on_change(db_session):
|
||||
|
||||
# Initial state
|
||||
f1 = models.FilesystemState(
|
||||
file_path="/data/up.txt", size=50, mtime=1, is_indexed=True, sha256_hash="old"
|
||||
file_path="/data/up.txt", size=50, mtime=1, sha256_hash="old"
|
||||
)
|
||||
db_session.add(f1)
|
||||
db_session.commit()
|
||||
@@ -103,7 +103,7 @@ def test_metadata_update_on_change(db_session):
|
||||
|
||||
db_session.refresh(f1)
|
||||
assert f1.size == 999
|
||||
assert f1.is_indexed is False # Should be reset for re-hashing
|
||||
assert f1.sha256_hash is None # Should be reset for re-hashing
|
||||
|
||||
|
||||
def test_scan_sources_mocked(db_session, mocker):
|
||||
@@ -141,7 +141,7 @@ def test_run_hashing_mocked(db_session, mocker):
|
||||
|
||||
# Setup unindexed file
|
||||
f = models.FilesystemState(
|
||||
file_path="/data/hash.me", size=10, mtime=1, is_indexed=False, is_ignored=False
|
||||
file_path="/data/hash.me", size=10, mtime=1, is_ignored=False
|
||||
)
|
||||
db_session.add(f)
|
||||
db_session.commit()
|
||||
@@ -154,5 +154,4 @@ def test_run_hashing_mocked(db_session, mocker):
|
||||
scanner.run_hashing()
|
||||
|
||||
db_session.refresh(f)
|
||||
assert f.is_indexed is True
|
||||
assert f.sha256_hash == "mocked_hash"
|
||||
|
||||
@@ -15,7 +15,11 @@ services:
|
||||
# devices:
|
||||
# - /dev/nst0:/dev/nst0
|
||||
environment:
|
||||
- PUID=1000
|
||||
- PGID=1000
|
||||
- PUID=0
|
||||
- PGID=0
|
||||
- RUN_AS_ROOT=true
|
||||
- TZ=UTC
|
||||
user: "0:0"
|
||||
cap_add:
|
||||
- SYS_RAWIO
|
||||
restart: unless-stopped
|
||||
|
||||
@@ -270,10 +270,7 @@ export type ItemMetadataSchema = {
|
||||
* Sha256 Hash
|
||||
*/
|
||||
sha256_hash?: string | null;
|
||||
/**
|
||||
* Is Indexed
|
||||
*/
|
||||
is_indexed?: boolean;
|
||||
|
||||
/**
|
||||
* Is Ignored
|
||||
*/
|
||||
|
||||
@@ -37,7 +37,7 @@ lint:
|
||||
# Run all backend tests
|
||||
pytest:
|
||||
@echo "Running backend tests..."
|
||||
cd backend && uv run pytest
|
||||
cd backend && COVERAGE_CORE=sysmon uv run pytest
|
||||
|
||||
# Run frontend checks
|
||||
check:
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
[Unit]
|
||||
Description=TapeHoard Backup Manager
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=root
|
||||
WorkingDirectory=/opt/tapehoard
|
||||
# We use root to ensure direct /dev/nst0 and raw SCSI access
|
||||
ExecStart=/usr/bin/just dev
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
# Give the tape drive time to rewind on shutdown
|
||||
TimeoutStopSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||