snapshot: Ensure all snapshot files are durable (#4891)

When creating or compressing a snapshot, `fsync` all files and
directories, so as to ensure that the snapshot is durable on the local
disk.

This obviously amounts to a large number of `fsync` calls, which may
negatively impact performance of taking a snapshot -- since we hold a
transaction lock while taking a snapshot, this is not to be taken
lightly.

# Expected complexity level and risk

3 -- performance impact

# Testing

I haven't quantified the performance impact.
This commit is contained in:
Kim Altintop
2026-04-30 08:25:08 +02:00
committed by GitHub
parent f22f80060f
commit 818e9b271e
5 changed files with 157 additions and 36 deletions
+20 -16
View File
@@ -144,6 +144,7 @@ impl SnapshotWorker {
/// Histograms tracking the time spent in the phases of taking a snapshot.
struct SnapshotMetrics {
    // Total time of a snapshot pass, including scheduling overhead.
    snapshot_timing_total: Histogram,
    // Time spent in `Locking::take_snapshot_internal` (the locked section).
    snapshot_timing_inner: Histogram,
    // Time spent in `UnflushedSnapshot::sync_all`, fsync'ing the snapshot.
    snapshot_timing_fsync: Histogram,
}
impl SnapshotMetrics {
@@ -151,6 +152,7 @@ impl SnapshotMetrics {
Self {
snapshot_timing_total: WORKER_METRICS.snapshot_creation_time_total.with_label_values(&db),
snapshot_timing_inner: WORKER_METRICS.snapshot_creation_time_inner.with_label_values(&db),
snapshot_timing_fsync: WORKER_METRICS.snapshot_creation_time_fsync.with_label_values(&db),
}
}
}
@@ -221,27 +223,29 @@ impl SnapshotWorkerActor {
let database_identity = self.snapshot_repo.database_identity();
let maybe_offset = asyncify(move || {
let maybe_snapshot = asyncify(move || {
let _timer = inner_timer.start_timer();
Locking::take_snapshot_internal(&state, &snapshot_repo)
})
.await
.with_context(|| format!("error capturing snapshot of database {}", database_identity))?;
maybe_offset
.map(|(offset, _path)| offset)
.inspect(|snapshot_offset| {
let elapsed = Duration::from_secs_f64(timer.stop_and_record());
info!(
"Captured snapshot of database {} at TX offset {} in {:?}",
database_identity, snapshot_offset, elapsed,
);
})
.with_context(|| {
format!(
"refusing to take snapshot of database {} at TX offset -1",
database_identity
)
})
let (snapshot_offset, unflushed_snapshot) = maybe_snapshot.with_context(|| {
format!(
"refusing to take snapshot of database {} at TX offset -1",
database_identity
)
})?;
self.metrics
.snapshot_timing_fsync
.observe_closure_duration(|| unflushed_snapshot.sync_all())?;
let elapsed = Duration::from_secs_f64(timer.stop_and_record());
info!(
"Captured snapshot of database {} at TX offset {} in {:?}",
database_identity, snapshot_offset, elapsed,
);
Ok(snapshot_offset)
}
async fn maybe_compress_snapshots(&mut self, latest_snapshot: TxOffset) {
+6
View File
@@ -434,6 +434,12 @@ metrics_group!(
#[buckets(0.0005, 0.001, 0.005, 0.01, 0.1, 1.0, 5.0, 10.0)]
pub snapshot_creation_time_inner: HistogramVec,
#[name = spacetime_snapshot_creation_time_fsync_sec]
#[help = "The time (in seconds) it took to fsync a database snapshot, excluding scheduling overhead"]
#[labels(db: Identity)]
#[buckets(0.0005, 0.001, 0.005, 0.01, 0.1, 1.0, 5.0, 10.0)]
pub snapshot_creation_time_fsync: HistogramVec,
#[name = spacetime_snapshot_compression_time_total_sec]
#[help = "The time (in seconds) it took to do a compression pass on the snapshot repository, including scheduling overhead"]
#[labels(db: Identity)]
@@ -39,7 +39,7 @@ use spacetimedb_schema::{
reducer_name::ReducerName,
schema::{ColumnSchema, IndexSchema, SequenceSchema, TableSchema},
};
use spacetimedb_snapshot::{ReconstructedSnapshot, SnapshotRepository};
use spacetimedb_snapshot::{ReconstructedSnapshot, SnapshotRepository, UnflushedSnapshot};
use spacetimedb_table::{
indexes::RowPointer,
page_pool::PagePool,
@@ -228,8 +228,10 @@ impl Locking {
/// Returns an error if [`SnapshotRepository::create_snapshot`] returns an
/// error.
pub fn take_snapshot(&self, repo: &SnapshotRepository) -> Result<Option<SnapshotDirPath>> {
let maybe_offset_and_path = Self::take_snapshot_internal(&self.committed_state, repo)?;
Ok(maybe_offset_and_path.map(|(_, path)| path))
Self::take_snapshot_internal(&self.committed_state, repo)?
.map(|(_offset, snap)| snap.sync_all())
.transpose()
.map_err(Into::into)
}
pub fn assert_system_tables_match(&self) -> Result<()> {
@@ -240,7 +242,7 @@ impl Locking {
pub fn take_snapshot_internal(
committed_state: &RwLock<CommittedState>,
repo: &SnapshotRepository,
) -> Result<Option<(TxOffset, SnapshotDirPath)>> {
) -> Result<Option<(TxOffset, UnflushedSnapshot)>> {
let mut committed_state = committed_state.write();
let Some(tx_offset) = committed_state.next_tx_offset.checked_sub(1) else {
return Ok(None);
@@ -253,9 +255,9 @@ impl Locking {
);
let (tables, blob_store) = committed_state.persistent_tables_and_blob_store();
let snapshot_dir = repo.create_snapshot(tables, blob_store, tx_offset)?;
let unflushed_snapshot = repo.create_snapshot(tables, blob_store, tx_offset)?;
Ok(Some((tx_offset, snapshot_dir)))
Ok(Some((tx_offset, unflushed_snapshot)))
}
/// Returns a list over all the currently connected clients,
+115 -10
View File
@@ -23,6 +23,7 @@
#![allow(clippy::result_large_err)]
use log::warn;
use spacetimedb_data_structures::map::{HashCollectionExt as _, HashMap};
use spacetimedb_durability::TxOffset;
use spacetimedb_fs_utils::compression::{
@@ -43,8 +44,10 @@ use spacetimedb_table::{
page_pool::PagePool,
table::Table,
};
use std::fs;
use std::fs::{self, File};
use std::io;
use std::ops::RangeBounds;
use std::path::Path;
use std::time::{Duration, Instant};
use std::{
collections::BTreeMap,
@@ -173,6 +176,91 @@ struct TableEntry {
pages: Vec<blake3::Hash>,
}
/// A non-durable snapshot created via [SnapshotRepository::create_snapshot].
///
/// When [SnapshotRepository::create_snapshot] returns, all objects will have
/// been written to the underlying object repository, but not `fsync`'ed.
///
/// Because this means that the snapshot may be incomplete, the [Snapshot] file
/// will _not_ have been written, and the snapshot remains locked (via a [Lockfile]).
///
/// To turn an [UnflushedSnapshot] into a durable snapshot, call
/// [UnflushedSnapshot::sync_all]. This will:
///
/// - sync all objects the snapshot references
/// - sync the object repository root
/// - write and sync the snapshot file
/// - drop the lock file
///
/// This ensures that the snapshot file is present only if all objects are
/// present and durable, and that the snapshot is considered invalid otherwise.
///
/// If [UnflushedSnapshot] is dropped without calling `sync_all`, the [Drop]
/// impl will attempt to call `sync_all` and log any errors.
///
/// This two-stage snapshot creation exists in order to not introduce additional
/// latency while the datastore is locked for snapshotting.
#[must_use = "snapshots are not durable until `sync_all` is called"]
pub struct UnflushedSnapshot {
    // `Some` until `sync_all` consumes the inner state; `None` afterwards,
    // which lets the `Drop` impl detect whether syncing already happened.
    inner: Option<UnflushedSnapshotInner>,
}

impl UnflushedSnapshot {
    /// Sync all objects in the snapshot and write out the snapshot file.
    ///
    /// Returns the [SnapshotDirPath] on success.
    pub fn sync_all(mut self) -> Result<SnapshotDirPath, SnapshotError> {
        // `inner` only becomes `None` in `sync_all` itself or in `Drop`,
        // and `sync_all` consumes `self`, so it is still `Some` here.
        self.inner
            .take()
            .expect("UnflushedSnapshot::inner is Some until sync_all or drop")
            .sync_all()
    }
}
impl Drop for UnflushedSnapshot {
    fn drop(&mut self) {
        // Best effort: if the snapshot was dropped without calling
        // `sync_all`, still attempt to make it durable. Errors can only be
        // logged here, since `drop` has no way to propagate them.
        if let Some(inner) = self.inner.take()
            && let Err(e) = inner.sync_all()
        {
            warn!("failed to sync unflushed snapshot dropped without syncing: {e}");
        }
    }
}
/// State required to finalize a not-yet-durable snapshot.
struct UnflushedSnapshotInner {
    // In-memory snapshot metadata; written to disk by `sync_all`.
    snapshot: Snapshot,
    // Directory the snapshot is being created in; returned on success.
    snapshot_dir: SnapshotDirPath,
    // Repository handle used to write the snapshot file.
    snapshot_repo: SnapshotRepository,
    // Object repository holding the snapshot's objects.
    object_repo: DirTrie,
    // Held until `sync_all` completes, keeping the snapshot locked.
    lockfile: Lockfile,
}
impl UnflushedSnapshotInner {
    /// Make the snapshot durable.
    ///
    /// Ordering matters here: the snapshot file is written only after all
    /// objects (and their directories) are synced, so that the presence of
    /// the snapshot file implies that every referenced object is durable.
    fn sync_all(self) -> Result<SnapshotDirPath, SnapshotError> {
        // Open `path` and `fsync` it, attaching the path to any error for
        // easier diagnosis.
        fn fsync(path: &Path) -> io::Result<()> {
            File::open(path)
                .and_then(|fd| fd.sync_all())
                .map_err(|e| io::Error::new(e.kind(), format!("failed to fsync {}: {}", path.display(), e)))
        }
        // Sync all objects and their parent directories.
        // The paths yielded by the [Snapshot::files] iterator are constructed
        // by [DirTrie::file_path], which creates a path with a parent.
        // `parent()` is thus known to succeed.
        for (_, path) in self.snapshot.files(&self.object_repo) {
            fsync(&path)?;
            fsync(path.parent().unwrap())?;
        }
        // Sync the root directory of the object repo
        fsync(self.object_repo.root())?;
        // Write out the snapshot file (syncs internally).
        self.snapshot_repo
            .write_snapshot_file(&self.snapshot_dir, self.snapshot)?;
        // We can now drop the lockfile.
        drop(self.lockfile);
        Ok(self.snapshot_dir)
    }
}
#[derive(Clone, Serialize, Deserialize)]
pub struct Snapshot {
/// A magic number: must be equal to [`MAGIC`].
@@ -650,15 +738,19 @@ impl SnapshotRepository {
/// where `tables` is the committed state of all the tables in the database,
/// and `blobs` is the committed state's blob store.
///
/// Returns the path of the newly-created snapshot directory.
/// The returned [UnflushedSnapshot] is **not** durable -- call
/// [UnflushedSnapshot::sync_all] to finalize it (see the struct docs for
/// more details).
///
/// **NOTE**: The current snapshot is uncompressed to avoid the potential slowdown.
/// Also note that the snapshot remains locked for as long as [UnflushedSnapshot]
/// is alive. It will not appear in [Self::all_snapshots] nor can it be
/// modified by methods in [SnapshotRepository].
pub fn create_snapshot<'db>(
&self,
tables: impl Iterator<Item = &'db mut Table>,
blobs: &'db dyn BlobStore,
tx_offset: TxOffset,
) -> Result<SnapshotDirPath, SnapshotError> {
) -> Result<UnflushedSnapshot, SnapshotError> {
// Invalidate equal to or newer than `tx_offset`.
//
// This is because snapshots don't currently track the epoch in which
@@ -692,7 +784,7 @@ impl SnapshotRepository {
// Before performing any observable operations,
// acquire a lockfile on the snapshot you want to create.
// Because we could be compressing the snapshot.
let _lock = Lockfile::for_file(&snapshot_dir)?;
let lockfile = Lockfile::for_file(&snapshot_dir)?;
// Create the snapshot directory.
snapshot_dir.create()?;
@@ -708,8 +800,6 @@ impl SnapshotRepository {
snapshot.write_all_blobs(&object_repo, blobs, prev_snapshot.as_ref(), &mut counter)?;
snapshot.write_all_tables(&object_repo, tables, prev_snapshot.as_ref(), &mut counter)?;
self.write_snapshot_file(&snapshot_dir, snapshot)?;
log::info!(
"[{}] SNAPSHOT {:0>20}: Hardlinked {} objects and wrote {} objects",
self.database_identity,
@@ -718,9 +808,15 @@ impl SnapshotRepository {
counter.objects_written,
);
// Success! return the directory of the newly-created snapshot.
// The lockfile will be dropped here.
Ok(snapshot_dir)
Ok(UnflushedSnapshot {
inner: Some(UnflushedSnapshotInner {
snapshot,
snapshot_dir,
snapshot_repo: self.clone(),
object_repo,
lockfile,
}),
})
}
/// Write the on-disk snapshot file containing the BSATN-encoded `snapshot`
@@ -744,6 +840,12 @@ impl SnapshotRepository {
snapshot_file.write_all(hash.as_bytes())?;
snapshot_file.write_all(&snapshot_bsatn)?;
snapshot_file.flush()?;
// fsync file + enclosing directory.
snapshot_file
.into_inner()
.expect("buffered writer just flushed")
.sync_all()?;
File::open(&snapshot_dir.0)?.sync_all()?;
}
Ok(())
@@ -1102,6 +1204,7 @@ impl SnapshotRepository {
if old_file.is_compressed() {
std::fs::hard_link(old_path, src.with_extension("_tmp"))?;
std::fs::rename(src.with_extension("_tmp"), src)?;
File::open(src.parent().unwrap())?.sync_all()?;
if let Some(stats) = stats {
stats.hardlinked += 1;
}
@@ -1134,6 +1237,7 @@ impl SnapshotRepository {
log::error!("Failed to compress object file {path:?}: {err}");
})?;
}
File::open(dir.root())?.sync_all()?;
// Compress the snapshot file last,
// which marks the whole snapshot as compressed.
@@ -1142,6 +1246,7 @@ impl SnapshotRepository {
compress(&old, &snapshot_file.0, None, None).inspect_err(|err| {
log::error!("Failed to compress snapshot file {snapshot_file:?}: {err}");
})?;
File::open(&snapshot_dir.0)?.sync_all()?;
log::info!(
"Compressed snapshot {snapshot_dir:?} of replica {}: {compress_type:?}",
+8 -4
View File
@@ -727,10 +727,13 @@ where
Some(file_path) => {
let dir = file_path.parent().expect("file not in a directory").to_owned();
fs::create_dir_all(&dir).await?;
let (tmp_file, tmp_out) = spawn_blocking(move || {
let tmp = NamedTempFile::new_in(dir)?;
let out = tmp.reopen()?;
Ok::<_, io::Error>((tmp, out))
let (tmp_file, tmp_out) = spawn_blocking({
let dir = dir.clone();
move || {
let tmp = NamedTempFile::new_in(dir)?;
let out = tmp.reopen()?;
Ok::<_, io::Error>((tmp, out))
}
})
.await
.unwrap()?;
@@ -743,6 +746,7 @@ where
.await
.unwrap()
.map_err(|e| e.error)?;
fs::File::open(dir).await?.sync_all().await?;
}
None => {