smoketests: Smoketest enable replication for existing database (#3138)

Adds a (basic) smoketest showing that we can enable, disable, and then
re-enable replication on an existing, non-replicated database.
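
In outline, the flow the test exercises (a condensed sketch; `publish_module`, `call_control`, and the reducer names are exactly those in the diff below):

    # inside an EnableReplication-style test method:
    name = random_string()
    self.publish_module(name, num_replicas=1)                    # starts out un-replicated
    self.call_control("enable_replication", {"Name": name}, 3)   # scale up to three replicas
    self.call_control("disable_replication", {"Name": name})     # back to a single replica
    self.call_control("enable_replication", {"Name": name}, 3)   # and enable once more

Between steps, the test bumps a row in the module's `counter` table and asserts that all previously written rows are still visible.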
Kim Altintop
2025-09-18 09:21:22 +02:00
committed by GitHub
parent aa1e732f77
commit 321e4302ef
3 changed files with 104 additions and 6 deletions
+5
@@ -32,6 +32,7 @@ use spacetimedb_data_structures::map::{HashCollectionExt as _, IntMap};
 use spacetimedb_datastore::execution_context::{ExecutionContext, ReducerContext, Workload, WorkloadType};
 use spacetimedb_datastore::locking_tx_datastore::MutTxId;
 use spacetimedb_datastore::traits::{IsolationLevel, Program, TxData};
+use spacetimedb_durability::DurableOffset;
 use spacetimedb_execution::pipelined::PipelinedProject;
 use spacetimedb_lib::db::raw_def::v9::Lifecycle;
 use spacetimedb_lib::identity::{AuthCtx, RequestId};
@@ -1233,6 +1234,10 @@ impl ModuleHost {
         &self.replica_ctx().database
     }
 
+    pub fn durable_tx_offset(&self) -> Option<DurableOffset> {
+        self.replica_ctx().relational_db.durable_tx_offset()
+    }
+
     pub(crate) fn replica_ctx(&self) -> &ReplicaContext {
         self.module.replica_ctx()
     }
+3 -2
@@ -216,7 +216,7 @@ class Smoketest(unittest.TestCase):
         logs = self.spacetime("logs", "--format=json", "-n", str(n), "--", self.database_identity)
         return list(map(json.loads, logs.splitlines()))
 
-    def publish_module(self, domain=None, *, clear=True, capture_stderr=True):
+    def publish_module(self, domain=None, *, clear=True, capture_stderr=True, num_replicas=None):
         print("publishing module", self.publish_module)
         publish_output = self.spacetime(
             "publish",
@@ -227,10 +227,11 @@ class Smoketest(unittest.TestCase):
             # because the server address is `node` which doesn't look like `localhost` or `127.0.0.1`
             # and so the publish step prompts for confirmation.
             "--yes",
+            *["--num-replicas", f"{num_replicas}"] if num_replicas is not None else [],
             capture_stderr=capture_stderr,
         )
         self.resolved_identity = re.search(r"identity: ([0-9a-fA-F]+)", publish_output)[1]
-        self.database_identity = domain if domain is not None else self.resolved_identity
+        self.database_identity = self.resolved_identity
 
     @classmethod
     def reset_config(cls):
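
For reference, the new keyword threads straight through to the CLI: when `num_replicas` is set, the argument list above splices in `--num-replicas <n>`. A hypothetical call like

    self.publish_module(random_string(), num_replicas=1)

thus ends up running `spacetime publish <name> ... --yes --num-replicas 1`.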
+96 -4
@@ -1,9 +1,10 @@
-from .. import COMPOSE_FILE, Smoketest, requires_docker, spacetime, parse_sql_result
-from ..docker import DockerManager
 import time
-from typing import Callable
 import unittest
+from typing import Callable
+import json
+from .. import COMPOSE_FILE, Smoketest, random_string, requires_docker, spacetime, parse_sql_result
+from ..docker import DockerManager
 
 
 def retry(func: Callable, max_retries: int = 3, retry_delay: int = 2):
     """Retry a function on failure with delay."""
@@ -113,6 +114,18 @@ where replication_state.database_id={database_id} \
         # TODO: Replace with confirmed read.
         time.sleep(0.6)
 
+    def wait_counter_value(self, id, value, max_attempts=10, delay=1):
+        """Wait for the value for `id` in the counter table to reach `value`"""
+        for _ in range(max_attempts):
+            rows = self.sql(f"select * from counter where id={id}")
+            if len(rows) >= 1 and int(rows[0]['value']) >= value:
+                return
+            else:
+                time.sleep(delay)
+        raise ValueError(f"Counter {id} below {value}")
+
     def fail_leader(self, action='kill'):
         """Force leader failure through either killing or network disconnect."""
@@ -240,6 +253,9 @@ fn send_message(ctx: &ReducerContext, text: String) {
     def collect_counter_rows(self):
         return int_vals(self.cluster.sql("select * from counter"))
 
+    def call_control(self, reducer, *args):
+        self.spacetime("call", "spacetime-control", reducer, *map(json.dumps, args))
+
 
 class LeaderElection(ReplicationTest):
     def test_leader_election_in_loop(self):
@@ -393,3 +409,79 @@ class QuorumLoss(ReplicationTest):
         with self.assertRaises(Exception):
             for i in range(1001):
                 self.call("send_message", "terminal")
+
+
+class EnableReplication(ReplicationTest):
+    AUTOPUBLISH = False
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.expected_counter_rows = []
+
+    def run_counter(self, id, n=100):
+        self.start(id, n)
+        self.cluster.wait_counter_value(id, n)
+        self.expected_counter_rows.append({"id": id, "value": n})
+        self.assertEqual(self.collect_counter_rows(), self.expected_counter_rows)
+
+    def test_enable_replication(self):
+        """Tests enabling, disabling and re-enabling replication"""
+        self.add_me_as_admin()
+        name = random_string()
+        n = 100
+        self.publish_module(name, num_replicas=1)
+        self.cluster.wait_for_leader_change(None)
+        # start un-replicated
+        self.run_counter(1, n)
+        # enable replication
+        self.call_control("enable_replication", {"Name": name}, 3)
+        self.run_counter(2, n)
+        # disable replication
+        self.call_control("disable_replication", {"Name": name})
+        self.run_counter(3, n)
+        # enable it one more time
+        self.call_control("enable_replication", {"Name": name}, 3)
+        self.run_counter(4, n)
+
+
+class EnableReplicationSuspended(ReplicationTest):
+    AUTOPUBLISH = False
+
+    def test_enable_replication_on_suspended_database(self):
+        """Tests that we can enable replication on a suspended database"""
+        self.add_me_as_admin()
+        name = random_string()
+        self.publish_module(name, num_replicas=1)
+        self.cluster.wait_for_leader_change(None)
+        self.cluster.ensure_leader_health(1)
+        id = self.cluster.get_db_id()
+        self.call_control("suspend_database", {"Name": name})
+        # Database is now unreachable.
+        with self.assertRaises(Exception):
+            self.call("send_message", "hi")
+        self.call_control("enable_replication", {"Name": name}, 3)
+        # Still unreachable until we call unsuspend.
+        with self.assertRaises(Exception):
+            self.call("send_message", "hi")
+        self.call_control("unsuspend_database", {"Name": name})
+        self.cluster.wait_for_leader_change(None)
+        self.cluster.ensure_leader_health(2)
+        # We can't directly observe that there are indeed three replicas running,
+        # so as a sanity check inspect the event log.
+        rows = self.cluster.read_controldb(
+            f"select message from staged_enable_replication_event where database_id={id}")
+        self.assertEqual(rows, [
+            {'message': '"bootstrap requested"'},
+            {'message': '"bootstrap complete"'},
+        ])