mirror of
https://github.com/clockworklabs/SpacetimeDB.git
synced 2026-05-06 07:26:43 -04:00
smoketests: Smoketest enable replication for existing database (#3138)
Adds a (basic) smoketest that shows that we can enable, disable and enable again replication on an existing, non-replicated database.
This commit is contained in:
@@ -32,6 +32,7 @@ use spacetimedb_data_structures::map::{HashCollectionExt as _, IntMap};
|
||||
use spacetimedb_datastore::execution_context::{ExecutionContext, ReducerContext, Workload, WorkloadType};
|
||||
use spacetimedb_datastore::locking_tx_datastore::MutTxId;
|
||||
use spacetimedb_datastore::traits::{IsolationLevel, Program, TxData};
|
||||
use spacetimedb_durability::DurableOffset;
|
||||
use spacetimedb_execution::pipelined::PipelinedProject;
|
||||
use spacetimedb_lib::db::raw_def::v9::Lifecycle;
|
||||
use spacetimedb_lib::identity::{AuthCtx, RequestId};
|
||||
@@ -1233,6 +1234,10 @@ impl ModuleHost {
|
||||
&self.replica_ctx().database
|
||||
}
|
||||
|
||||
pub fn durable_tx_offset(&self) -> Option<DurableOffset> {
|
||||
self.replica_ctx().relational_db.durable_tx_offset()
|
||||
}
|
||||
|
||||
pub(crate) fn replica_ctx(&self) -> &ReplicaContext {
|
||||
self.module.replica_ctx()
|
||||
}
|
||||
|
||||
@@ -216,7 +216,7 @@ class Smoketest(unittest.TestCase):
|
||||
logs = self.spacetime("logs", "--format=json", "-n", str(n), "--", self.database_identity)
|
||||
return list(map(json.loads, logs.splitlines()))
|
||||
|
||||
def publish_module(self, domain=None, *, clear=True, capture_stderr=True):
|
||||
def publish_module(self, domain=None, *, clear=True, capture_stderr=True, num_replicas=None):
|
||||
print("publishing module", self.publish_module)
|
||||
publish_output = self.spacetime(
|
||||
"publish",
|
||||
@@ -227,10 +227,11 @@ class Smoketest(unittest.TestCase):
|
||||
# because the server address is `node` which doesn't look like `localhost` or `127.0.0.1`
|
||||
# and so the publish step prompts for confirmation.
|
||||
"--yes",
|
||||
*["--num-replicas", f"{num_replicas}"] if num_replicas is not None else [],
|
||||
capture_stderr=capture_stderr,
|
||||
)
|
||||
self.resolved_identity = re.search(r"identity: ([0-9a-fA-F]+)", publish_output)[1]
|
||||
self.database_identity = domain if domain is not None else self.resolved_identity
|
||||
self.database_identity = self.resolved_identity
|
||||
|
||||
@classmethod
|
||||
def reset_config(cls):
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
from .. import COMPOSE_FILE, Smoketest, requires_docker, spacetime, parse_sql_result
|
||||
from ..docker import DockerManager
|
||||
|
||||
import time
|
||||
from typing import Callable
|
||||
import unittest
|
||||
from typing import Callable
|
||||
import json
|
||||
|
||||
from .. import COMPOSE_FILE, Smoketest, random_string, requires_docker, spacetime, parse_sql_result
|
||||
from ..docker import DockerManager
|
||||
|
||||
def retry(func: Callable, max_retries: int = 3, retry_delay: int = 2):
|
||||
"""Retry a function on failure with delay."""
|
||||
@@ -113,6 +114,18 @@ where replication_state.database_id={database_id} \
|
||||
# TODO: Replace with confirmed read.
|
||||
time.sleep(0.6)
|
||||
|
||||
def wait_counter_value(self, id, value, max_attempts=10, delay=1):
|
||||
"""Wait for the value for `id` in the counter table to reach `value`"""
|
||||
|
||||
for _ in range(max_attempts):
|
||||
rows = self.sql(f"select * from counter where id={id}")
|
||||
if len(rows) >= 1 and int(rows[0]['value']) >= value:
|
||||
return
|
||||
else:
|
||||
time.sleep(delay)
|
||||
|
||||
raise ValueError(f"Counter {id} below {value}")
|
||||
|
||||
|
||||
def fail_leader(self, action='kill'):
|
||||
"""Force leader failure through either killing or network disconnect."""
|
||||
@@ -240,6 +253,9 @@ fn send_message(ctx: &ReducerContext, text: String) {
|
||||
def collect_counter_rows(self):
|
||||
return int_vals(self.cluster.sql("select * from counter"))
|
||||
|
||||
def call_control(self, reducer, *args):
|
||||
self.spacetime("call", "spacetime-control", reducer, *map(json.dumps, args))
|
||||
|
||||
|
||||
class LeaderElection(ReplicationTest):
|
||||
def test_leader_election_in_loop(self):
|
||||
@@ -393,3 +409,79 @@ class QuorumLoss(ReplicationTest):
|
||||
with self.assertRaises(Exception):
|
||||
for i in range(1001):
|
||||
self.call("send_message", "terminal")
|
||||
|
||||
|
||||
class EnableReplication(ReplicationTest):
|
||||
AUTOPUBLISH = False
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self.expected_counter_rows = []
|
||||
|
||||
def run_counter(self, id, n = 100):
|
||||
self.start(id, n)
|
||||
self.cluster.wait_counter_value(id, n)
|
||||
self.expected_counter_rows.append({"id": id, "value": n})
|
||||
self.assertEqual(self.collect_counter_rows(), self.expected_counter_rows)
|
||||
|
||||
def test_enable_replication(self):
|
||||
"""Tests enabling and disabling replication"""
|
||||
|
||||
self.add_me_as_admin()
|
||||
name = random_string()
|
||||
n = 100
|
||||
|
||||
self.publish_module(name, num_replicas = 1)
|
||||
self.cluster.wait_for_leader_change(None)
|
||||
|
||||
# start un-replicated
|
||||
self.run_counter(1, n)
|
||||
# enable replication
|
||||
self.call_control("enable_replication", {"Name": name}, 3)
|
||||
self.run_counter(2, n)
|
||||
# disable replication
|
||||
self.call_control("disable_replication", {"Name": name })
|
||||
self.run_counter(3, n)
|
||||
# enable it one more time
|
||||
self.call_control("enable_replication", {"Name": name}, 3)
|
||||
self.run_counter(4, n)
|
||||
|
||||
|
||||
class EnableReplicationSuspended(ReplicationTest):
|
||||
AUTOPUBLISH = False
|
||||
|
||||
def test_enable_replication_on_suspended_database(self):
|
||||
"""Tests that we can enable replication on a suspended database"""
|
||||
|
||||
self.add_me_as_admin()
|
||||
name = random_string()
|
||||
|
||||
self.publish_module(name, num_replicas = 1)
|
||||
self.cluster.wait_for_leader_change(None)
|
||||
self.cluster.ensure_leader_health(1)
|
||||
|
||||
id = self.cluster.get_db_id()
|
||||
|
||||
self.call_control("suspend_database", {"Name": name})
|
||||
# Database is now unreachable.
|
||||
with self.assertRaises(Exception):
|
||||
self.call("send_message", "hi")
|
||||
|
||||
self.call_control("enable_replication", {"Name": name}, 3)
|
||||
# Still unreachable until we call unsuspend.
|
||||
with self.assertRaises(Exception):
|
||||
self.call("send_message", "hi")
|
||||
|
||||
self.call_control("unsuspend_database", {"Name": name})
|
||||
self.cluster.wait_for_leader_change(None)
|
||||
self.cluster.ensure_leader_health(2)
|
||||
|
||||
# We can't direcly observe that there are indeed three replicas running,
|
||||
# so as a sanity check inspect the event log.
|
||||
rows = self.cluster.read_controldb(
|
||||
f"select message from staged_enable_replication_event where database_id={id}")
|
||||
self.assertEqual(rows, [
|
||||
{'message': '"bootstrap requested"'},
|
||||
{'message': '"bootstrap complete"'},
|
||||
])
|
||||
|
||||
Reference in New Issue
Block a user