Migrate the remaining cluster tests to the new framework and remove legacy files (#2297) (#3382)

Migrated the remaining cluster tests to tests/unit/cluster/ so that all
cluster tests use the same framework. Cleaned up the obsolete cluster test
framework files and updated the CI workflows to use the new unified test
runner.
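
For illustration, tests under the new framework follow the start_cluster
pattern used throughout tests/unit/cluster/; a minimal sketch (the node
counts and test body below are placeholders, not code from this commit):

    # start_cluster spawns the requested number of primaries and replicas,
    # runs the body against them, and tears the cluster down afterwards.
    start_cluster 3 3 {tags {external:skip cluster}} {
        test "Cluster is up" {
            wait_for_cluster_state ok
        }
    }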

Changes:
  Moved and mapped 6 test files:
  - 03-failover-loop.tcl → Merged into existing failover.tcl
  - 04-resharding.tcl → resharding.tcl
  - 12-replica-migration-2.tcl + 12.1-replica-migration-3.tcl →
  replica-migration-slow.tcl
  - 07-replica-migration.tcl → Merged into existing replica-migration.tcl
  - 28-cluster-shards.tcl → Merged into existing cluster-shards.tcl

Other changes:
  - Converted old framework APIs (e.g., K, RI) to new framework APIs (e.g., R,
  srv); see the conversion sketch after this list
  - Added a process_is_alive check in cluster_util.tcl to fix an exception in
  failover tests caused by running ps against dead processes (a sketch of the
  helper follows this list)
  - Heavy tests (resharding, replica-migration-slow) are marked with the slow
  tag and wrapped in run_solo to prevent resource contention in sanitizer
  environments (see replica-migration-slow.tcl below)
  - replica-migration-slow is also marked with the valgrind:skip tag since it
  is very slow
  - Removed the entire tests/cluster/ directory including run.tcl, cluster.tcl,
  includes/, and helpers/
  - Kept runtest-cluster as a wrapper script (exec ./runtest --cluster "$@")
  - Removed ./runtest-cluster calls from .github/workflows/daily.yml as cluster
  tests are now included in ./runtest
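
The API conversion is largely mechanical. A sketch of the typical mappings,
taken from the migrated failover test below ($tokill is assumed to hold a
node index, so this is a fragment rather than a standalone script):

    # Old framework (tests/cluster):
    #   set role [RI $tokill role]
    #   set port [get_instance_attrib valkey 0 port]
    #   kill_instance valkey $tokill
    #   restart_instance valkey $tokill
    # New framework (tests/unit/cluster):
    set role [s [expr -1*$tokill] role]          ;# INFO fields via negative srv offsets
    set port [srv 0 port]                        ;# instance attributes via srv
    catch {R $tokill shutdown nosave}            ;# terminate a node
    restart_server [expr -1*$tokill] true false  ;# bring it back up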
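
The process_is_alive helper itself is not part of this diff; a plausible
minimal implementation, mirroring the process_is_running helper from the
deleted 04-resharding.tcl below (the actual helper in the shared test
utilities may additionally handle zombie processes):

    # Return 1 if the given PID refers to a live process, otherwise 0.
    # ps -p exits non-zero when the PID does not exist; catch maps that
    # exit status to 0/1, and ! inverts it.
    proc process_is_alive {pid} {
        expr {![catch {exec ps -p $pid}]}
    }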

Closes #2297.

Signed-off-by: Jun Yeong Kim <junyeonggim5@gmail.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Author: Jun Yeong Kim
Date: 2026-04-27 18:31:37 +09:00
Committed by: GitHub
Parent: 6dbb7f81a9
Commit: ff80b2d1dc
20 changed files with 724 additions and 1350 deletions
.github/workflows/daily.yml (-68)
@@ -125,9 +125,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -211,9 +208,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -276,9 +270,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -334,9 +325,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-ubuntu-no-malloc-usable-size:
runs-on: ubuntu-latest
if: |
@@ -384,9 +372,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-ubuntu-32bit:
runs-on: ubuntu-latest
if: |
@@ -457,9 +442,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -522,10 +504,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel --tls ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster --tls ${{github.event.inputs.cluster_test_args}}
test-ubuntu-tls-no-tls:
runs-on: ubuntu-latest
if: |
@@ -578,10 +556,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-ubuntu-io-threads:
runs-on: ubuntu-latest
if: |
@@ -623,9 +597,6 @@ jobs:
- name: test
if: true && !contains(github.event.inputs.skiptests, 'valkey')
run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}
test-ubuntu-tls-io-threads:
runs-on: ubuntu-latest
if: |
@@ -670,10 +641,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'valkey')
run: |
./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}}
test-ubuntu-reclaim-cache:
runs-on: ubuntu-latest
if: |
@@ -991,9 +958,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -1129,9 +1093,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -1263,9 +1224,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -1324,9 +1282,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-rpm-distros-jemalloc:
if: |
(github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1399,9 +1354,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-rpm-distros-tls-module:
if: |
(github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1479,10 +1431,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster --tls-module ${{github.event.inputs.cluster_test_args}}
test-rpm-distros-tls-module-no-tls:
if: |
(github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1555,10 +1503,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-macos-latest:
runs-on: macos-latest
if: |
@@ -1676,9 +1620,6 @@ jobs:
- run: cd libbacktrace && ./configure && make && sudo make install
- name: make
run: make SERVER_CFLAGS='-Werror' USE_LIBBACKTRACE=yes
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
build-old-macos-versions:
strategy:
fail-fast: false
@@ -1806,9 +1747,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-alpine-libc-malloc:
runs-on: ubuntu-latest
if: |
@@ -1859,9 +1797,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
reply-schemas-validator:
runs-on: ubuntu-latest
timeout-minutes: 1440
@@ -1909,9 +1844,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}}
- name: Install Python dependencies
uses: py-actions/py-dependency-install@30aa0023464ed4b5b116bd9fbdab87acf01a484e # v4.1.0
with:
runtest-cluster (+1 -13)
@@ -1,14 +1,2 @@
#!/bin/sh
TCL_VERSIONS="8.5 8.6 9.0"
TCLSH=""
for VERSION in $TCL_VERSIONS; do
TCL=`which tclsh$VERSION 2>/dev/null` && TCLSH=$TCL
done
if [ -z $TCLSH ]
then
echo "You need tcl 8.5 or newer in order to run the Valkey Cluster test"
exit 1
fi
$TCLSH tests/cluster/run.tcl $*
exec ./runtest --cluster "$@"
tests/cluster/cluster.tcl (-238)
@@ -1,238 +0,0 @@
# Cluster-specific test functions.
#
# Copyright (C) 2014 Redis Ltd.
# This software is released under the BSD License. See the COPYING file for
# more information.
# Track cluster configuration as created by create_cluster below
set ::cluster_master_nodes 0
set ::cluster_replica_nodes 0
# Returns a parsed CLUSTER NODES output as a list of dictionaries. Optional status field
# can be specified to only returns entries that match the provided status.
proc get_cluster_nodes {id {status "*"}} {
set lines [split [R $id cluster nodes] "\r\n"]
set nodes {}
foreach l $lines {
set l [string trim $l]
if {$l eq {}} continue
set args [split $l]
set node [dict create \
id [lindex $args 0] \
addr [lindex $args 1] \
flags [split [lindex $args 2] ,] \
slaveof [lindex $args 3] \
ping_sent [lindex $args 4] \
pong_recv [lindex $args 5] \
config_epoch [lindex $args 6] \
linkstate [lindex $args 7] \
slots [lrange $args 8 end] \
]
if {[string match $status [lindex $args 7]]} {
lappend nodes $node
}
}
return $nodes
}
# Test node for flag.
proc has_flag {node flag} {
expr {[lsearch -exact [dict get $node flags] $flag] != -1}
}
# Returns the parsed myself node entry as a dictionary.
proc get_myself id {
set nodes [get_cluster_nodes $id]
foreach n $nodes {
if {[has_flag $n myself]} {return $n}
}
return {}
}
# Get a specific node by ID by parsing the CLUSTER NODES output
# of the instance Number 'instance_id'
proc get_node_by_id {instance_id node_id} {
set nodes [get_cluster_nodes $instance_id]
foreach n $nodes {
if {[dict get $n id] eq $node_id} {return $n}
}
return {}
}
# Return the value of the specified CLUSTER INFO field.
proc CI {n field} {
get_info_field [R $n cluster info] $field
}
# Return the value of the specified INFO field.
proc s {n field} {
get_info_field [R $n info] $field
}
# Assuming nodes are reset, this function performs slots allocation.
# Only the first 'n' nodes are used.
proc cluster_allocate_slots {n} {
set slot 16383
while {$slot >= 0} {
# Allocate successive slots to random nodes.
set node [randomInt $n]
lappend slots_$node $slot
incr slot -1
}
for {set j 0} {$j < $n} {incr j} {
R $j cluster addslots {*}[set slots_${j}]
}
}
# Check that cluster nodes agree about "state", or raise an error.
proc assert_cluster_state {state} {
foreach_valkey_id id {
if {[instance_is_killed valkey $id]} continue
wait_for_condition 1000 50 {
[CI $id cluster_state] eq $state
} else {
fail "Cluster node $id cluster_state:[CI $id cluster_state]"
}
}
}
# Search the first node starting from ID $first that is not
# already configured as a slave.
proc cluster_find_available_slave {first} {
foreach_valkey_id id {
if {$id < $first} continue
if {[instance_is_killed valkey $id]} continue
set me [get_myself $id]
if {[dict get $me slaveof] eq {-}} {return $id}
}
fail "No available slaves"
}
# Add 'slaves' slaves to a cluster composed of 'masters' masters.
# It assumes that masters are allocated sequentially from instance ID 0
# to N-1.
proc cluster_allocate_slaves {masters slaves} {
for {set j 0} {$j < $slaves} {incr j} {
set master_id [expr {$j % $masters}]
set slave_id [cluster_find_available_slave $masters]
set master_myself [get_myself $master_id]
R $slave_id cluster replicate [dict get $master_myself id]
}
}
# Create a cluster composed of the specified number of masters and slaves.
proc create_cluster {masters slaves} {
cluster_allocate_slots $masters
if {$slaves} {
cluster_allocate_slaves $masters $slaves
}
assert_cluster_state ok
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $slaves
}
proc cluster_allocate_with_continuous_slots {n} {
set slot 16383
set avg [expr ($slot+1) / $n]
while {$slot >= 0} {
set node [expr $slot/$avg >= $n ? $n-1 : $slot/$avg]
lappend slots_$node $slot
incr slot -1
}
for {set j 0} {$j < $n} {incr j} {
R $j cluster addslots {*}[set slots_${j}]
}
}
# Create a cluster composed of the specified number of masters and slaves,
# but with a continuous slot range.
proc cluster_create_with_continuous_slots {masters slaves} {
cluster_allocate_with_continuous_slots $masters
if {$slaves} {
cluster_allocate_slaves $masters $slaves
}
assert_cluster_state ok
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $slaves
}
# Set the cluster node-timeout to all the reachalbe nodes.
proc set_cluster_node_timeout {to} {
foreach_valkey_id id {
catch {R $id CONFIG SET cluster-node-timeout $to}
}
}
# Check if the cluster is writable and readable. Use node "id"
# as a starting point to talk with the cluster.
proc cluster_write_test {id} {
set prefix [randstring 20 20 alpha]
set port [get_instance_attrib valkey $id port]
set cluster [valkey_cluster 127.0.0.1:$port]
for {set j 0} {$j < 100} {incr j} {
$cluster set key.$j $prefix.$j
}
for {set j 0} {$j < 100} {incr j} {
assert {[$cluster get key.$j] eq "$prefix.$j"}
}
$cluster close
}
# Check if cluster configuration is consistent.
# All the nodes in the cluster should show same slots configuration and have health
# state "online" to be considered as consistent.
proc cluster_config_consistent {} {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
# Check if all the nodes are online
set shards_cfg [R $j CLUSTER SHARDS]
foreach shard_cfg $shards_cfg {
set nodes [dict get $shard_cfg nodes]
foreach node $nodes {
if {[dict get $node health] ne "online"} {
return 0
}
}
}
if {$j == 0} {
set base_cfg [R $j cluster slots]
} else {
set cfg [R $j cluster slots]
if {$cfg != $base_cfg} {
return 0
}
}
}
return 1
}
# Wait for cluster configuration to propagate and be consistent across nodes.
proc wait_for_cluster_propagation {} {
wait_for_condition 1000 50 {
[cluster_config_consistent] eq 1
} else {
for {set j 0} {$j < [llength $::servers]} {incr j} {
puts "R $j cluster slots output: [R $j cluster slots]"
}
fail "cluster config did not reach a consistent state"
}
}
# Check if cluster's view of hostnames is consistent
proc are_hostnames_propagated {match_string} {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
set cfg [R $j cluster slots]
foreach node $cfg {
for {set i 2} {$i < [llength $node]} {incr i} {
if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } {
return 0
}
}
}
}
return 1
}
tests/cluster/run.tcl (-35)
@@ -1,35 +0,0 @@
# Cluster test suite. Copyright (C) 2014 Redis Ltd.
# This software is released under the BSD License. See the COPYING file for
# more information.
# Set the executable paths at project root
source tests/support/set_executable_path.tcl
cd tests/cluster
source cluster.tcl
source ../instances.tcl
source ../../support/cluster.tcl ; # Cluster client.
set ::instances_count 20 ; # How many instances we use at max.
set ::tlsdir "../../tls"
proc main {} {
parse_options
spawn_instance valkey $::valkey_base_port $::instances_count {
"cluster-enabled yes"
"appendonly yes"
"enable-protected-configs yes"
"enable-debug-command yes"
"save ''"
}
run_tests
cleanup
end_tests
}
if {[catch main e]} {
puts $::errorInfo
if {$::pause_on_error} pause_on_error
cleanup
exit 1
}
tests/cluster/tests/03-failover-loop.tcl (-117)
@@ -1,117 +0,0 @@
# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
source "../tests/includes/init-tests.tcl"
test "Create a 5 nodes cluster" {
create_cluster 5 5
}
test "Cluster is up" {
assert_cluster_state ok
}
set iterations 20
set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
while {[incr iterations -1]} {
set tokill [randomInt 10]
set other [expr {($tokill+1)%10}] ; # Some other instance.
set key [randstring 20 20 alpha]
set val [randstring 20 20 alpha]
set role [RI $tokill role]
if {$role eq {master}} {
set slave {}
set myid [dict get [get_myself $tokill] id]
foreach_valkey_id id {
if {$id == $tokill} continue
if {[dict get [get_myself $id] slaveof] eq $myid} {
set slave $id
}
}
if {$slave eq {}} {
fail "Unable to retrieve slave's ID for master #$tokill"
}
}
puts "--- Iteration $iterations ---"
if {$role eq {master}} {
test "Wait for slave of #$tokill to sync" {
wait_for_condition 1000 50 {
[string match {*state=online*} [RI $tokill slave0]]
} else {
fail "Slave of node #$tokill is not ok"
}
}
set slave_config_epoch [CI $slave cluster_my_epoch]
}
test "Cluster is writable before failover" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i $val:$i} err
assert {$err eq {OK}}
}
# Wait for the write to propagate to the slave if we
# are going to kill a master.
if {$role eq {master}} {
R $tokill wait 1 20000
}
}
test "Terminating node #$tokill" {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $tokill config set appendonly no
kill_instance valkey $tokill
}
if {$role eq {master}} {
test "Wait failover by #$slave with old epoch $slave_config_epoch" {
wait_for_condition 1000 50 {
[CI $slave cluster_my_epoch] > $slave_config_epoch
} else {
fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
}
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Cluster is writable again" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i:2 $val:$i:2} err
assert {$err eq {OK}}
}
}
test "Restarting node #$tokill" {
restart_instance valkey $tokill
}
test "Instance #$tokill is now a slave" {
wait_for_condition 1000 50 {
[RI $tokill role] eq {slave}
} else {
fail "Restarted instance is not a slave"
}
}
test "We can read back the value we set before" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster get $key:$i} err
assert {$err eq "$val:$i"}
catch {$cluster get $key:$i:2} err
assert {$err eq "$val:$i:2"}
}
}
}
test "Post condition: current_epoch >= my_epoch everywhere" {
foreach_valkey_id id {
assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
}
}
tests/cluster/tests/04-resharding.tcl (-196)
@@ -1,196 +0,0 @@
# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
source "../tests/includes/init-tests.tcl"
source "../../../tests/support/cli.tcl"
test "Create a 5 nodes cluster" {
create_cluster 5 5
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Enable AOF in all the instances" {
foreach_valkey_id id {
R $id config set appendonly yes
# We use "appendfsync no" because it's fast but also guarantees that
# write(2) is performed before replying to client.
R $id config set appendfsync no
}
foreach_valkey_id id {
wait_for_condition 1000 500 {
[RI $id aof_rewrite_in_progress] == 0 &&
[RI $id aof_enabled] == 1
} else {
fail "Failed to enable AOF on instance #$id"
}
}
}
# Return non-zero if the specified PID is about a process still in execution,
# otherwise 0 is returned.
proc process_is_running {pid} {
# PS should return with an error if PID is non existing,
# and catch will return non-zero. We want to return non-zero if
# the PID exists, so we invert the return value with expr not operator.
expr {![catch {exec ps -p $pid}]}
}
# Our resharding test performs the following actions:
#
# - N commands are sent to the cluster in the course of the test.
# - Every command selects a random key from key:0 to key:MAX-1.
# - The operation RPUSH key <randomvalue> is performed.
# - Tcl remembers into an array all the values pushed to each list.
# - After N/2 commands, the resharding process is started in background.
# - The test continues while the resharding is in progress.
# - At the end of the test, we wait for the resharding process to stop.
# - Finally the keys are checked to see if they contain the value they should.
set numkeys 50000
set numops 200000
set start_node_port [get_instance_attrib valkey 0 port]
set cluster [valkey_cluster 127.0.0.1:$start_node_port]
if {$::tls} {
# setup a non-TLS cluster client to the TLS cluster
set plaintext_port [get_instance_attrib valkey 0 plaintext-port]
set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
} else {
set cluster_plaintext $cluster
puts "Testing using non-TLS cluster"
}
catch {unset content}
array set content {}
set tribpid {}
test "Cluster consistency during live resharding" {
set ele 0
for {set j 0} {$j < $numops} {incr j} {
# Trigger the resharding once we execute half the ops.
if {$tribpid ne {} &&
($j % 10000) == 0 &&
![process_is_running $tribpid]} {
set tribpid {}
}
if {$j >= $numops/2 && $tribpid eq {}} {
puts -nonewline "...Starting resharding..."
flush stdout
set target [dict get [get_myself [randomInt 5]] id]
set tribpid [lindex [exec \
$::VALKEY_CLI_BIN --cluster reshard \
127.0.0.1:[get_instance_attrib valkey 0 port] \
--cluster-from all \
--cluster-to $target \
--cluster-slots 100 \
--cluster-yes \
{*}[valkeycli_tls_config "../../../tests"] \
| [info nameofexecutable] \
../tests/helpers/onlydots.tcl \
&] 0]
}
# Write random data to random list.
set listid [randomInt $numkeys]
set key "key:$listid"
incr ele
# We write both with Lua scripts and with plain commands.
# This way we are able to stress Lua -> server command invocation
# as well, that has tests to prevent Lua to write into wrong
# hash slots.
# We also use both TLS and plaintext connections.
if {$listid % 3 == 0} {
$cluster rpush $key $ele
} elseif {$listid % 3 == 1} {
$cluster_plaintext rpush $key $ele
} else {
$cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
}
lappend content($key) $ele
if {($j % 1000) == 0} {
puts -nonewline W; flush stdout
}
}
# Wait for the resharding process to end
wait_for_condition 1000 500 {
[process_is_running $tribpid] == 0
} else {
fail "Resharding is not terminating after some time."
}
wait_for_cluster_propagation
}
test "Verify $numkeys keys for consistency with logical content" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Terminate and restart all the instances" {
foreach_valkey_id id {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $id config set appendonly no
kill_instance valkey $id
restart_instance valkey $id
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Verify $numkeys keys after the restart" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Disable AOF in all the instances" {
foreach_valkey_id id {
R $id config set appendonly no
}
}
test "Verify slaves consistency" {
set verified_masters 0
foreach_valkey_id id {
set role [R $id role]
lassign $role myrole myoffset slaves
if {$myrole eq {slave}} continue
set masterport [get_instance_attrib valkey $id port]
set masterdigest [R $id debug digest]
foreach_valkey_id sid {
set srole [R $sid role]
if {[lindex $srole 0] eq {master}} continue
if {[lindex $srole 2] != $masterport} continue
wait_for_condition 1000 500 {
[R $sid debug digest] eq $masterdigest
} else {
fail "Master and slave data digest are different"
}
incr verified_masters
}
}
assert {$verified_masters >= 5}
}
test "Dump sanitization was skipped for migrations" {
set verified_masters 0
foreach_valkey_id id {
assert {[RI $id dump_payload_sanitizations] == 0}
}
}
tests/cluster/tests/07-replica-migration.tcl (-103)
@@ -1,103 +0,0 @@
# Replica migration test.
# Check that orphaned masters are joined by replicas of masters having
# multiple replicas attached, according to the migration barrier settings.
source "../tests/includes/init-tests.tcl"
# Create a cluster with 5 master and 10 slaves, so that we have 2
# slaves for each master.
test "Create a 5 nodes cluster" {
create_cluster 5 10
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Each master should have two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] == 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
test "Killing all the slaves of master #0 and #1" {
kill_instance valkey 5
kill_instance valkey 10
kill_instance valkey 6
kill_instance valkey 11
after 4000
}
foreach_valkey_id id {
if {$id < 5} {
test "Master #$id should have at least one replica" {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 1
} else {
fail "Master #$id has no replicas"
}
}
}
}
# Now test the migration to a master which used to be a slave, after
# a failver.
source "../tests/includes/init-tests.tcl"
# Create a cluster with 5 master and 10 slaves, so that we have 2
# slaves for each master.
test "Create a 5 nodes cluster" {
create_cluster 5 10
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Kill slave #7 of master #2. Only slave left is #12 now" {
kill_instance valkey 7
}
set current_epoch [CI 1 cluster_current_epoch]
test "Killing master node #2, #12 should failover" {
kill_instance valkey 2
}
test "Wait for failover" {
wait_for_condition 1000 50 {
[CI 1 cluster_current_epoch] > $current_epoch
} else {
fail "No failover detected"
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Cluster is writable" {
cluster_write_test 1
}
test "Instance 12 is now a master without slaves" {
assert {[RI 12 role] eq {master}}
}
# The remaining instance is now without slaves. Some other slave
# should migrate to it.
test "Master #12 should get at least one migrated replica" {
wait_for_condition 1000 50 {
[llength [lindex [R 12 role] 2]] >= 1
} else {
fail "Master #12 has no replicas"
}
}
tests/cluster/tests/12-replica-migration-2.tcl (-75)
@@ -1,75 +0,0 @@
# Replica migration test #2.
#
# Check that the status of master that can be targeted by replica migration
# is acquired again, after being getting slots again, in a cluster where the
# other masters have slaves.
source "../tests/includes/init-tests.tcl"
source "../../../tests/support/cli.tcl"
# Create a cluster with 5 master and 15 slaves, to make sure there are no
# empty masters and make rebalancing simpler to handle during the test.
test "Create a 5 nodes cluster" {
cluster_create_with_continuous_slots 5 15
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Each master should have at least two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
test "Set allow-replica-migration yes" {
foreach_valkey_id id {
R $id CONFIG SET cluster-allow-replica-migration yes
}
}
set master0_id [dict get [get_myself 0] id]
test "Resharding all the master #0 slots away from it" {
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=0 >@ stdout ]
}
test "Master #0 who lost all slots should turn into a replica without replicas" {
wait_for_condition 1000 50 {
[RI 0 role] == "slave" && [RI 0 connected_slaves] == 0
} else {
puts [R 0 info replication]
fail "Master #0 didn't turn itself into a replica"
}
}
test "Resharding back some slot to master #0" {
# Wait for the cluster config to propagate before attempting a
# new resharding.
after 10000
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=.01 \
--cluster-use-empty-masters >@ stdout]
}
test "Master #0 should re-acquire one or more replicas" {
wait_for_condition 1000 50 {
[llength [lindex [R 0 role] 2]] >= 1
} else {
fail "Master #0 has no has replicas"
}
}
tests/cluster/tests/12.1-replica-migration-3.tcl (-65)
@@ -1,65 +0,0 @@
# Replica migration test #2.
#
# Check that if 'cluster-allow-replica-migration' is set to 'no', slaves do not
# migrate when master becomes empty.
source "../tests/includes/init-tests.tcl"
source "../tests/includes/utils.tcl"
# Create a cluster with 5 master and 15 slaves, to make sure there are no
# empty masters and make rebalancing simpler to handle during the test.
test "Create a 5 nodes cluster" {
cluster_create_with_continuous_slots 5 15
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Each master should have at least two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
test "Set allow-replica-migration no" {
foreach_valkey_id id {
R $id CONFIG SET cluster-allow-replica-migration no
}
}
set master0_id [dict get [get_myself 0] id]
test "Resharding all the master #0 slots away from it" {
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=0 >@ stdout ]
}
test "Wait cluster to be stable" {
wait_cluster_stable
}
test "Master #0 still should have its replicas" {
assert { [llength [lindex [R 0 role] 2]] >= 2 }
}
test "Each master should have at least two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
tests/cluster/tests/28-cluster-shards.tcl (-312)
@@ -1,312 +0,0 @@
source "../tests/includes/init-tests.tcl"
# Initial slot distribution.
set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
set ::slot1 [list 5460 5460 5462 10922 10925 10925]
set ::slot2 [list 10923 10924 10927 16383]
set ::slot3 [list 1001 1001]
proc cluster_create_with_split_slots {masters replicas} {
for {set j 0} {$j < $masters} {incr j} {
R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
}
if {$replicas} {
cluster_allocate_slaves $masters $replicas
}
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $replicas
}
# Get the node info with the specific node_id from the
# given reference node. Valid type options are "node" and "shard"
proc get_node_info_from_shard {id reference {type node}} {
set shards_response [R $reference CLUSTER SHARDS]
foreach shard_response $shards_response {
set nodes [dict get $shard_response nodes]
foreach node $nodes {
if {[dict get $node id] eq $id} {
if {$type eq "node"} {
return $node
} elseif {$type eq "shard"} {
return $shard_response
} else {
return {}
}
}
}
}
# No shard found, return nothing
return {}
}
proc cluster_ensure_master {id} {
if { [regexp "master" [R $id role]] == 0 } {
assert_equal {OK} [R $id CLUSTER FAILOVER]
wait_for_condition 50 100 {
[regexp "master" [R $id role]] == 1
} else {
fail "instance $id is not master"
}
}
}
test "Create a 8 nodes cluster with 4 shards" {
cluster_create_with_split_slots 4 4
}
test "Cluster should start ok" {
assert_cluster_state ok
}
test "Set cluster hostnames and verify they are propagated" {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
R $j config set cluster-announce-hostname "host-$j.com"
}
# Wait for everyone to agree about the state
wait_for_cluster_propagation
}
test "Verify information about the shards" {
set ids {}
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
lappend ids [R $j CLUSTER MYID]
}
set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]
# Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent.
for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
# Default value of 'cluster-preferred-endpoint-type' is ip.
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]
if {$::tls} {
assert_equal [get_instance_attrib valkey $i plaintext-port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
} else {
assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
}
if {$i < 4} {
assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
} else {
assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
# Replica could be in online or loading
}
}
}
}
test "Verify no slot shard" {
# Node 8 has no slots assigned
set node_8_id [R 8 CLUSTER MYID]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
}
set node_0_id [R 0 CLUSTER MYID]
test "Kill a node and tell the replica to immediately takeover" {
kill_instance valkey 0
R 4 cluster failover force
}
# Primary 0 node should report as fail, wait until the new primary acknowledges it.
test "Verify health as fail for killed node" {
wait_for_condition 1000 50 {
"fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
} else {
fail "New primary never detected the node failed"
}
}
set primary_id 4
set replica_id 0
test "Restarting primary node" {
restart_instance valkey $replica_id
}
test "Instance #0 gets converted into a replica" {
wait_for_condition 1000 50 {
[RI $replica_id role] eq {slave}
} else {
fail "Old primary was not converted into replica"
}
}
test "Test the replica reports a loading state while it's loading" {
# Test the command is good for verifying everything moves to a happy state
set replica_cluster_id [R $replica_id CLUSTER MYID]
wait_for_condition 50 1000 {
[dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
} else {
fail "Replica never transitioned to online"
}
# Set 1 MB of data, so there is something to load on full sync
R $primary_id debug populate 1000 key 1000
# Kill replica client for primary and load new data to the primary
R $primary_id config set repl-backlog-size 100
# Set the key load delay so that it will take at least
# 2 seconds to fully load the data.
R $replica_id config set key-load-delay 4000
# Trigger event loop processing every 1024 bytes, this trigger
# allows us to send and receive cluster messages, so we are setting
# it low so that the cluster messages are sent more frequently.
R $replica_id config set loading-process-events-interval-bytes 1024
R $primary_id multi
R $primary_id client kill type replica
# populate the correct data
set num 100
set value [string repeat A 1024]
for {set j 0} {$j < $num} {incr j} {
# Use hashtag valid for shard #0
set key "{ch3}$j"
R $primary_id set $key $value
}
R $primary_id exec
# The replica should reconnect and start a full sync, it will gossip about it's health to the primary.
wait_for_condition 50 1000 {
"loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to loading"
}
# Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data.
R $replica_id CLUSTER SHARDS
R $replica_id CLUSTER SLOTS
# Speed up the key loading and verify everything resumes
R $replica_id config set key-load-delay 0
wait_for_condition 50 1000 {
"online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to online"
}
# Final sanity, the replica agrees it is online.
assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
}
test "Regression test for a crash when calling SHARDS during handshake" {
# Reset forget a node, so we can use it to establish handshaking connections
set id [R 19 CLUSTER MYID]
R 19 CLUSTER RESET HARD
for {set i 0} {$i < 19} {incr i} {
R $i CLUSTER FORGET $id
}
R 19 cluster meet 127.0.0.1 [get_instance_attrib valkey 0 port]
# This should line would previously crash, since all the outbound
# connections were in handshake state.
R 19 CLUSTER SHARDS
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Shard ids are unique" {
set shard_ids {}
for {set i 0} {$i < 4} {incr i} {
set shard_id [R $i cluster myshardid]
assert_equal [dict exists $shard_ids $shard_id] 0
dict set shard_ids $shard_id 1
}
}
test "CLUSTER MYSHARDID reports same id for both primary and replica" {
for {set i 0} {$i < 4} {incr i} {
assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
assert_equal [string length [R $i cluster myshardid]] 40
}
}
test "New replica receives primary's shard id" {
#find a primary
set id 0
for {} {$id < 8} {incr id} {
if {[regexp "master" [R $id role]]} {
break
}
}
assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
}
test "CLUSTER MYSHARDID reports same shard id after shard restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i 4} {
dict set node_ids $i [R $i cluster myshardid]
kill_instance valkey $i
wait_for_condition 50 100 {
[instance_is_killed valkey $i]
} else {
fail "instance $i is not killed"
}
}
for {set i 0} {$i < 8} {incr i 4} {
restart_instance valkey $i
}
assert_cluster_state ok
for {set i 0} {$i < 8} {incr i 4} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i} {
dict set node_ids $i [R $i cluster myshardid]
}
for {set i 0} {$i < 8} {incr i} {
kill_instance valkey $i
wait_for_condition 50 100 {
[instance_is_killed valkey $i]
} else {
fail "instance $i is not killed"
}
}
for {set i 0} {$i < 8} {incr i} {
restart_instance valkey $i
}
assert_cluster_state ok
for {set i 0} {$i < 8} {incr i} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER SHARDS id response validation" {
# For each node in the cluster
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
# Get the CLUSTER SHARDS output from this node
set shards [R $i CLUSTER SHARDS]
set seen_shard_ids {}
# For each shard in the output
foreach shard $shards {
set shard_dict [dict create {*}$shard]
# 1. Verify 'id' key exists
assert {[dict exists $shard_dict id]}
set shard_id [dict get $shard_dict id]
# 2. Verify shard_id is a 40-char string
assert {[string length $shard_id] == 40}
# 3. Verify that for a given node's output, all shard IDs are unique
assert {[dict exists $seen_shard_ids $shard_id] == 0}
dict set seen_shard_ids $shard_id 1
}
}
}
tests/cluster/tests/includes/init-tests.tcl (-91)
@@ -1,91 +0,0 @@
# Initialization tests -- most units will start including this.
test "(init) Restart killed instances" {
foreach type {valkey} {
foreach_${type}_id id {
if {[get_instance_attrib $type $id pid] == -1} {
puts -nonewline "$type/$id "
flush stdout
restart_instance $type $id
}
}
}
}
test "Cluster nodes are reachable" {
foreach_valkey_id id {
# Every node should be reachable.
wait_for_condition 1000 50 {
([catch {R $id ping} ping_reply] == 0) &&
($ping_reply eq {PONG})
} else {
catch {R $id ping} err
fail "Node #$id keeps replying '$err' to PING."
}
}
}
test "Cluster nodes hard reset" {
foreach_valkey_id id {
if {$::valgrind} {
set node_timeout 10000
} else {
set node_timeout 3000
}
catch {R $id flushall} ; # May fail for readonly slaves.
R $id MULTI
R $id cluster reset hard
R $id cluster set-config-epoch [expr {$id+1}]
R $id EXEC
R $id config set cluster-node-timeout $node_timeout
R $id config set cluster-slave-validity-factor 10
R $id config set loading-process-events-interval-bytes 2097152
R $id config set key-load-delay 0
R $id config set repl-diskless-load disabled
R $id config set cluster-announce-hostname ""
R $id DEBUG DROP-CLUSTER-PACKET-FILTER -1
R $id config rewrite
}
}
# Helper function to attempt to have each node in a cluster
# meet each other.
proc join_nodes_in_cluster {} {
# Join node 0 with 1, 1 with 2, ... and so forth.
# If auto-discovery works all nodes will know every other node
# eventually.
set ids {}
foreach_valkey_id id {lappend ids $id}
for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} {
set a [lindex $ids $j]
set b [lindex $ids [expr $j+1]]
set b_port [get_instance_attrib valkey $b port]
R $a cluster meet 127.0.0.1 $b_port
}
foreach_valkey_id id {
wait_for_condition 1000 50 {
[llength [get_cluster_nodes $id connected]] == [llength $ids]
} else {
return 0
}
}
return 1
}
test "Cluster Join and auto-discovery test" {
# Use multiple attempts since sometimes nodes timeout
# while attempting to connect.
for {set attempts 3} {$attempts > 0} {incr attempts -1} {
if {[join_nodes_in_cluster] == 1} {
break
}
}
if {$attempts == 0} {
fail "Cluster failed to form full mesh"
}
}
test "Before slots allocation, all nodes report cluster failure" {
assert_cluster_state fail
}
tests/cluster/tests/includes/utils.tcl (-36)
@@ -1,36 +0,0 @@
source "../../../tests/support/cli.tcl"
proc config_set_all_nodes {keyword value} {
foreach_valkey_id id {
R $id config set $keyword $value
}
}
proc fix_cluster {addr} {
set code [catch {
exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster fix $addr << yes
} result]
if {$code != 0} {
puts "valkey-cli --cluster fix returns non-zero exit code, output below:\n$result"
}
# Note: valkey-cli --cluster fix may return a non-zero exit code if nodes don't agree,
# but we can ignore that and rely on the check below.
assert_cluster_state ok
wait_for_condition 100 100 {
[catch {exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster check $addr} result] == 0
} else {
puts "valkey-cli --cluster check returns non-zero exit code, output below:\n$result"
fail "Cluster could not settle with configuration"
}
}
proc wait_cluster_stable {} {
wait_for_condition 1000 50 {
[catch {exec $::VALKEY_CLI_BIN --cluster \
check 127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
}] == 0
} else {
fail "Cluster doesn't stabilize"
}
}
.gitignore (-1)
@@ -1 +0,0 @@
*
tests/support/cluster_util.tcl (+1)
@@ -148,6 +148,7 @@ proc wait_for_cluster_size {cluster_size} {
# Check that cluster nodes agree about "state", or raise an error.
proc wait_for_cluster_state {state} {
for {set j 0} {$j < [llength $::servers]} {incr j} {
if {![process_is_alive [srv -$j pid]]} continue
if {[process_is_paused [srv -$j pid]]} continue
wait_for_condition 1000 50 {
[CI $j cluster_state] eq $state
tests/unit/cluster/cluster-shards.tcl (+286)
@@ -53,3 +53,289 @@ start_cluster 3 3 {tags {external:skip cluster}} {
assert_equal $shard_0_slot_coverage [dict get [get_node_info_from_shard $node_0_id $validation_node "shard"] "slots"]
}
}
# Initial slot distribution for split-slot cluster tests.
set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
set ::slot1 [list 5460 5460 5462 10922 10925 10925]
set ::slot2 [list 10923 10924 10927 16383]
set ::slot3 [list 1001 1001]
# Slot allocator: assigns split slots to each master.
proc split_slot_allocation {masters replicas} {
for {set j 0} {$j < $masters} {incr j} {
R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
}
}
# Replica allocator: allocates only masters replicas, leaving the last server
# (R 8) as a standalone no-slot node for testing purposes.
proc split_slot_replica_allocation {masters replicas} {
cluster_allocate_replicas $masters [expr {$replicas - 1}]
}
proc cluster_ensure_master {id} {
if { [regexp "master" [R $id role]] == 0 } {
assert_equal {OK} [R $id CLUSTER FAILOVER]
wait_for_condition 50 100 {
[regexp "master" [R $id role]] == 1
} else {
fail "instance $id is not master"
}
}
}
# start_cluster 4 masters + 5 nodes (4 replicas + 1 standalone R8)
start_cluster 4 5 {tags {external:skip cluster}} {
# cluster_master_nodes and cluster_replica_nodes refer to the active cluster members.
set ::cluster_master_nodes 4
set ::cluster_replica_nodes 4
test "Cluster should start ok" {
wait_for_cluster_state ok
}
test "Set cluster hostnames and verify they are propagated" {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
R $j config set cluster-announce-hostname "host-$j.com"
}
# Wait for everyone to agree about the state
wait_for_cluster_propagation
}
test "Verify information about the shards" {
set ids {}
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
lappend ids [R $j CLUSTER MYID]
}
set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]
# Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent.
for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
# Default value of 'cluster-preferred-endpoint-type' is ip.
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]
if {$::tls} {
assert_equal [srv [expr -1*$i] pport] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
} else {
assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
}
if {$i < 4} {
assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
} else {
assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
# Replica could be in online or loading
}
}
}
}
test "Verify no slot shard" {
# R 8 is a standalone node with no slots assigned (left standalone by split_slot_replica_allocation)
set node_8_id [R 8 CLUSTER MYID]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
}
set node_0_id [R 0 CLUSTER MYID]
test "Kill a node and tell the replica to immediately takeover" {
pause_process [srv 0 pid]
R 4 cluster failover force
}
# Primary 0 node should report as fail, wait until the new primary acknowledges it.
test "Verify health as fail for killed node" {
wait_for_condition 1000 50 {
"fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
} else {
fail "New primary never detected the node failed"
}
}
set primary_id 4
set replica_id 0
test "Restarting primary node" {
restart_server [expr -1*$replica_id] true false
}
test "Instance #0 gets converted into a replica" {
wait_for_condition 1000 50 {
[s [expr -1*$replica_id] role] eq {slave}
} else {
fail "Old primary was not converted into replica"
}
}
test "Test the replica reports a loading state while it's loading" {
# Test the command is good for verifying everything moves to a happy state
set replica_cluster_id [R $replica_id CLUSTER MYID]
wait_for_condition 50 1000 {
[dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
} else {
fail "Replica never transitioned to online"
}
# Set 1 MB of data, so there is something to load on full sync
R $primary_id debug populate 1000 key 1000
# Kill replica client for primary and load new data to the primary
R $primary_id config set repl-backlog-size 100
# Set the key load delay so that it will take at least
# 2 seconds to fully load the data.
R $replica_id config set key-load-delay 4000
# Trigger event loop processing every 1024 bytes, this trigger
# allows us to send and receive cluster messages, so we are setting
# it low so that the cluster messages are sent more frequently.
R $replica_id config set loading-process-events-interval-bytes 1024
R $primary_id multi
R $primary_id client kill type replica
# populate the correct data
set num 100
set value [string repeat A 1024]
for {set j 0} {$j < $num} {incr j} {
# Use hashtag valid for shard #0
set key "{ch3}$j"
R $primary_id set $key $value
}
R $primary_id exec
# The replica should reconnect and start a full sync, it will gossip about it's health to the primary.
wait_for_condition 50 1000 {
"loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to loading"
}
# Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data.
R $replica_id CLUSTER SHARDS
R $replica_id CLUSTER SLOTS
# Speed up the key loading and verify everything resumes
R $replica_id config set key-load-delay 0
wait_for_condition 50 1000 {
"online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to online"
}
# Final sanity, the replica agrees it is online.
assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
}
test "Regression test for a crash when calling SHARDS during handshake" {
# Use R 8 (standalone node) to establish handshaking connections
set id [R 8 CLUSTER MYID]
R 8 CLUSTER RESET HARD
for {set i 0} {$i < 8} {incr i} {
R $i CLUSTER FORGET $id
}
R 8 cluster meet 127.0.0.1 [srv 0 port]
# This line would previously crash, since all the outbound
# connections were in handshake state.
R 8 CLUSTER SHARDS
}
test "Cluster is up" {
wait_for_cluster_state ok
}
test "Shard ids are unique" {
set shard_ids {}
for {set i 0} {$i < 4} {incr i} {
set shard_id [R $i cluster myshardid]
assert_equal [dict exists $shard_ids $shard_id] 0
dict set shard_ids $shard_id 1
}
}
test "CLUSTER MYSHARDID reports same id for both primary and replica" {
for {set i 0} {$i < 4} {incr i} {
assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
assert_equal [string length [R $i cluster myshardid]] 40
}
}
test "New replica receives primary's shard id" {
# find a primary
set id 0
for {} {$id < 8} {incr id} {
if {[regexp "master" [R $id role]]} {
break
}
}
assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
}
test "CLUSTER MYSHARDID reports same shard id after shard restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i 4} {
dict set node_ids $i [R $i cluster myshardid]
pause_process [srv [expr -1*$i] pid]
}
for {set i 0} {$i < 8} {incr i 4} {
restart_server [expr -1*$i] true false
}
wait_for_cluster_state ok
for {set i 0} {$i < 8} {incr i 4} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i} {
dict set node_ids $i [R $i cluster myshardid]
}
for {set i 0} {$i < 8} {incr i} {
pause_process [srv [expr -1*$i] pid]
}
for {set i 0} {$i < 8} {incr i} {
restart_server [expr -1*$i] true false
}
wait_for_cluster_state ok
for {set i 0} {$i < 8} {incr i} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER SHARDS id response validation" {
# For each node in the cluster
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
# Get the CLUSTER SHARDS output from this node
set shards [R $i CLUSTER SHARDS]
set seen_shard_ids {}
# For each shard in the output
foreach shard $shards {
set shard_dict [dict create {*}$shard]
# 1. Verify 'id' key exists
assert {[dict exists $shard_dict id]}
set shard_id [dict get $shard_dict id]
# 2. Verify shard_id is a 40-char string
assert {[string length $shard_id] == 40}
# 3. Verify that for a given node's output, all shard IDs are unique
assert {[dict exists $seen_shard_ids $shard_id] == 0}
dict set seen_shard_ids $shard_id 1
}
}
}
} split_slot_allocation split_slot_replica_allocation
tests/unit/cluster/failover.tcl (+105)
@@ -152,3 +152,108 @@ start_cluster 3 1 {tags {external:skip cluster}} {
}
}
} ;# start_cluster
# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
start_cluster 5 5 {tags {external:skip cluster}} {
set iterations 10
set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
while {[incr iterations -1]} {
set tokill [randomInt 10]
set other [expr {($tokill+1)%10}] ; # Some other instance.
set key [randstring 20 20 alpha]
set val [randstring 20 20 alpha]
set role [s [expr -1*$tokill] role]
if {$role eq {master}} {
set slave {}
set myid [dict get [cluster_get_myself $tokill] id]
for {set id 0} {$id < [llength $::servers]} {incr id} {
if {$id == $tokill} continue
if {[dict get [cluster_get_myself $id] slaveof] eq $myid} {
set slave $id
}
}
if {$slave eq {}} {
fail "Unable to retrieve slave's ID for master #$tokill"
}
}
if {$role eq {master}} {
test "Wait for slave of #$tokill to sync" {
wait_for_condition 1000 50 {
[string match {*state=online*} [s [expr -1*$tokill] slave0]]
} else {
fail "Slave of node #$tokill is not ok"
}
}
set slave_config_epoch [CI $slave cluster_my_epoch]
}
test "Cluster is writable before failover" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i $val:$i} err
assert {$err eq {OK}}
}
# Wait for the write to propagate to the slave if we
# are going to kill a master.
if {$role eq {master}} {
R $tokill wait 1 20000
}
}
test "Terminating node #$tokill" {
catch {R $tokill shutdown nosave}
}
if {$role eq {master}} {
test "Wait failover by #$slave with old epoch $slave_config_epoch" {
wait_for_condition 1000 50 {
[CI $slave cluster_my_epoch] > $slave_config_epoch
} else {
fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
}
}
}
test "Cluster should eventually be up again" {
wait_for_cluster_state ok
}
test "Cluster is writable again" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i:2 $val:$i:2} err
assert {$err eq {OK}}
}
}
test "Restarting node #$tokill" {
restart_server [expr -1*$tokill] true false
}
test "Instance #$tokill is now a slave" {
wait_for_condition 1000 50 {
[s [expr -1*$tokill] role] eq {slave}
} else {
fail "Restarted instance is not a slave"
}
}
test "We can read back the value we set before" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster get $key:$i} err
assert {$err eq "$val:$i"}
catch {$cluster get $key:$i:2} err
assert {$err eq "$val:$i:2"}
}
}
}
test "Post condition: current_epoch >= my_epoch everywhere" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
}
}
} ;# start_cluster
@@ -0,0 +1,76 @@
# Check that a primary that has lost all its slots becomes a valid target
# for replica migration again once it re-acquires slots, in a cluster where
# the other primaries have replicas.
tags {"slow valgrind:skip"} {
run_solo {cluster-replica-migration-slow} {
start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration yes}} {
test "Primary #0 should re-acquire one or more replicas" {
# Reshard all of primary #0's slots away from it
set primary0_id [dict get [cluster_get_myself 0] id]
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[srv 0 port] \
{*}[valkeycli_tls_config "./tests"] \
--cluster-weight ${primary0_id}=0 >@ stdout]
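# (A --cluster-weight of 0 tells the rebalance to move every slot off
# primary #0.)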
# Primary #0, having lost all its slots, should turn itself into a
# replica with no replicas of its own
wait_for_condition 1000 50 {
[s 0 role] eq "slave" && [s 0 connected_slaves] == 0
} else {
puts [R 0 info replication]
fail "Primary #0 didn't turn itself into a replica"
}
# Reshard some slots back to primary #0.
# Wait for the cluster config to propagate before attempting a
# new resharding.
wait_for_cluster_propagation
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[srv 0 port] \
{*}[valkeycli_tls_config "./tests"] \
--cluster-weight ${primary0_id}=.01 \
--cluster-use-empty-masters >@ stdout]
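# (--cluster-use-empty-masters lets the rebalance assign slots to
# primaries that currently own none, which is primary #0's state here.)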
wait_for_condition 1000 50 {
[llength [lindex [R 0 role] 2]] >= 1
} else {
fail "Primary #0 has no has replicas"
}
}
} ;# start_cluster
} ;# run_solo
} ;# tag
# Check that if 'cluster-allow-replica-migration' is set to 'no', replicas do not
# migrate when primary becomes empty.
tags {"slow valgrind:skip"} {
run_solo {cluster-replica-migration-slow} {
start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no}} {
test "Each primary should have at least two replicas attached" {
# Reshard all of primary #0's slots away from it
set primary0_id [dict get [cluster_get_myself 0] id]
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[srv 0 port] \
{*}[valkeycli_tls_config "./tests"] \
--cluster-weight ${primary0_id}=0 >@ stdout]
wait_for_cluster_propagation
wait_for_cluster_state "ok"
# Primary #0 should still have its replicas
assert { [llength [lindex [R 0 role] 2]] >= 2 }
# Each primary should have at least two replicas attached
for {set id 0} {$id < 5} {incr id} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Primary #$id does not have 2 replicas as expected"
}
}
}
} ;# start_cluster
} ;# run_solo
} ;# tag
+70
View File
@@ -433,3 +433,73 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
assert_equal [R 2 dbsize] 0
}
} my_slot_allocation cluster_allocate_replicas ;# start_cluster
# Replica migration test.
# Check that orphaned primaries are joined by replicas of primaries having
# multiple replicas attached, according to the migration barrier settings.
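# (The relevant setting is cluster-migration-barrier: a replica migrates
# away only if its own primary would still keep at least that many replicas.)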
start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
# Killing all the replicas of primary #0 and #1
pause_process [srv -5 pid]
pause_process [srv -10 pid]
pause_process [srv -6 pid]
pause_process [srv -11 pid]
# Primaries #0 and #1 are now orphaned and should attract replica migration.
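# The six replicas polled below (srv indices -7..-9 and -12..-14) are the
# ones left running; exactly two of them, one per orphaned primary, are
# expected to log the migration message.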
wait_for_condition 1000 50 {
[expr {[count_log_message -7 "Migrating to orphaned primary"] +
[count_log_message -8 "Migrating to orphaned primary"] +
[count_log_message -9 "Migrating to orphaned primary"] +
[count_log_message -12 "Migrating to orphaned primary"] +
[count_log_message -13 "Migrating to orphaned primary"] +
[count_log_message -14 "Migrating to orphaned primary"]}] == 2
} else {
fail "Replicas did not trigger replica migration "
}
# Primary should have at least one replica"
for {set id 0} {$id < 5} {incr id} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 1
} else {
fail "Primary #$id has no replicas"
}
}
resume_process [srv -5 pid]
resume_process [srv -10 pid]
resume_process [srv -6 pid]
resume_process [srv -11 pid]
} ;# start_cluster
# Now test the migration to a primary which used to be a replica, after
# a failover.
start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
test "Primary #12 should get at least one migrated replica" {
# Kill replica #7 of primary #2. The only replica left is #12 now.
pause_process [srv -7 pid]
# Killing primary node #2, #12 should failover
set current_epoch [CI 1 cluster_current_epoch]
pause_process [srv -2 pid]
wait_for_condition 1000 50 {
[CI 1 cluster_current_epoch] > $current_epoch
} else {
fail "No failover detected"
}
wait_for_cluster_state ok
cluster_write_test [srv -1 port]
assert {[s -12 role] eq {master}}
# The remaining instance is now without replicas. Some other replica
# should migrate to it.
wait_for_condition 1000 50 {
[llength [lindex [R 12 role] 2]] >= 1
} else {
fail "Primary #12 has no replicas"
}
resume_process [srv -7 pid]
resume_process [srv -2 pid]
}
} ;# start_cluster
@@ -0,0 +1,185 @@
# Resharding test.
# In this test a live resharding is performed and the test checks
# that certain properties are preserved across the operation.
tags {"slow"} {
run_solo {cluster-resharding} {
start_cluster 5 5 {tags {external:skip cluster}} {
test "Enable AOF in all the instances" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
R $id config set appendonly yes
# We use "appendfsync no" because it's fast but also guarantees that
# write(2) is performed before replying to client.
R $id config set appendfsync no
}
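# Enabling appendonly at runtime kicks off an initial AOF rewrite;
# wait for it to finish on every instance before moving on.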
for {set id 0} {$id < [llength $::servers]} {incr id} {
wait_for_condition 1000 500 {
[s [expr -1*$id] aof_rewrite_in_progress] == 0 &&
[s [expr -1*$id] aof_enabled] == 1
} else {
fail "Failed to enable AOF on instance #$id"
}
}
}
# Our resharding test performs the following actions:
#
# - N commands are sent to the cluster in the course of the test.
# - Every command selects a random key from key:0 to key:MAX-1.
# - The operation RPUSH key <randomvalue> is performed.
# - Tcl remembers into an array all the values pushed to each list.
# - After N/2 commands, the resharding process is started in background.
# - The test continues while the resharding is in progress.
# - At the end of the test, we wait for the resharding process to stop.
# - Finally the keys are checked to see if they contain the value they should.
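# A single iteration of the consistency protocol boils down to the
# following (a hypothetical fragment; key and value are made up):
#   $cluster rpush key:7 $ele    ;# write through the cluster client
#   lappend content(key:7) $ele  ;# mirror the write in the local Tcl array
#   # ... and at the end, [$cluster lrange key:7 0 -1] must equal $content(key:7)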
set numkeys 50000
set numops 200000
set start_node_port [srv 0 port]
set cluster [valkey_cluster 127.0.0.1:$start_node_port]
if {$::tls} {
# set up a non-TLS cluster client to the TLS cluster
set plaintext_port [srv 0 pport]
set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
} else {
set cluster_plaintext $cluster
puts "Testing using non-TLS cluster"
}
catch {unset content}
array set content {}
set tribpid {}
test "Cluster consistency during live resharding" {
set ele 0
for {set j 0} {$j < $numops} {incr j} {
# Clear tribpid once a previously started resharding child has
# exited; the resharding itself is triggered at half the ops.
if {$tribpid ne {} &&
($j % 10000) == 0 &&
![process_is_alive $tribpid]} {
set tribpid {}
}
if {$j >= $numops/2 && $tribpid eq {}} {
puts -nonewline "...Starting resharding..."
flush stdout
set target [dict get [cluster_get_myself [randomInt 5]] id]
set tribpid [lindex [exec \
$::VALKEY_CLI_BIN --cluster reshard \
127.0.0.1:[srv 0 port] \
--cluster-from all \
--cluster-to $target \
--cluster-slots 100 \
--cluster-yes \
{*}[valkeycli_tls_config "./tests"] \
| [info nameofexecutable] \
tests/helpers/onlydots.tcl \
&] 0]
}
# Write random data to random list.
set listid [randomInt $numkeys]
set key "key:$listid"
incr ele
# We write both with Lua scripts and with plain commands.
# This way we also stress the Lua -> server command invocation
# path, which has checks that prevent Lua from writing into the
# wrong hash slots.
# We also use both TLS and plaintext connections.
if {$listid % 3 == 0} {
$cluster rpush $key $ele
} elseif {$listid % 3 == 1} {
$cluster_plaintext rpush $key $ele
} else {
$cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
}
lappend content($key) $ele
if {($j % 1000) == 0} {
puts -nonewline W; flush stdout
}
}
# Wait for the resharding process to end
wait_for_condition 1000 500 {
[process_is_alive $tribpid] == 0
} else {
fail "Resharding is not terminating after some time."
}
wait_for_cluster_propagation
}
test "Verify $numkeys keys for consistency with logical content" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Terminate and restart all the instances" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $id config set appendonly no
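# With AOF now off, take an RDB snapshot so the dataset survives the restart.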
R $id bgsave
wait_for_condition 1000 50 {
[s [expr -1*$id] rdb_bgsave_in_progress] == 0
} else {
fail "bgsave didn't finish for instance #$id"
}
restart_server [expr -1*$id] true false
}
}
test "Cluster should eventually be up again" {
wait_for_cluster_state ok
}
test "Verify $numkeys keys after the restart" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Disable AOF in all the instances" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
R $id config set appendonly no
}
}
test "Verify slaves consistency" {
set verified_masters 0
for {set id 0} {$id < [llength $::servers]} {incr id} {
set role [R $id role]
lassign $role myrole myoffset slaves
if {$myrole eq {slave}} continue
set masterport [srv [expr -1*$id] port]
set masterdigest [R $id debug digest]
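# DEBUG DIGEST computes a digest over the whole dataset, so a replica is
# consistent with its primary exactly when the two digests match.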
for {set sid 0} {$sid < [llength $::servers]} {incr sid} {
set srole [R $sid role]
if {[lindex $srole 0] eq {master}} continue
if {[lindex $srole 2] != $masterport} continue
wait_for_condition 1000 500 {
[R $sid debug digest] eq $masterdigest
} else {
fail "Master and slave data digest are different"
}
incr verified_masters
}
}
assert {$verified_masters >= 5}
}
test "Dump sanitization was skipped for migrations" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
assert {[s [expr -1*$id] dump_payload_sanitizations] == 0}
}
}
} ;# start_cluster
} ;# run_solo
} ;# tag