From ff80b2d1dc7c7858e938ece683d5c3fcb8c9ed31 Mon Sep 17 00:00:00 2001 From: Jun Yeong Kim <164037052+junyeong0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 18:31:37 +0900 Subject: [PATCH] Migrate the remaining cluster tests to the new framework and remove legacy files (#2297) (#3382) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrated the remaining cluster tests to tests/unit/cluster/ to use the same framework for all cluster tests. Cleaned up the obsolete cluster test framework files and updated the CI workflows to use the new unified test runner. Changes: Moved and mapped 6 test files: - 03-failover-loop.tcl → Merged into existing failover.tcl - 04-resharding.tcl → resharding.tcl - 12-replica-migration-2.tcl + 12.1-replica-migration-3.tcl → replica-migration-slow.tcl - 07-replica-migration.tcl → Merged into existing replica-migration.tcl - 28-cluster-shards.tcl → Merged into existing cluster-shards.tcl Other changes: - Converted old framework APIs (e.g., K, RI) to new framework APIs (e.g., R, srv) - Added process_is_alive check in cluster_util.tcl to fix an exception in failover tests caused by executing ps on dead processes - Heavy tests (resharding, replica-migration-slow) marked with slow tag and wrapped in run_solo to prevent resource contention in sanitizer environments - replica-migration-slow marked with valgrind:skip tag since it is very slow - Removed the entire tests/cluster/ directory including run.tcl, cluster.tcl, includes/, and helpers/ - Kept runtest-cluster as a wrapper script (exec ./runtest --cluster "$@") - Removed ./runtest-cluster calls from .github/workflows/daily.yml as cluster tests are now included in ./runtest Closes #2297. Signed-off-by: Jun Yeong Kim Signed-off-by: Binbin Co-authored-by: Binbin --- .github/workflows/daily.yml | 68 ---- runtest-cluster | 14 +- tests/cluster/cluster.tcl | 238 ------------- tests/cluster/run.tcl | 35 -- tests/cluster/tests/03-failover-loop.tcl | 117 ------- tests/cluster/tests/04-resharding.tcl | 196 ----------- tests/cluster/tests/07-replica-migration.tcl | 103 ------ .../cluster/tests/12-replica-migration-2.tcl | 75 ----- .../tests/12.1-replica-migration-3.tcl | 65 ---- tests/cluster/tests/28-cluster-shards.tcl | 312 ------------------ tests/cluster/tests/includes/init-tests.tcl | 91 ----- tests/cluster/tests/includes/utils.tcl | 36 -- tests/cluster/tmp/.gitignore | 1 - .../{cluster/tests => }/helpers/onlydots.tcl | 0 tests/support/cluster_util.tcl | 1 + tests/unit/cluster/cluster-shards.tcl | 286 ++++++++++++++++ tests/unit/cluster/failover.tcl | 105 ++++++ tests/unit/cluster/replica-migration-slow.tcl | 76 +++++ tests/unit/cluster/replica-migration.tcl | 70 ++++ tests/unit/cluster/resharding.tcl | 185 +++++++++++ 20 files changed, 724 insertions(+), 1350 deletions(-) delete mode 100644 tests/cluster/cluster.tcl delete mode 100644 tests/cluster/run.tcl delete mode 100644 tests/cluster/tests/03-failover-loop.tcl delete mode 100644 tests/cluster/tests/04-resharding.tcl delete mode 100644 tests/cluster/tests/07-replica-migration.tcl delete mode 100644 tests/cluster/tests/12-replica-migration-2.tcl delete mode 100644 tests/cluster/tests/12.1-replica-migration-3.tcl delete mode 100644 tests/cluster/tests/28-cluster-shards.tcl delete mode 100644 tests/cluster/tests/includes/init-tests.tcl delete mode 100644 tests/cluster/tests/includes/utils.tcl delete mode 100644 tests/cluster/tmp/.gitignore rename tests/{cluster/tests => }/helpers/onlydots.tcl (100%) create mode 100644 
tests/unit/cluster/replica-migration-slow.tcl create mode 100644 tests/unit/cluster/resharding.tcl diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index f67d6ca47..fd275ab18 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -125,9 +125,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -211,9 +208,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -276,9 +270,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -334,9 +325,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-ubuntu-no-malloc-usable-size: runs-on: ubuntu-latest if: | @@ -384,9 +372,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-ubuntu-32bit: runs-on: ubuntu-latest if: | @@ -457,9 +442,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -522,10 +504,6 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | ./runtest-sentinel --tls ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: | - ./runtest-cluster --tls ${{github.event.inputs.cluster_test_args}} test-ubuntu-tls-no-tls: runs-on: ubuntu-latest if: | @@ -578,10 +556,6 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: | - ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-ubuntu-io-threads: runs-on: ubuntu-latest if: | @@ -623,9 +597,6 
@@ jobs: - name: test if: true && !contains(github.event.inputs.skiptests, 'valkey') run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}} test-ubuntu-tls-io-threads: runs-on: ubuntu-latest if: | @@ -670,10 +641,6 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'valkey') run: | ./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: | - ./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}} test-ubuntu-reclaim-cache: runs-on: ubuntu-latest if: | @@ -991,9 +958,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -1129,9 +1093,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -1263,9 +1224,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | @@ -1324,9 +1282,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-rpm-distros-jemalloc: if: | (github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || @@ -1399,9 +1354,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-rpm-distros-tls-module: if: | (github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || @@ -1479,10 +1431,6 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: | - ./runtest-cluster --tls-module ${{github.event.inputs.cluster_test_args}} test-rpm-distros-tls-module-no-tls: if: | (github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || @@ -1555,10 
+1503,6 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: | ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: | - ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-macos-latest: runs-on: macos-latest if: | @@ -1676,9 +1620,6 @@ jobs: - run: cd libbacktrace && ./configure && make && sudo make install - name: make run: make SERVER_CFLAGS='-Werror' USE_LIBBACKTRACE=yes - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} build-old-macos-versions: strategy: fail-fast: false @@ -1806,9 +1747,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} test-alpine-libc-malloc: runs-on: ubuntu-latest if: | @@ -1859,9 +1797,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} reply-schemas-validator: runs-on: ubuntu-latest timeout-minutes: 1440 @@ -1909,9 +1844,6 @@ jobs: - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}} - - name: cluster tests - if: true && !contains(github.event.inputs.skiptests, 'cluster') - run: ./runtest-cluster --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}} - name: Install Python dependencies uses: py-actions/py-dependency-install@30aa0023464ed4b5b116bd9fbdab87acf01a484e # v4.1.0 with: diff --git a/runtest-cluster b/runtest-cluster index de31e7287..ef928e677 100755 --- a/runtest-cluster +++ b/runtest-cluster @@ -1,14 +1,2 @@ #!/bin/sh -TCL_VERSIONS="8.5 8.6 9.0" -TCLSH="" - -for VERSION in $TCL_VERSIONS; do - TCL=`which tclsh$VERSION 2>/dev/null` && TCLSH=$TCL -done - -if [ -z $TCLSH ] -then - echo "You need tcl 8.5 or newer in order to run the Valkey Cluster test" - exit 1 -fi -$TCLSH tests/cluster/run.tcl $* +exec ./runtest --cluster "$@" diff --git a/tests/cluster/cluster.tcl b/tests/cluster/cluster.tcl deleted file mode 100644 index 783465875..000000000 --- a/tests/cluster/cluster.tcl +++ /dev/null @@ -1,238 +0,0 @@ -# Cluster-specific test functions. -# -# Copyright (C) 2014 Redis Ltd. -# This software is released under the BSD License. See the COPYING file for -# more information. - -# Track cluster configuration as created by create_cluster below -set ::cluster_master_nodes 0 -set ::cluster_replica_nodes 0 - -# Returns a parsed CLUSTER NODES output as a list of dictionaries. Optional status field -# can be specified to only returns entries that match the provided status. 
-proc get_cluster_nodes {id {status "*"}} { - set lines [split [R $id cluster nodes] "\r\n"] - set nodes {} - foreach l $lines { - set l [string trim $l] - if {$l eq {}} continue - set args [split $l] - set node [dict create \ - id [lindex $args 0] \ - addr [lindex $args 1] \ - flags [split [lindex $args 2] ,] \ - slaveof [lindex $args 3] \ - ping_sent [lindex $args 4] \ - pong_recv [lindex $args 5] \ - config_epoch [lindex $args 6] \ - linkstate [lindex $args 7] \ - slots [lrange $args 8 end] \ - ] - if {[string match $status [lindex $args 7]]} { - lappend nodes $node - } - } - return $nodes -} - -# Test node for flag. -proc has_flag {node flag} { - expr {[lsearch -exact [dict get $node flags] $flag] != -1} -} - -# Returns the parsed myself node entry as a dictionary. -proc get_myself id { - set nodes [get_cluster_nodes $id] - foreach n $nodes { - if {[has_flag $n myself]} {return $n} - } - return {} -} - -# Get a specific node by ID by parsing the CLUSTER NODES output -# of the instance Number 'instance_id' -proc get_node_by_id {instance_id node_id} { - set nodes [get_cluster_nodes $instance_id] - foreach n $nodes { - if {[dict get $n id] eq $node_id} {return $n} - } - return {} -} - -# Return the value of the specified CLUSTER INFO field. -proc CI {n field} { - get_info_field [R $n cluster info] $field -} - -# Return the value of the specified INFO field. -proc s {n field} { - get_info_field [R $n info] $field -} - -# Assuming nodes are reset, this function performs slots allocation. -# Only the first 'n' nodes are used. -proc cluster_allocate_slots {n} { - set slot 16383 - while {$slot >= 0} { - # Allocate successive slots to random nodes. - set node [randomInt $n] - lappend slots_$node $slot - incr slot -1 - } - for {set j 0} {$j < $n} {incr j} { - R $j cluster addslots {*}[set slots_${j}] - } -} - -# Check that cluster nodes agree about "state", or raise an error. -proc assert_cluster_state {state} { - foreach_valkey_id id { - if {[instance_is_killed valkey $id]} continue - wait_for_condition 1000 50 { - [CI $id cluster_state] eq $state - } else { - fail "Cluster node $id cluster_state:[CI $id cluster_state]" - } - } -} - -# Search the first node starting from ID $first that is not -# already configured as a slave. -proc cluster_find_available_slave {first} { - foreach_valkey_id id { - if {$id < $first} continue - if {[instance_is_killed valkey $id]} continue - set me [get_myself $id] - if {[dict get $me slaveof] eq {-}} {return $id} - } - fail "No available slaves" -} - -# Add 'slaves' slaves to a cluster composed of 'masters' masters. -# It assumes that masters are allocated sequentially from instance ID 0 -# to N-1. -proc cluster_allocate_slaves {masters slaves} { - for {set j 0} {$j < $slaves} {incr j} { - set master_id [expr {$j % $masters}] - set slave_id [cluster_find_available_slave $masters] - set master_myself [get_myself $master_id] - R $slave_id cluster replicate [dict get $master_myself id] - } -} - -# Create a cluster composed of the specified number of masters and slaves. -proc create_cluster {masters slaves} { - cluster_allocate_slots $masters - if {$slaves} { - cluster_allocate_slaves $masters $slaves - } - assert_cluster_state ok - - set ::cluster_master_nodes $masters - set ::cluster_replica_nodes $slaves -} - -proc cluster_allocate_with_continuous_slots {n} { - set slot 16383 - set avg [expr ($slot+1) / $n] - while {$slot >= 0} { - set node [expr $slot/$avg >= $n ? 
$n-1 : $slot/$avg] - lappend slots_$node $slot - incr slot -1 - } - for {set j 0} {$j < $n} {incr j} { - R $j cluster addslots {*}[set slots_${j}] - } -} - -# Create a cluster composed of the specified number of masters and slaves, -# but with a continuous slot range. -proc cluster_create_with_continuous_slots {masters slaves} { - cluster_allocate_with_continuous_slots $masters - if {$slaves} { - cluster_allocate_slaves $masters $slaves - } - assert_cluster_state ok - - set ::cluster_master_nodes $masters - set ::cluster_replica_nodes $slaves -} - - -# Set the cluster node-timeout to all the reachalbe nodes. -proc set_cluster_node_timeout {to} { - foreach_valkey_id id { - catch {R $id CONFIG SET cluster-node-timeout $to} - } -} - -# Check if the cluster is writable and readable. Use node "id" -# as a starting point to talk with the cluster. -proc cluster_write_test {id} { - set prefix [randstring 20 20 alpha] - set port [get_instance_attrib valkey $id port] - set cluster [valkey_cluster 127.0.0.1:$port] - for {set j 0} {$j < 100} {incr j} { - $cluster set key.$j $prefix.$j - } - for {set j 0} {$j < 100} {incr j} { - assert {[$cluster get key.$j] eq "$prefix.$j"} - } - $cluster close -} - -# Check if cluster configuration is consistent. -# All the nodes in the cluster should show same slots configuration and have health -# state "online" to be considered as consistent. -proc cluster_config_consistent {} { - for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { - # Check if all the nodes are online - set shards_cfg [R $j CLUSTER SHARDS] - foreach shard_cfg $shards_cfg { - set nodes [dict get $shard_cfg nodes] - foreach node $nodes { - if {[dict get $node health] ne "online"} { - return 0 - } - } - } - - if {$j == 0} { - set base_cfg [R $j cluster slots] - } else { - set cfg [R $j cluster slots] - if {$cfg != $base_cfg} { - return 0 - } - } - } - - return 1 -} - -# Wait for cluster configuration to propagate and be consistent across nodes. -proc wait_for_cluster_propagation {} { - wait_for_condition 1000 50 { - [cluster_config_consistent] eq 1 - } else { - for {set j 0} {$j < [llength $::servers]} {incr j} { - puts "R $j cluster slots output: [R $j cluster slots]" - } - fail "cluster config did not reach a consistent state" - } -} - -# Check if cluster's view of hostnames is consistent -proc are_hostnames_propagated {match_string} { - for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { - set cfg [R $j cluster slots] - foreach node $cfg { - for {set i 2} {$i < [llength $node]} {incr i} { - if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } { - return 0 - } - } - } - } - return 1 -} diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl deleted file mode 100644 index 88173a8e2..000000000 --- a/tests/cluster/run.tcl +++ /dev/null @@ -1,35 +0,0 @@ -# Cluster test suite. Copyright (C) 2014 Redis Ltd. -# This software is released under the BSD License. See the COPYING file for -# more information. - -# Set the executable paths at project root -source tests/support/set_executable_path.tcl - -cd tests/cluster -source cluster.tcl -source ../instances.tcl -source ../../support/cluster.tcl ; # Cluster client. - -set ::instances_count 20 ; # How many instances we use at max. 
-set ::tlsdir "../../tls" - -proc main {} { - parse_options - spawn_instance valkey $::valkey_base_port $::instances_count { - "cluster-enabled yes" - "appendonly yes" - "enable-protected-configs yes" - "enable-debug-command yes" - "save ''" - } - run_tests - cleanup - end_tests -} - -if {[catch main e]} { - puts $::errorInfo - if {$::pause_on_error} pause_on_error - cleanup - exit 1 -} diff --git a/tests/cluster/tests/03-failover-loop.tcl b/tests/cluster/tests/03-failover-loop.tcl deleted file mode 100644 index 542334927..000000000 --- a/tests/cluster/tests/03-failover-loop.tcl +++ /dev/null @@ -1,117 +0,0 @@ -# Failover stress test. -# In this test a different node is killed in a loop for N -# iterations. The test checks that certain properties -# are preserved across iterations. - -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -set iterations 20 -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] - -while {[incr iterations -1]} { - set tokill [randomInt 10] - set other [expr {($tokill+1)%10}] ; # Some other instance. - set key [randstring 20 20 alpha] - set val [randstring 20 20 alpha] - set role [RI $tokill role] - if {$role eq {master}} { - set slave {} - set myid [dict get [get_myself $tokill] id] - foreach_valkey_id id { - if {$id == $tokill} continue - if {[dict get [get_myself $id] slaveof] eq $myid} { - set slave $id - } - } - if {$slave eq {}} { - fail "Unable to retrieve slave's ID for master #$tokill" - } - } - - puts "--- Iteration $iterations ---" - - if {$role eq {master}} { - test "Wait for slave of #$tokill to sync" { - wait_for_condition 1000 50 { - [string match {*state=online*} [RI $tokill slave0]] - } else { - fail "Slave of node #$tokill is not ok" - } - } - set slave_config_epoch [CI $slave cluster_my_epoch] - } - - test "Cluster is writable before failover" { - for {set i 0} {$i < 100} {incr i} { - catch {$cluster set $key:$i $val:$i} err - assert {$err eq {OK}} - } - # Wait for the write to propagate to the slave if we - # are going to kill a master. 
- if {$role eq {master}} { - R $tokill wait 1 20000 - } - } - - test "Terminating node #$tokill" { - # Stop AOF so that an initial AOFRW won't prevent the instance from terminating - R $tokill config set appendonly no - kill_instance valkey $tokill - } - - if {$role eq {master}} { - test "Wait failover by #$slave with old epoch $slave_config_epoch" { - wait_for_condition 1000 50 { - [CI $slave cluster_my_epoch] > $slave_config_epoch - } else { - fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]" - } - } - } - - test "Cluster should eventually be up again" { - assert_cluster_state ok - } - - test "Cluster is writable again" { - for {set i 0} {$i < 100} {incr i} { - catch {$cluster set $key:$i:2 $val:$i:2} err - assert {$err eq {OK}} - } - } - - test "Restarting node #$tokill" { - restart_instance valkey $tokill - } - - test "Instance #$tokill is now a slave" { - wait_for_condition 1000 50 { - [RI $tokill role] eq {slave} - } else { - fail "Restarted instance is not a slave" - } - } - - test "We can read back the value we set before" { - for {set i 0} {$i < 100} {incr i} { - catch {$cluster get $key:$i} err - assert {$err eq "$val:$i"} - catch {$cluster get $key:$i:2} err - assert {$err eq "$val:$i:2"} - } - } -} - -test "Post condition: current_epoch >= my_epoch everywhere" { - foreach_valkey_id id { - assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]} - } -} diff --git a/tests/cluster/tests/04-resharding.tcl b/tests/cluster/tests/04-resharding.tcl deleted file mode 100644 index 634ec84d3..000000000 --- a/tests/cluster/tests/04-resharding.tcl +++ /dev/null @@ -1,196 +0,0 @@ -# Failover stress test. -# In this test a different node is killed in a loop for N -# iterations. The test checks that certain properties -# are preserved across iterations. - -source "../tests/includes/init-tests.tcl" -source "../../../tests/support/cli.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Enable AOF in all the instances" { - foreach_valkey_id id { - R $id config set appendonly yes - # We use "appendfsync no" because it's fast but also guarantees that - # write(2) is performed before replying to client. - R $id config set appendfsync no - } - - foreach_valkey_id id { - wait_for_condition 1000 500 { - [RI $id aof_rewrite_in_progress] == 0 && - [RI $id aof_enabled] == 1 - } else { - fail "Failed to enable AOF on instance #$id" - } - } -} - -# Return non-zero if the specified PID is about a process still in execution, -# otherwise 0 is returned. -proc process_is_running {pid} { - # PS should return with an error if PID is non existing, - # and catch will return non-zero. We want to return non-zero if - # the PID exists, so we invert the return value with expr not operator. - expr {![catch {exec ps -p $pid}]} -} - -# Our resharding test performs the following actions: -# -# - N commands are sent to the cluster in the course of the test. -# - Every command selects a random key from key:0 to key:MAX-1. -# - The operation RPUSH key is performed. -# - Tcl remembers into an array all the values pushed to each list. -# - After N/2 commands, the resharding process is started in background. -# - The test continues while the resharding is in progress. -# - At the end of the test, we wait for the resharding process to stop. -# - Finally the keys are checked to see if they contain the value they should. 
- -set numkeys 50000 -set numops 200000 -set start_node_port [get_instance_attrib valkey 0 port] -set cluster [valkey_cluster 127.0.0.1:$start_node_port] -if {$::tls} { - # setup a non-TLS cluster client to the TLS cluster - set plaintext_port [get_instance_attrib valkey 0 plaintext-port] - set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0] - puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port" -} else { - set cluster_plaintext $cluster - puts "Testing using non-TLS cluster" -} -catch {unset content} -array set content {} -set tribpid {} - -test "Cluster consistency during live resharding" { - set ele 0 - for {set j 0} {$j < $numops} {incr j} { - # Trigger the resharding once we execute half the ops. - if {$tribpid ne {} && - ($j % 10000) == 0 && - ![process_is_running $tribpid]} { - set tribpid {} - } - - if {$j >= $numops/2 && $tribpid eq {}} { - puts -nonewline "...Starting resharding..." - flush stdout - set target [dict get [get_myself [randomInt 5]] id] - set tribpid [lindex [exec \ - $::VALKEY_CLI_BIN --cluster reshard \ - 127.0.0.1:[get_instance_attrib valkey 0 port] \ - --cluster-from all \ - --cluster-to $target \ - --cluster-slots 100 \ - --cluster-yes \ - {*}[valkeycli_tls_config "../../../tests"] \ - | [info nameofexecutable] \ - ../tests/helpers/onlydots.tcl \ - &] 0] - } - - # Write random data to random list. - set listid [randomInt $numkeys] - set key "key:$listid" - incr ele - # We write both with Lua scripts and with plain commands. - # This way we are able to stress Lua -> server command invocation - # as well, that has tests to prevent Lua to write into wrong - # hash slots. - # We also use both TLS and plaintext connections. - if {$listid % 3 == 0} { - $cluster rpush $key $ele - } elseif {$listid % 3 == 1} { - $cluster_plaintext rpush $key $ele - } else { - $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele - } - lappend content($key) $ele - - if {($j % 1000) == 0} { - puts -nonewline W; flush stdout - } - } - - # Wait for the resharding process to end - wait_for_condition 1000 500 { - [process_is_running $tribpid] == 0 - } else { - fail "Resharding is not terminating after some time." - } - wait_for_cluster_propagation -} - -test "Verify $numkeys keys for consistency with logical content" { - # Check that the Cluster content matches our logical content. - foreach {key value} [array get content] { - if {[$cluster lrange $key 0 -1] ne $value} { - fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]" - } - } -} - -test "Terminate and restart all the instances" { - foreach_valkey_id id { - # Stop AOF so that an initial AOFRW won't prevent the instance from terminating - R $id config set appendonly no - kill_instance valkey $id - restart_instance valkey $id - } -} - -test "Cluster should eventually be up again" { - assert_cluster_state ok -} - -test "Verify $numkeys keys after the restart" { - # Check that the Cluster content matches our logical content. 
- foreach {key value} [array get content] { - if {[$cluster lrange $key 0 -1] ne $value} { - fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]" - } - } -} - -test "Disable AOF in all the instances" { - foreach_valkey_id id { - R $id config set appendonly no - } -} - -test "Verify slaves consistency" { - set verified_masters 0 - foreach_valkey_id id { - set role [R $id role] - lassign $role myrole myoffset slaves - if {$myrole eq {slave}} continue - set masterport [get_instance_attrib valkey $id port] - set masterdigest [R $id debug digest] - foreach_valkey_id sid { - set srole [R $sid role] - if {[lindex $srole 0] eq {master}} continue - if {[lindex $srole 2] != $masterport} continue - wait_for_condition 1000 500 { - [R $sid debug digest] eq $masterdigest - } else { - fail "Master and slave data digest are different" - } - incr verified_masters - } - } - assert {$verified_masters >= 5} -} - -test "Dump sanitization was skipped for migrations" { - set verified_masters 0 - foreach_valkey_id id { - assert {[RI $id dump_payload_sanitizations] == 0} - } -} diff --git a/tests/cluster/tests/07-replica-migration.tcl b/tests/cluster/tests/07-replica-migration.tcl deleted file mode 100644 index 8b9fc18a7..000000000 --- a/tests/cluster/tests/07-replica-migration.tcl +++ /dev/null @@ -1,103 +0,0 @@ -# Replica migration test. -# Check that orphaned masters are joined by replicas of masters having -# multiple replicas attached, according to the migration barrier settings. - -source "../tests/includes/init-tests.tcl" - -# Create a cluster with 5 master and 10 slaves, so that we have 2 -# slaves for each master. -test "Create a 5 nodes cluster" { - create_cluster 5 10 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Each master should have two replicas attached" { - foreach_valkey_id id { - if {$id < 5} { - wait_for_condition 1000 50 { - [llength [lindex [R $id role] 2]] == 2 - } else { - fail "Master #$id does not have 2 slaves as expected" - } - } - } -} - -test "Killing all the slaves of master #0 and #1" { - kill_instance valkey 5 - kill_instance valkey 10 - kill_instance valkey 6 - kill_instance valkey 11 - after 4000 -} - -foreach_valkey_id id { - if {$id < 5} { - test "Master #$id should have at least one replica" { - wait_for_condition 1000 50 { - [llength [lindex [R $id role] 2]] >= 1 - } else { - fail "Master #$id has no replicas" - } - } - } -} - -# Now test the migration to a master which used to be a slave, after -# a failver. - -source "../tests/includes/init-tests.tcl" - -# Create a cluster with 5 master and 10 slaves, so that we have 2 -# slaves for each master. -test "Create a 5 nodes cluster" { - create_cluster 5 10 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Kill slave #7 of master #2. Only slave left is #12 now" { - kill_instance valkey 7 -} - -set current_epoch [CI 1 cluster_current_epoch] - -test "Killing master node #2, #12 should failover" { - kill_instance valkey 2 -} - -test "Wait for failover" { - wait_for_condition 1000 50 { - [CI 1 cluster_current_epoch] > $current_epoch - } else { - fail "No failover detected" - } -} - -test "Cluster should eventually be up again" { - assert_cluster_state ok -} - -test "Cluster is writable" { - cluster_write_test 1 -} - -test "Instance 12 is now a master without slaves" { - assert {[RI 12 role] eq {master}} -} - -# The remaining instance is now without slaves. Some other slave -# should migrate to it. 
- -test "Master #12 should get at least one migrated replica" { - wait_for_condition 1000 50 { - [llength [lindex [R 12 role] 2]] >= 1 - } else { - fail "Master #12 has no replicas" - } -} diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl deleted file mode 100644 index 79c5c2750..000000000 --- a/tests/cluster/tests/12-replica-migration-2.tcl +++ /dev/null @@ -1,75 +0,0 @@ -# Replica migration test #2. -# -# Check that the status of master that can be targeted by replica migration -# is acquired again, after being getting slots again, in a cluster where the -# other masters have slaves. - -source "../tests/includes/init-tests.tcl" -source "../../../tests/support/cli.tcl" - -# Create a cluster with 5 master and 15 slaves, to make sure there are no -# empty masters and make rebalancing simpler to handle during the test. -test "Create a 5 nodes cluster" { - cluster_create_with_continuous_slots 5 15 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Each master should have at least two replicas attached" { - foreach_valkey_id id { - if {$id < 5} { - wait_for_condition 1000 50 { - [llength [lindex [R $id role] 2]] >= 2 - } else { - fail "Master #$id does not have 2 slaves as expected" - } - } - } -} - -test "Set allow-replica-migration yes" { - foreach_valkey_id id { - R $id CONFIG SET cluster-allow-replica-migration yes - } -} - -set master0_id [dict get [get_myself 0] id] -test "Resharding all the master #0 slots away from it" { - set output [exec \ - $::VALKEY_CLI_BIN --cluster rebalance \ - 127.0.0.1:[get_instance_attrib valkey 0 port] \ - {*}[valkeycli_tls_config "../../../tests"] \ - --cluster-weight ${master0_id}=0 >@ stdout ] - -} - -test "Master #0 who lost all slots should turn into a replica without replicas" { - wait_for_condition 1000 50 { - [RI 0 role] == "slave" && [RI 0 connected_slaves] == 0 - } else { - puts [R 0 info replication] - fail "Master #0 didn't turn itself into a replica" - } -} - -test "Resharding back some slot to master #0" { - # Wait for the cluster config to propagate before attempting a - # new resharding. - after 10000 - set output [exec \ - $::VALKEY_CLI_BIN --cluster rebalance \ - 127.0.0.1:[get_instance_attrib valkey 0 port] \ - {*}[valkeycli_tls_config "../../../tests"] \ - --cluster-weight ${master0_id}=.01 \ - --cluster-use-empty-masters >@ stdout] -} - -test "Master #0 should re-acquire one or more replicas" { - wait_for_condition 1000 50 { - [llength [lindex [R 0 role] 2]] >= 1 - } else { - fail "Master #0 has no has replicas" - } -} diff --git a/tests/cluster/tests/12.1-replica-migration-3.tcl b/tests/cluster/tests/12.1-replica-migration-3.tcl deleted file mode 100644 index 876d8ed7d..000000000 --- a/tests/cluster/tests/12.1-replica-migration-3.tcl +++ /dev/null @@ -1,65 +0,0 @@ -# Replica migration test #2. -# -# Check that if 'cluster-allow-replica-migration' is set to 'no', slaves do not -# migrate when master becomes empty. - -source "../tests/includes/init-tests.tcl" -source "../tests/includes/utils.tcl" - -# Create a cluster with 5 master and 15 slaves, to make sure there are no -# empty masters and make rebalancing simpler to handle during the test. 
-test "Create a 5 nodes cluster" { - cluster_create_with_continuous_slots 5 15 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Each master should have at least two replicas attached" { - foreach_valkey_id id { - if {$id < 5} { - wait_for_condition 1000 50 { - [llength [lindex [R $id role] 2]] >= 2 - } else { - fail "Master #$id does not have 2 slaves as expected" - } - } - } -} - -test "Set allow-replica-migration no" { - foreach_valkey_id id { - R $id CONFIG SET cluster-allow-replica-migration no - } -} - -set master0_id [dict get [get_myself 0] id] -test "Resharding all the master #0 slots away from it" { - set output [exec \ - $::VALKEY_CLI_BIN --cluster rebalance \ - 127.0.0.1:[get_instance_attrib valkey 0 port] \ - {*}[valkeycli_tls_config "../../../tests"] \ - --cluster-weight ${master0_id}=0 >@ stdout ] -} - -test "Wait cluster to be stable" { - wait_cluster_stable -} - -test "Master #0 still should have its replicas" { - assert { [llength [lindex [R 0 role] 2]] >= 2 } -} - -test "Each master should have at least two replicas attached" { - foreach_valkey_id id { - if {$id < 5} { - wait_for_condition 1000 50 { - [llength [lindex [R $id role] 2]] >= 2 - } else { - fail "Master #$id does not have 2 slaves as expected" - } - } - } -} - diff --git a/tests/cluster/tests/28-cluster-shards.tcl b/tests/cluster/tests/28-cluster-shards.tcl deleted file mode 100644 index 35fdb9c2c..000000000 --- a/tests/cluster/tests/28-cluster-shards.tcl +++ /dev/null @@ -1,312 +0,0 @@ -source "../tests/includes/init-tests.tcl" - -# Initial slot distribution. -set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926] -set ::slot1 [list 5460 5460 5462 10922 10925 10925] -set ::slot2 [list 10923 10924 10927 16383] -set ::slot3 [list 1001 1001] - -proc cluster_create_with_split_slots {masters replicas} { - for {set j 0} {$j < $masters} {incr j} { - R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}] - } - if {$replicas} { - cluster_allocate_slaves $masters $replicas - } - set ::cluster_master_nodes $masters - set ::cluster_replica_nodes $replicas -} - -# Get the node info with the specific node_id from the -# given reference node. 
Valid type options are "node" and "shard" -proc get_node_info_from_shard {id reference {type node}} { - set shards_response [R $reference CLUSTER SHARDS] - foreach shard_response $shards_response { - set nodes [dict get $shard_response nodes] - foreach node $nodes { - if {[dict get $node id] eq $id} { - if {$type eq "node"} { - return $node - } elseif {$type eq "shard"} { - return $shard_response - } else { - return {} - } - } - } - } - # No shard found, return nothing - return {} -} - -proc cluster_ensure_master {id} { - if { [regexp "master" [R $id role]] == 0 } { - assert_equal {OK} [R $id CLUSTER FAILOVER] - wait_for_condition 50 100 { - [regexp "master" [R $id role]] == 1 - } else { - fail "instance $id is not master" - } - } -} - -test "Create a 8 nodes cluster with 4 shards" { - cluster_create_with_split_slots 4 4 -} - -test "Cluster should start ok" { - assert_cluster_state ok -} - -test "Set cluster hostnames and verify they are propagated" { - for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { - R $j config set cluster-announce-hostname "host-$j.com" - } - - # Wait for everyone to agree about the state - wait_for_cluster_propagation -} - -test "Verify information about the shards" { - set ids {} - for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { - lappend ids [R $j CLUSTER MYID] - } - set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3] - - # Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent. - for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} { - for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} { - assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots] - assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname] - assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip] - # Default value of 'cluster-preferred-endpoint-type' is ip. - assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint] - - if {$::tls} { - assert_equal [get_instance_attrib valkey $i plaintext-port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port] - assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port] - } else { - assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port] - } - - if {$i < 4} { - assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role] - assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health] - } else { - assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role] - # Replica could be in online or loading - } - } - } -} - -test "Verify no slot shard" { - # Node 8 has no slots assigned - set node_8_id [R 8 CLUSTER MYID] - assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots] - assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots] -} - -set node_0_id [R 0 CLUSTER MYID] - -test "Kill a node and tell the replica to immediately takeover" { - kill_instance valkey 0 - R 4 cluster failover force -} - -# Primary 0 node should report as fail, wait until the new primary acknowledges it. 
-test "Verify health as fail for killed node" { - wait_for_condition 1000 50 { - "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"] - } else { - fail "New primary never detected the node failed" - } -} - -set primary_id 4 -set replica_id 0 - -test "Restarting primary node" { - restart_instance valkey $replica_id -} - -test "Instance #0 gets converted into a replica" { - wait_for_condition 1000 50 { - [RI $replica_id role] eq {slave} - } else { - fail "Old primary was not converted into replica" - } -} - -test "Test the replica reports a loading state while it's loading" { - # Test the command is good for verifying everything moves to a happy state - set replica_cluster_id [R $replica_id CLUSTER MYID] - wait_for_condition 50 1000 { - [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online" - } else { - fail "Replica never transitioned to online" - } - - # Set 1 MB of data, so there is something to load on full sync - R $primary_id debug populate 1000 key 1000 - - # Kill replica client for primary and load new data to the primary - R $primary_id config set repl-backlog-size 100 - - # Set the key load delay so that it will take at least - # 2 seconds to fully load the data. - R $replica_id config set key-load-delay 4000 - - # Trigger event loop processing every 1024 bytes, this trigger - # allows us to send and receive cluster messages, so we are setting - # it low so that the cluster messages are sent more frequently. - R $replica_id config set loading-process-events-interval-bytes 1024 - - R $primary_id multi - R $primary_id client kill type replica - # populate the correct data - set num 100 - set value [string repeat A 1024] - for {set j 0} {$j < $num} {incr j} { - # Use hashtag valid for shard #0 - set key "{ch3}$j" - R $primary_id set $key $value - } - R $primary_id exec - - # The replica should reconnect and start a full sync, it will gossip about it's health to the primary. - wait_for_condition 50 1000 { - "loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] - } else { - fail "Replica never transitioned to loading" - } - - # Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data. - R $replica_id CLUSTER SHARDS - R $replica_id CLUSTER SLOTS - - # Speed up the key loading and verify everything resumes - R $replica_id config set key-load-delay 0 - - wait_for_condition 50 1000 { - "online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] - } else { - fail "Replica never transitioned to online" - } - - # Final sanity, the replica agrees it is online. - assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health] -} - -test "Regression test for a crash when calling SHARDS during handshake" { - # Reset forget a node, so we can use it to establish handshaking connections - set id [R 19 CLUSTER MYID] - R 19 CLUSTER RESET HARD - for {set i 0} {$i < 19} {incr i} { - R $i CLUSTER FORGET $id - } - R 19 cluster meet 127.0.0.1 [get_instance_attrib valkey 0 port] - # This should line would previously crash, since all the outbound - # connections were in handshake state. 
- R 19 CLUSTER SHARDS -} - -test "Cluster is up" { - assert_cluster_state ok -} -test "Shard ids are unique" { - set shard_ids {} - for {set i 0} {$i < 4} {incr i} { - set shard_id [R $i cluster myshardid] - assert_equal [dict exists $shard_ids $shard_id] 0 - dict set shard_ids $shard_id 1 - } -} - -test "CLUSTER MYSHARDID reports same id for both primary and replica" { - for {set i 0} {$i < 4} {incr i} { - assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid] - assert_equal [string length [R $i cluster myshardid]] 40 - } -} - -test "New replica receives primary's shard id" { - #find a primary - set id 0 - for {} {$id < 8} {incr id} { - if {[regexp "master" [R $id role]]} { - break - } - } - assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid] - assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]] - assert_equal [R 8 cluster myshardid] [R $id cluster myshardid] -} - -test "CLUSTER MYSHARDID reports same shard id after shard restart" { - set node_ids {} - for {set i 0} {$i < 8} {incr i 4} { - dict set node_ids $i [R $i cluster myshardid] - kill_instance valkey $i - wait_for_condition 50 100 { - [instance_is_killed valkey $i] - } else { - fail "instance $i is not killed" - } - } - for {set i 0} {$i < 8} {incr i 4} { - restart_instance valkey $i - } - assert_cluster_state ok - for {set i 0} {$i < 8} {incr i 4} { - assert_equal [dict get $node_ids $i] [R $i cluster myshardid] - } -} - -test "CLUSTER MYSHARDID reports same shard id after cluster restart" { - set node_ids {} - for {set i 0} {$i < 8} {incr i} { - dict set node_ids $i [R $i cluster myshardid] - } - for {set i 0} {$i < 8} {incr i} { - kill_instance valkey $i - wait_for_condition 50 100 { - [instance_is_killed valkey $i] - } else { - fail "instance $i is not killed" - } - } - for {set i 0} {$i < 8} {incr i} { - restart_instance valkey $i - } - assert_cluster_state ok - for {set i 0} {$i < 8} {incr i} { - assert_equal [dict get $node_ids $i] [R $i cluster myshardid] - } -} - -test "CLUSTER SHARDS id response validation" { - # For each node in the cluster - for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} { - # Get the CLUSTER SHARDS output from this node - set shards [R $i CLUSTER SHARDS] - set seen_shard_ids {} - - # For each shard in the output - foreach shard $shards { - set shard_dict [dict create {*}$shard] - - # 1. Verify 'id' key exists - assert {[dict exists $shard_dict id]} - set shard_id [dict get $shard_dict id] - - # 2. Verify shard_id is a 40-char string - assert {[string length $shard_id] == 40} - - # 3. Verify that for a given node's output, all shard IDs are unique - assert {[dict exists $seen_shard_ids $shard_id] == 0} - dict set seen_shard_ids $shard_id 1 - } - } -} diff --git a/tests/cluster/tests/includes/init-tests.tcl b/tests/cluster/tests/includes/init-tests.tcl deleted file mode 100644 index f1cb3a8b6..000000000 --- a/tests/cluster/tests/includes/init-tests.tcl +++ /dev/null @@ -1,91 +0,0 @@ -# Initialization tests -- most units will start including this. - -test "(init) Restart killed instances" { - foreach type {valkey} { - foreach_${type}_id id { - if {[get_instance_attrib $type $id pid] == -1} { - puts -nonewline "$type/$id " - flush stdout - restart_instance $type $id - } - } - } -} - -test "Cluster nodes are reachable" { - foreach_valkey_id id { - # Every node should be reachable. 
- wait_for_condition 1000 50 { - ([catch {R $id ping} ping_reply] == 0) && - ($ping_reply eq {PONG}) - } else { - catch {R $id ping} err - fail "Node #$id keeps replying '$err' to PING." - } - } -} - -test "Cluster nodes hard reset" { - foreach_valkey_id id { - if {$::valgrind} { - set node_timeout 10000 - } else { - set node_timeout 3000 - } - catch {R $id flushall} ; # May fail for readonly slaves. - R $id MULTI - R $id cluster reset hard - R $id cluster set-config-epoch [expr {$id+1}] - R $id EXEC - R $id config set cluster-node-timeout $node_timeout - R $id config set cluster-slave-validity-factor 10 - R $id config set loading-process-events-interval-bytes 2097152 - R $id config set key-load-delay 0 - R $id config set repl-diskless-load disabled - R $id config set cluster-announce-hostname "" - R $id DEBUG DROP-CLUSTER-PACKET-FILTER -1 - R $id config rewrite - } -} - -# Helper function to attempt to have each node in a cluster -# meet each other. -proc join_nodes_in_cluster {} { - # Join node 0 with 1, 1 with 2, ... and so forth. - # If auto-discovery works all nodes will know every other node - # eventually. - set ids {} - foreach_valkey_id id {lappend ids $id} - for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} { - set a [lindex $ids $j] - set b [lindex $ids [expr $j+1]] - set b_port [get_instance_attrib valkey $b port] - R $a cluster meet 127.0.0.1 $b_port - } - - foreach_valkey_id id { - wait_for_condition 1000 50 { - [llength [get_cluster_nodes $id connected]] == [llength $ids] - } else { - return 0 - } - } - return 1 -} - -test "Cluster Join and auto-discovery test" { - # Use multiple attempts since sometimes nodes timeout - # while attempting to connect. - for {set attempts 3} {$attempts > 0} {incr attempts -1} { - if {[join_nodes_in_cluster] == 1} { - break - } - } - if {$attempts == 0} { - fail "Cluster failed to form full mesh" - } -} - -test "Before slots allocation, all nodes report cluster failure" { - assert_cluster_state fail -} diff --git a/tests/cluster/tests/includes/utils.tcl b/tests/cluster/tests/includes/utils.tcl deleted file mode 100644 index 73c74075f..000000000 --- a/tests/cluster/tests/includes/utils.tcl +++ /dev/null @@ -1,36 +0,0 @@ -source "../../../tests/support/cli.tcl" - -proc config_set_all_nodes {keyword value} { - foreach_valkey_id id { - R $id config set $keyword $value - } -} - -proc fix_cluster {addr} { - set code [catch { - exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster fix $addr << yes - } result] - if {$code != 0} { - puts "valkey-cli --cluster fix returns non-zero exit code, output below:\n$result" - } - # Note: valkey-cli --cluster fix may return a non-zero exit code if nodes don't agree, - # but we can ignore that and rely on the check below. 
- assert_cluster_state ok - wait_for_condition 100 100 { - [catch {exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster check $addr} result] == 0 - } else { - puts "valkey-cli --cluster check returns non-zero exit code, output below:\n$result" - fail "Cluster could not settle with configuration" - } -} - -proc wait_cluster_stable {} { - wait_for_condition 1000 50 { - [catch {exec $::VALKEY_CLI_BIN --cluster \ - check 127.0.0.1:[get_instance_attrib valkey 0 port] \ - {*}[valkeycli_tls_config "../../../tests"] \ - }] == 0 - } else { - fail "Cluster doesn't stabilize" - } -} \ No newline at end of file diff --git a/tests/cluster/tmp/.gitignore b/tests/cluster/tmp/.gitignore deleted file mode 100644 index 72e8ffc0d..000000000 --- a/tests/cluster/tmp/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* diff --git a/tests/cluster/tests/helpers/onlydots.tcl b/tests/helpers/onlydots.tcl similarity index 100% rename from tests/cluster/tests/helpers/onlydots.tcl rename to tests/helpers/onlydots.tcl diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index a96ca12ba..39fffbf23 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -148,6 +148,7 @@ proc wait_for_cluster_size {cluster_size} { # Check that cluster nodes agree about "state", or raise an error. proc wait_for_cluster_state {state} { for {set j 0} {$j < [llength $::servers]} {incr j} { + if {![process_is_alive [srv -$j pid]]} continue if {[process_is_paused [srv -$j pid]]} continue wait_for_condition 1000 50 { [CI $j cluster_state] eq $state diff --git a/tests/unit/cluster/cluster-shards.tcl b/tests/unit/cluster/cluster-shards.tcl index 170114d82..409d011f4 100644 --- a/tests/unit/cluster/cluster-shards.tcl +++ b/tests/unit/cluster/cluster-shards.tcl @@ -53,3 +53,289 @@ start_cluster 3 3 {tags {external:skip cluster}} { assert_equal $shard_0_slot_coverage [dict get [get_node_info_from_shard $node_0_id $validation_node "shard"] "slots"] } } +# Initial slot distribution for split-slot cluster tests. +set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926] +set ::slot1 [list 5460 5460 5462 10922 10925 10925] +set ::slot2 [list 10923 10924 10927 16383] +set ::slot3 [list 1001 1001] + +# Slot allocator: assigns split slots to each master. +proc split_slot_allocation {masters replicas} { + for {set j 0} {$j < $masters} {incr j} { + R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}] + } +} + +# Replica allocator: allocates one replica fewer than requested, leaving the +# last server (R 8) as a standalone no-slot node for testing purposes. +proc split_slot_replica_allocation {masters replicas} { + cluster_allocate_replicas $masters [expr {$replicas - 1}] +} + +proc cluster_ensure_master {id} { + if { [regexp "master" [R $id role]] == 0 } { + assert_equal {OK} [R $id CLUSTER FAILOVER] + wait_for_condition 50 100 { + [regexp "master" [R $id role]] == 1 + } else { + fail "instance $id is not master" + } + } +} + +# start_cluster 4 masters + 5 nodes (4 replicas + 1 standalone R8) +start_cluster 4 5 {tags {external:skip cluster}} { + +# cluster_master_nodes and cluster_replica_nodes refer to the active cluster members.
+set ::cluster_master_nodes 4 +set ::cluster_replica_nodes 4 + +test "Cluster should start ok" { + wait_for_cluster_state ok +} + +test "Set cluster hostnames and verify they are propagated" { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + R $j config set cluster-announce-hostname "host-$j.com" + } + + # Wait for everyone to agree about the state + wait_for_cluster_propagation +} + +test "Verify information about the shards" { + set ids {} + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + lappend ids [R $j CLUSTER MYID] + } + set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3] + + # Verify that on each node (primary/replica) the response of the `CLUSTER SHARDS` command is consistent. + for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} { + for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} { + assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots] + assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname] + assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip] + # Default value of 'cluster-preferred-endpoint-type' is ip. + assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint] + + if {$::tls} { + assert_equal [srv [expr -1*$i] pport] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port] + assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port] + } else { + assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port] + } + + if {$i < 4} { + assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role] + assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health] + } else { + assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role] + # Replica could be in online or loading + } + } + } +} + +test "Verify no slot shard" { + # R 8 is a standalone node with no slots assigned (left standalone by split_slot_replica_allocation) + set node_8_id [R 8 CLUSTER MYID] + assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots] + assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots] +} + +set node_0_id [R 0 CLUSTER MYID] + +test "Kill a node and tell the replica to immediately takeover" { + pause_process [srv 0 pid] + R 4 cluster failover force +} + +# Primary 0 node should report as fail; wait until the new primary acknowledges it.
+test "Verify health as fail for killed node" { + wait_for_condition 1000 50 { + "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"] + } else { + fail "New primary never detected the node failed" + } +} + +set primary_id 4 +set replica_id 0 + +test "Restarting primary node" { + restart_server [expr -1*$replica_id] true false +} + +test "Instance #0 gets converted into a replica" { + wait_for_condition 1000 50 { + [s [expr -1*$replica_id] role] eq {slave} + } else { + fail "Old primary was not converted into replica" + } +} + +test "Test the replica reports a loading state while it's loading" { + # Test the command is good for verifying everything moves to a happy state + set replica_cluster_id [R $replica_id CLUSTER MYID] + wait_for_condition 50 1000 { + [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online" + } else { + fail "Replica never transitioned to online" + } + + # Set 1 MB of data, so there is something to load on full sync + R $primary_id debug populate 1000 key 1000 + + # Kill replica client for primary and load new data to the primary + R $primary_id config set repl-backlog-size 100 + + # Set the key load delay so that it will take at least + # 2 seconds to fully load the data. + R $replica_id config set key-load-delay 4000 + + # Trigger event loop processing every 1024 bytes, this trigger + # allows us to send and receive cluster messages, so we are setting + # it low so that the cluster messages are sent more frequently. + R $replica_id config set loading-process-events-interval-bytes 1024 + + R $primary_id multi + R $primary_id client kill type replica + # populate the correct data + set num 100 + set value [string repeat A 1024] + for {set j 0} {$j < $num} {incr j} { + # Use hashtag valid for shard #0 + set key "{ch3}$j" + R $primary_id set $key $value + } + R $primary_id exec + + # The replica should reconnect and start a full sync, it will gossip about it's health to the primary. + wait_for_condition 50 1000 { + "loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] + } else { + fail "Replica never transitioned to loading" + } + + # Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data. + R $replica_id CLUSTER SHARDS + R $replica_id CLUSTER SLOTS + + # Speed up the key loading and verify everything resumes + R $replica_id config set key-load-delay 0 + + wait_for_condition 50 1000 { + "online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] + } else { + fail "Replica never transitioned to online" + } + + # Final sanity, the replica agrees it is online. + assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health] +} + +test "Regression test for a crash when calling SHARDS during handshake" { + # Use R 8 (standalone node) to establish handshaking connections + set id [R 8 CLUSTER MYID] + R 8 CLUSTER RESET HARD + for {set i 0} {$i < 8} {incr i} { + R $i CLUSTER FORGET $id + } + R 8 cluster meet 127.0.0.1 [srv 0 port] + # This line would previously crash, since all the outbound + # connections were in handshake state. 
+ R 8 CLUSTER SHARDS +} + +test "Cluster is up" { + wait_for_cluster_state ok +} + +test "Shard ids are unique" { + set shard_ids {} + for {set i 0} {$i < 4} {incr i} { + set shard_id [R $i cluster myshardid] + assert_equal [dict exists $shard_ids $shard_id] 0 + dict set shard_ids $shard_id 1 + } +} + +test "CLUSTER MYSHARDID reports same id for both primary and replica" { + for {set i 0} {$i < 4} {incr i} { + assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid] + assert_equal [string length [R $i cluster myshardid]] 40 + } +} + +test "New replica receives primary's shard id" { + # find a primary + set id 0 + for {} {$id < 8} {incr id} { + if {[regexp "master" [R $id role]]} { + break + } + } + assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid] + assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]] + assert_equal [R 8 cluster myshardid] [R $id cluster myshardid] +} + +test "CLUSTER MYSHARDID reports same shard id after shard restart" { + set node_ids {} + for {set i 0} {$i < 8} {incr i 4} { + dict set node_ids $i [R $i cluster myshardid] + pause_process [srv [expr -1*$i] pid] + } + for {set i 0} {$i < 8} {incr i 4} { + restart_server [expr -1*$i] true false + } + wait_for_cluster_state ok + for {set i 0} {$i < 8} {incr i 4} { + assert_equal [dict get $node_ids $i] [R $i cluster myshardid] + } +} + +test "CLUSTER MYSHARDID reports same shard id after cluster restart" { + set node_ids {} + for {set i 0} {$i < 8} {incr i} { + dict set node_ids $i [R $i cluster myshardid] + } + for {set i 0} {$i < 8} {incr i} { + pause_process [srv [expr -1*$i] pid] + } + for {set i 0} {$i < 8} {incr i} { + restart_server [expr -1*$i] true false + } + wait_for_cluster_state ok + for {set i 0} {$i < 8} {incr i} { + assert_equal [dict get $node_ids $i] [R $i cluster myshardid] + } +} + +test "CLUSTER SHARDS id response validation" { + # For each node in the cluster + for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} { + # Get the CLUSTER SHARDS output from this node + set shards [R $i CLUSTER SHARDS] + set seen_shard_ids {} + + # For each shard in the output + foreach shard $shards { + set shard_dict [dict create {*}$shard] + + # 1. Verify 'id' key exists + assert {[dict exists $shard_dict id]} + set shard_id [dict get $shard_dict id] + + # 2. Verify shard_id is a 40-char string + assert {[string length $shard_id] == 40} + + # 3. Verify that for a given node's output, all shard IDs are unique + assert {[dict exists $seen_shard_ids $shard_id] == 0} + dict set seen_shard_ids $shard_id 1 + } + } +} + +} split_slot_allocation split_slot_replica_allocation diff --git a/tests/unit/cluster/failover.tcl b/tests/unit/cluster/failover.tcl index e3c793132..5e4e80e52 100644 --- a/tests/unit/cluster/failover.tcl +++ b/tests/unit/cluster/failover.tcl @@ -152,3 +152,108 @@ start_cluster 3 1 {tags {external:skip cluster}} { } } } ;# start_cluster + +# Failover stress test. +# In this test a different node is killed in a loop for N +# iterations. The test checks that certain properties +# are preserved across iterations. +start_cluster 5 5 {tags {external:skip cluster}} { + set iterations 10 + set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] + + while {[incr iterations -1]} { + set tokill [randomInt 10] + set other [expr {($tokill+1)%10}] ; # Some other instance. 
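+        # Random key/value pair: written before the failover and read back at
+        # the end of the iteration to verify the writes survive.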
+        set key [randstring 20 20 alpha]
+        set val [randstring 20 20 alpha]
+        set role [s [expr -1*$tokill] role]
+        if {$role eq {master}} {
+            set slave {}
+            set myid [dict get [cluster_get_myself $tokill] id]
+            for {set id 0} {$id < [llength $::servers]} {incr id} {
+                if {$id == $tokill} continue
+                if {[dict get [cluster_get_myself $id] slaveof] eq $myid} {
+                    set slave $id
+                }
+            }
+            if {$slave eq {}} {
+                fail "Unable to retrieve slave's ID for master #$tokill"
+            }
+        }
+
+        if {$role eq {master}} {
+            test "Wait for slave of #$tokill to sync" {
+                wait_for_condition 1000 50 {
+                    [string match {*state=online*} [s [expr -1*$tokill] slave0]]
+                } else {
+                    fail "Slave of node #$tokill is not ok"
+                }
+            }
+            set slave_config_epoch [CI $slave cluster_my_epoch]
+        }
+
+        test "Cluster is writable before failover" {
+            for {set i 0} {$i < 100} {incr i} {
+                catch {$cluster set $key:$i $val:$i} err
+                assert {$err eq {OK}}
+            }
+            # Wait for the writes to propagate to the slave if we
+            # are going to kill a master.
+            if {$role eq {master}} {
+                R $tokill wait 1 20000
+            }
+        }
+
+        test "Terminating node #$tokill" {
+            catch {R $tokill shutdown nosave}
+        }
+
+        if {$role eq {master}} {
+            test "Wait for failover by #$slave with old epoch $slave_config_epoch" {
+                wait_for_condition 1000 50 {
+                    [CI $slave cluster_my_epoch] > $slave_config_epoch
+                } else {
+                    fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
+                }
+            }
+        }
+
+        test "Cluster should eventually be up again" {
+            wait_for_cluster_state ok
+        }
+
+        test "Cluster is writable again" {
+            for {set i 0} {$i < 100} {incr i} {
+                catch {$cluster set $key:$i:2 $val:$i:2} err
+                assert {$err eq {OK}}
+            }
+        }
+
+        test "Restarting node #$tokill" {
+            restart_server [expr -1*$tokill] true false
+        }
+
+        test "Instance #$tokill is now a slave" {
+            wait_for_condition 1000 50 {
+                [s [expr -1*$tokill] role] eq {slave}
+            } else {
+                fail "Restarted instance is not a slave"
+            }
+        }
+
+        test "We can read back the value we set before" {
+            for {set i 0} {$i < 100} {incr i} {
+                catch {$cluster get $key:$i} err
+                assert {$err eq "$val:$i"}
+                catch {$cluster get $key:$i:2} err
+                assert {$err eq "$val:$i:2"}
+            }
+        }
+    }
+
+    test "Post condition: current_epoch >= my_epoch everywhere" {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
+        }
+    }
+} ;# start_cluster
diff --git a/tests/unit/cluster/replica-migration-slow.tcl b/tests/unit/cluster/replica-migration-slow.tcl
new file mode 100644
index 000000000..c0d80c80a
--- /dev/null
+++ b/tests/unit/cluster/replica-migration-slow.tcl
@@ -0,0 +1,76 @@
+# Check that a primary that lost all of its slots (and thus became a target
+# for replica migration) re-acquires replicas once it is assigned slots
+# again, in a cluster where the other primaries have replicas.
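+#
+# This unit is tagged "slow" and wrapped in run_solo so it does not run in
+# parallel with other test units; it is also skipped under valgrind
+# (valgrind:skip) because the 20-node cluster makes it very slow there.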
+tags {"slow valgrind:skip"} { +run_solo {cluster-replica-migration-slow} { +start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration yes}} { + test "Primary #0 should re-acquire one or more replicas" { + # Resharding all the primary #0 slots away from it + set primary0_id [dict get [cluster_get_myself 0] id] + set output [exec \ + $::VALKEY_CLI_BIN --cluster rebalance \ + 127.0.0.1:[srv 0 port] \ + {*}[valkeycli_tls_config "./tests"] \ + --cluster-weight ${primary0_id}=0 >@ stdout] + + # Primary #0 who lost all slots should turn into a replica without replicas + wait_for_condition 1000 50 { + [s 0 role] eq "slave" && [s 0 connected_slaves] == 0 + } else { + puts [R 0 info replication] + fail "Primary #0 didn't turn itself into a replica" + } + + # Resharding back some slot to primary #0 + # Wait for the cluster config to propagate before attempting a + # new resharding. + wait_for_cluster_propagation + set output [exec \ + $::VALKEY_CLI_BIN --cluster rebalance \ + 127.0.0.1:[srv 0 port] \ + {*}[valkeycli_tls_config "./tests"] \ + --cluster-weight ${primary0_id}=.01 \ + --cluster-use-empty-masters >@ stdout] + + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] >= 1 + } else { + fail "Primary #0 has no has replicas" + } + } +} ;# start_cluster +} ;# run_solo +} ;# tag + +# Check that if 'cluster-allow-replica-migration' is set to 'no', replicas do not +# migrate when primary becomes empty. +tags {"slow valgrind:skip"} { +run_solo {cluster-replica-migration-slow} { +start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no}} { + test "Each primary should have at least two replicas attached" { + # Resharding all the primary #0 slots away from it + set primary0_id [dict get [cluster_get_myself 0] id] + set output [exec \ + $::VALKEY_CLI_BIN --cluster rebalance \ + 127.0.0.1:[srv 0 port] \ + {*}[valkeycli_tls_config "./tests"] \ + --cluster-weight ${primary0_id}=0 >@ stdout] + + wait_for_cluster_propagation + wait_for_cluster_state "ok" + + # Primary #0 still should have its replicas + assert { [llength [lindex [R 0 role] 2]] >= 2 } + + # Each primary should have at least two replicas attached + for {set id 0} {$id < 5} {incr id} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 2 + } else { + fail "Primary #$id does not have 2 replicas as expected" + } + } + } +} ;# start_cluster +} ;# run_solo +} ;# tag diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl index fb75fb386..bb3f091b2 100644 --- a/tests/unit/cluster/replica-migration.tcl +++ b/tests/unit/cluster/replica-migration.tcl @@ -433,3 +433,73 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout assert_equal [R 2 dbsize] 0 } } my_slot_allocation cluster_allocate_replicas ;# start_cluster + +# Replica migration test. +# Check that orphaned primaries are joined by replicas of primaries having +# multiple replicas attached, according to the migration barrier settings. +start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} { + # Killing all the replicas of primary #0 and #1 + pause_process [srv -5 pid] + pause_process [srv -10 pid] + pause_process [srv -6 pid] + pause_process [srv -11 pid] + + # R0 and R1 will trigger replica migration. 
+    wait_for_condition 1000 50 {
+        [expr {[count_log_message -7 "Migrating to orphaned primary"] +
+               [count_log_message -8 "Migrating to orphaned primary"] +
+               [count_log_message -9 "Migrating to orphaned primary"] +
+               [count_log_message -12 "Migrating to orphaned primary"] +
+               [count_log_message -13 "Migrating to orphaned primary"] +
+               [count_log_message -14 "Migrating to orphaned primary"]}] == 2
+    } else {
+        fail "Replicas did not trigger replica migration"
+    }
+
+    # Each primary should have at least one replica
+    for {set id 0} {$id < 5} {incr id} {
+        wait_for_condition 1000 50 {
+            [llength [lindex [R $id role] 2]] >= 1
+        } else {
+            fail "Primary #$id has no replicas"
+        }
+    }
+
+    resume_process [srv -5 pid]
+    resume_process [srv -10 pid]
+    resume_process [srv -6 pid]
+    resume_process [srv -11 pid]
+} ;# start_cluster
+
+# Now test the migration to a primary which used to be a replica, after
+# a failover.
+start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
+    test "Primary #12 should get at least one migrated replica" {
+        # Kill replica #7 of primary #2. The only replica left is #12 now.
+        pause_process [srv -7 pid]
+
+        # Kill primary node #2; #12 should fail over.
+        set current_epoch [CI 1 cluster_current_epoch]
+        pause_process [srv -2 pid]
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_current_epoch] > $current_epoch
+        } else {
+            fail "No failover detected"
+        }
+
+        wait_for_cluster_state ok
+        cluster_write_test [srv -1 port]
+        assert {[s -12 role] eq {master}}
+
+        # The remaining instance is now without replicas. Some other replica
+        # should migrate to it.
+        wait_for_condition 1000 50 {
+            [llength [lindex [R 12 role] 2]] >= 1
+        } else {
+            fail "Primary #12 has no replicas"
+        }
+
+        resume_process [srv -7 pid]
+        resume_process [srv -2 pid]
+    }
+} ;# start_cluster
diff --git a/tests/unit/cluster/resharding.tcl b/tests/unit/cluster/resharding.tcl
new file mode 100644
index 000000000..b56fb2f2c
--- /dev/null
+++ b/tests/unit/cluster/resharding.tcl
@@ -0,0 +1,185 @@
+# Resharding test.
+# In this test a live resharding is performed and the test checks
+# that certain properties are preserved across the operation.
+tags {"slow"} {
+run_solo {cluster-resharding} {
+start_cluster 5 5 {tags {external:skip cluster}} {
+    test "Enable AOF in all the instances" {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            R $id config set appendonly yes
+            # We use "appendfsync no" because it's fast but also guarantees that
+            # write(2) is performed before replying to the client.
+            R $id config set appendfsync no
+        }
+
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            wait_for_condition 1000 500 {
+                [s [expr -1*$id] aof_rewrite_in_progress] == 0 &&
+                [s [expr -1*$id] aof_enabled] == 1
+            } else {
+                fail "Failed to enable AOF on instance #$id"
+            }
+        }
+    }
+
+    # Our resharding test performs the following actions:
+    #
+    # - N commands are sent to the cluster in the course of the test.
+    # - Every command selects a random key from key:0 to key:MAX-1.
+    # - The operation RPUSH key <ele> is performed.
+    # - Tcl remembers into an array all the values pushed to each list.
+    # - After N/2 commands, the resharding process is started in the background.
+    # - The test continues while the resharding is in progress.
+    # - At the end of the test, we wait for the resharding process to stop.
+    # - Finally the keys are checked to see if they contain the values they should.
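+    #
+    # Sizing note: 200k ops over 50k keys keep the workload running long
+    # after the background resharding of 100 slots kicks in at the halfway
+    # point, so plenty of writes land while slots are actually moving.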
+
+    set numkeys 50000
+    set numops 200000
+    set start_node_port [srv 0 port]
+    set cluster [valkey_cluster 127.0.0.1:$start_node_port]
+    if {$::tls} {
+        # Set up a non-TLS cluster client to the TLS cluster
+        set plaintext_port [srv 0 pport]
+        set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
+        puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
+    } else {
+        set cluster_plaintext $cluster
+        puts "Testing using non-TLS cluster"
+    }
+    catch {unset content}
+    array set content {}
+    set tribpid {}
+
+    test "Cluster consistency during live resharding" {
+        set ele 0
+        for {set j 0} {$j < $numops} {incr j} {
+            # Periodically check whether the resharding process terminated.
+            if {$tribpid ne {} &&
+                ($j % 10000) == 0 &&
+                ![process_is_alive $tribpid]} {
+                set tribpid {}
+            }
+
+            # Trigger the resharding once we have executed half the ops.
+            if {$j >= $numops/2 && $tribpid eq {}} {
+                puts -nonewline "...Starting resharding..."
+                flush stdout
+                set target [dict get [cluster_get_myself [randomInt 5]] id]
+                set tribpid [lindex [exec \
+                    $::VALKEY_CLI_BIN --cluster reshard \
+                    127.0.0.1:[srv 0 port] \
+                    --cluster-from all \
+                    --cluster-to $target \
+                    --cluster-slots 100 \
+                    --cluster-yes \
+                    {*}[valkeycli_tls_config "./tests"] \
+                    | [info nameofexecutable] \
+                    tests/helpers/onlydots.tcl \
+                    &] 0]
+            }
+
+            # Write random data to a random list.
+            set listid [randomInt $numkeys]
+            set key "key:$listid"
+            incr ele
+            # We write both with Lua scripts and with plain commands.
+            # This way we are also able to stress Lua -> server command
+            # invocation, which has checks to prevent Lua from writing
+            # into the wrong hash slots.
+            # We also use both TLS and plaintext connections.
+            if {$listid % 3 == 0} {
+                $cluster rpush $key $ele
+            } elseif {$listid % 3 == 1} {
+                $cluster_plaintext rpush $key $ele
+            } else {
+                $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
+            }
+            lappend content($key) $ele
+
+            if {($j % 1000) == 0} {
+                puts -nonewline W; flush stdout
+            }
+        }
+
+        # Wait for the resharding process to end
+        wait_for_condition 1000 500 {
+            [process_is_alive $tribpid] == 0
+        } else {
+            fail "Resharding did not terminate in time."
+        }
+        wait_for_cluster_propagation
+    }
+
+    test "Verify $numkeys keys for consistency with logical content" {
+        # Check that the cluster content matches our logical content.
+        foreach {key value} [array get content] {
+            if {[$cluster lrange $key 0 -1] ne $value} {
+                fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
+            }
+        }
+    }
+
+    test "Terminate and restart all the instances" {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
+            R $id config set appendonly no
+            R $id bgsave
+            wait_for_condition 1000 50 {
+                [s [expr -1*$id] rdb_bgsave_in_progress] == 0
+            } else {
+                fail "bgsave didn't finish for instance #$id"
+            }
+            restart_server [expr -1*$id] true false
+        }
+    }
+
+    test "Cluster should eventually be up again" {
+        wait_for_cluster_state ok
+    }
+
+    test "Verify $numkeys keys after the restart" {
+        # Check that the cluster content matches our logical content.
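+        # Same check as before the restart: after reloading from disk, every
+        # list must still hold exactly the sequence of elements we pushed.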
+        foreach {key value} [array get content] {
+            if {[$cluster lrange $key 0 -1] ne $value} {
+                fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
+            }
+        }
+    }
+
+    test "Disable AOF in all the instances" {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            R $id config set appendonly no
+        }
+    }
+
+    test "Verify slave consistency" {
+        set verified_masters 0
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            set role [R $id role]
+            lassign $role myrole myoffset slaves
+            if {$myrole eq {slave}} continue
+            set masterport [srv [expr -1*$id] port]
+            set masterdigest [R $id debug digest]
+            for {set sid 0} {$sid < [llength $::servers]} {incr sid} {
+                set srole [R $sid role]
+                if {[lindex $srole 0] eq {master}} continue
+                if {[lindex $srole 2] != $masterport} continue
+                wait_for_condition 1000 500 {
+                    [R $sid debug digest] eq $masterdigest
+                } else {
+                    fail "Master and slave data digests are different"
+                }
+                incr verified_masters
+            }
+        }
+        assert {$verified_masters >= 5}
+    }
+
+    test "Dump sanitization was skipped for migrations" {
+        for {set id 0} {$id < [llength $::servers]} {incr id} {
+            assert {[s [expr -1*$id] dump_payload_sanitizations] == 0}
+        }
+    }
+
+} ;# start_cluster
+} ;# run_solo
+} ;# tag