Mirror of https://github.com/valkey-io/valkey.git
Synced 2026-05-06 05:26:42 -04:00
Migrated the remaining cluster tests to tests/unit/cluster/ to use the same
framework for all cluster tests. Cleaned up the obsolete cluster test framework
files and updated the CI workflows to use the new unified test runner.

Changes:

Moved and mapped 6 test files:
- 03-failover-loop.tcl → Merged into existing failover.tcl
- 04-resharding.tcl → resharding.tcl
- 12-replica-migration-2.tcl + 12.1-replica-migration-3.tcl → replica-migration-slow.tcl
- 07-replica-migration.tcl → Merged into existing replica-migration.tcl
- 28-cluster-shards.tcl → Merged into existing cluster-shards.tcl

Other changes:
- Converted old framework APIs (e.g., K, RI) to new framework APIs (e.g., R, srv)
- Added a process_is_alive check in cluster_util.tcl to fix an exception in
  failover tests caused by executing ps on dead processes
- Heavy tests (resharding, replica-migration-slow) are marked with the slow tag
  and wrapped in run_solo to prevent resource contention in sanitizer environments
- replica-migration-slow is marked with the valgrind:skip tag since it is very slow
- Removed the entire tests/cluster/ directory, including run.tcl, cluster.tcl,
  includes/, and helpers/
- Kept runtest-cluster as a wrapper script (exec ./runtest --cluster "$@")
- Removed ./runtest-cluster calls from .github/workflows/daily.yml, as cluster
  tests are now included in ./runtest

Closes #2297.

Signed-off-by: Jun Yeong Kim <junyeonggim5@gmail.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
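Note on the API conversion mentioned above: the old tests/cluster framework and
the new unified framework reach the same servers through different helpers. A
minimal sketch of the mapping, using only the helper names visible elsewhere in
this diff (illustrative, not part of the commit):

    # Old instance-based framework (tests/cluster), as in the deleted files below:
    #   RI $id role                          ;# INFO field of instance $id
    #   get_instance_attrib valkey $id port  ;# per-instance attribute lookup
    #
    # New unified framework (tests/unit/cluster), as in the added hunks below:
    #   R $id role                  ;# sending a command to node $id is unchanged
    #   srv [expr -1*$id] port      ;# server attributes are looked up via srv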
[modified: .github/workflows/daily.yml]

@@ -125,9 +125,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -211,9 +208,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -276,9 +270,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -334,9 +325,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-ubuntu-no-malloc-usable-size:
     runs-on: ubuntu-latest
     if: |
@@ -384,9 +372,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-ubuntu-32bit:
     runs-on: ubuntu-latest
     if: |
@@ -457,9 +442,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -522,10 +504,6 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: |
           ./runtest-sentinel --tls ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: |
-          ./runtest-cluster --tls ${{github.event.inputs.cluster_test_args}}
   test-ubuntu-tls-no-tls:
     runs-on: ubuntu-latest
     if: |
@@ -578,10 +556,6 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: |
           ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: |
-          ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-ubuntu-io-threads:
     runs-on: ubuntu-latest
     if: |
@@ -623,9 +597,6 @@ jobs:
       - name: test
         if: true && !contains(github.event.inputs.skiptests, 'valkey')
         run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}
   test-ubuntu-tls-io-threads:
     runs-on: ubuntu-latest
     if: |
@@ -670,10 +641,6 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'valkey')
         run: |
           ./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: |
-          ./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}}
   test-ubuntu-reclaim-cache:
     runs-on: ubuntu-latest
     if: |
@@ -991,9 +958,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -1129,9 +1093,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -1263,9 +1224,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
       - name: unittest
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: |
@@ -1324,9 +1282,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-rpm-distros-jemalloc:
     if: |
       (github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1399,9 +1354,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-rpm-distros-tls-module:
     if: |
       (github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1479,10 +1431,6 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: |
           ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: |
-          ./runtest-cluster --tls-module ${{github.event.inputs.cluster_test_args}}
   test-rpm-distros-tls-module-no-tls:
     if: |
       (github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1555,10 +1503,6 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: |
           ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: |
-          ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-macos-latest:
     runs-on: macos-latest
     if: |
@@ -1676,9 +1620,6 @@ jobs:
       - run: cd libbacktrace && ./configure && make && sudo make install
       - name: make
         run: make SERVER_CFLAGS='-Werror' USE_LIBBACKTRACE=yes
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   build-old-macos-versions:
     strategy:
       fail-fast: false
@@ -1806,9 +1747,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   test-alpine-libc-malloc:
     runs-on: ubuntu-latest
     if: |
@@ -1859,9 +1797,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
   reply-schemas-validator:
     runs-on: ubuntu-latest
     timeout-minutes: 1440
@@ -1909,9 +1844,6 @@ jobs:
       - name: sentinel tests
         if: true && !contains(github.event.inputs.skiptests, 'sentinel')
         run: ./runtest-sentinel --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}}
-      - name: cluster tests
-        if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}}
       - name: Install Python dependencies
         uses: py-actions/py-dependency-install@30aa0023464ed4b5b116bd9fbdab87acf01a484e # v4.1.0
         with:
[modified: runtest-cluster (+1, -13)]

@@ -1,14 +1,2 @@
 #!/bin/sh
-TCL_VERSIONS="8.5 8.6 9.0"
-TCLSH=""
-
-for VERSION in $TCL_VERSIONS; do
-    TCL=`which tclsh$VERSION 2>/dev/null` && TCLSH=$TCL
-done
-
-if [ -z $TCLSH ]
-then
-    echo "You need tcl 8.5 or newer in order to run the Valkey Cluster test"
-    exit 1
-fi
-$TCLSH tests/cluster/run.tcl $*
+exec ./runtest --cluster "$@"
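With this change, runtest-cluster is a thin compatibility shim: it just execs
the unified runner, so any arguments keep working unchanged. For example,
./runtest-cluster --tls is now equivalent to ./runtest --cluster --tls.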
[deleted: tests/cluster/cluster.tcl]

@@ -1,238 +0,0 @@
-# Cluster-specific test functions.
-#
-# Copyright (C) 2014 Redis Ltd.
-# This software is released under the BSD License. See the COPYING file for
-# more information.
-
-# Track cluster configuration as created by create_cluster below
-set ::cluster_master_nodes 0
-set ::cluster_replica_nodes 0
-
-# Returns a parsed CLUSTER NODES output as a list of dictionaries. Optional status field
-# can be specified to only return entries that match the provided status.
-proc get_cluster_nodes {id {status "*"}} {
-    set lines [split [R $id cluster nodes] "\r\n"]
-    set nodes {}
-    foreach l $lines {
-        set l [string trim $l]
-        if {$l eq {}} continue
-        set args [split $l]
-        set node [dict create \
-            id [lindex $args 0] \
-            addr [lindex $args 1] \
-            flags [split [lindex $args 2] ,] \
-            slaveof [lindex $args 3] \
-            ping_sent [lindex $args 4] \
-            pong_recv [lindex $args 5] \
-            config_epoch [lindex $args 6] \
-            linkstate [lindex $args 7] \
-            slots [lrange $args 8 end] \
-        ]
-        if {[string match $status [lindex $args 7]]} {
-            lappend nodes $node
-        }
-    }
-    return $nodes
-}
-
-# Test node for flag.
-proc has_flag {node flag} {
-    expr {[lsearch -exact [dict get $node flags] $flag] != -1}
-}
-
-# Returns the parsed myself node entry as a dictionary.
-proc get_myself id {
-    set nodes [get_cluster_nodes $id]
-    foreach n $nodes {
-        if {[has_flag $n myself]} {return $n}
-    }
-    return {}
-}
-
-# Get a specific node by ID by parsing the CLUSTER NODES output
-# of the instance number 'instance_id'
-proc get_node_by_id {instance_id node_id} {
-    set nodes [get_cluster_nodes $instance_id]
-    foreach n $nodes {
-        if {[dict get $n id] eq $node_id} {return $n}
-    }
-    return {}
-}
-
-# Return the value of the specified CLUSTER INFO field.
-proc CI {n field} {
-    get_info_field [R $n cluster info] $field
-}
-
-# Return the value of the specified INFO field.
-proc s {n field} {
-    get_info_field [R $n info] $field
-}
-
-# Assuming nodes are reset, this function performs slots allocation.
-# Only the first 'n' nodes are used.
-proc cluster_allocate_slots {n} {
-    set slot 16383
-    while {$slot >= 0} {
-        # Allocate successive slots to random nodes.
-        set node [randomInt $n]
-        lappend slots_$node $slot
-        incr slot -1
-    }
-    for {set j 0} {$j < $n} {incr j} {
-        R $j cluster addslots {*}[set slots_${j}]
-    }
-}
-
-# Check that cluster nodes agree about "state", or raise an error.
-proc assert_cluster_state {state} {
-    foreach_valkey_id id {
-        if {[instance_is_killed valkey $id]} continue
-        wait_for_condition 1000 50 {
-            [CI $id cluster_state] eq $state
-        } else {
-            fail "Cluster node $id cluster_state:[CI $id cluster_state]"
-        }
-    }
-}
-
-# Search the first node starting from ID $first that is not
-# already configured as a slave.
-proc cluster_find_available_slave {first} {
-    foreach_valkey_id id {
-        if {$id < $first} continue
-        if {[instance_is_killed valkey $id]} continue
-        set me [get_myself $id]
-        if {[dict get $me slaveof] eq {-}} {return $id}
-    }
-    fail "No available slaves"
-}
-
-# Add 'slaves' slaves to a cluster composed of 'masters' masters.
-# It assumes that masters are allocated sequentially from instance ID 0
-# to N-1.
-proc cluster_allocate_slaves {masters slaves} {
-    for {set j 0} {$j < $slaves} {incr j} {
-        set master_id [expr {$j % $masters}]
-        set slave_id [cluster_find_available_slave $masters]
-        set master_myself [get_myself $master_id]
-        R $slave_id cluster replicate [dict get $master_myself id]
-    }
-}
-
-# Create a cluster composed of the specified number of masters and slaves.
-proc create_cluster {masters slaves} {
-    cluster_allocate_slots $masters
-    if {$slaves} {
-        cluster_allocate_slaves $masters $slaves
-    }
-    assert_cluster_state ok
-
-    set ::cluster_master_nodes $masters
-    set ::cluster_replica_nodes $slaves
-}
-
-proc cluster_allocate_with_continuous_slots {n} {
-    set slot 16383
-    set avg [expr ($slot+1) / $n]
-    while {$slot >= 0} {
-        set node [expr $slot/$avg >= $n ? $n-1 : $slot/$avg]
-        lappend slots_$node $slot
-        incr slot -1
-    }
-    for {set j 0} {$j < $n} {incr j} {
-        R $j cluster addslots {*}[set slots_${j}]
-    }
-}
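# [editor's note] Worked example for the allocator above: with n = 5 masters,
# avg = 16384/5 = 3276 (integer division), so node j serves slots
# j*3276 .. j*3276+3275 for j < 4, while node 4 takes the remainder,
# slots 13104-16383.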
-
-# Create a cluster composed of the specified number of masters and slaves,
-# but with a continuous slot range.
-proc cluster_create_with_continuous_slots {masters slaves} {
-    cluster_allocate_with_continuous_slots $masters
-    if {$slaves} {
-        cluster_allocate_slaves $masters $slaves
-    }
-    assert_cluster_state ok
-
-    set ::cluster_master_nodes $masters
-    set ::cluster_replica_nodes $slaves
-}
-
-# Set the cluster node-timeout to all the reachable nodes.
-proc set_cluster_node_timeout {to} {
-    foreach_valkey_id id {
-        catch {R $id CONFIG SET cluster-node-timeout $to}
-    }
-}
-
-# Check if the cluster is writable and readable. Use node "id"
-# as a starting point to talk with the cluster.
-proc cluster_write_test {id} {
-    set prefix [randstring 20 20 alpha]
-    set port [get_instance_attrib valkey $id port]
-    set cluster [valkey_cluster 127.0.0.1:$port]
-    for {set j 0} {$j < 100} {incr j} {
-        $cluster set key.$j $prefix.$j
-    }
-    for {set j 0} {$j < 100} {incr j} {
-        assert {[$cluster get key.$j] eq "$prefix.$j"}
-    }
-    $cluster close
-}
-
-# Check if cluster configuration is consistent.
-# All the nodes in the cluster should show same slots configuration and have health
-# state "online" to be considered as consistent.
-proc cluster_config_consistent {} {
-    for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
-        # Check if all the nodes are online
-        set shards_cfg [R $j CLUSTER SHARDS]
-        foreach shard_cfg $shards_cfg {
-            set nodes [dict get $shard_cfg nodes]
-            foreach node $nodes {
-                if {[dict get $node health] ne "online"} {
-                    return 0
-                }
-            }
-        }
-
-        if {$j == 0} {
-            set base_cfg [R $j cluster slots]
-        } else {
-            set cfg [R $j cluster slots]
-            if {$cfg != $base_cfg} {
-                return 0
-            }
-        }
-    }
-
-    return 1
-}
-
-# Wait for cluster configuration to propagate and be consistent across nodes.
-proc wait_for_cluster_propagation {} {
-    wait_for_condition 1000 50 {
-        [cluster_config_consistent] eq 1
-    } else {
-        for {set j 0} {$j < [llength $::servers]} {incr j} {
-            puts "R $j cluster slots output: [R $j cluster slots]"
-        }
-        fail "cluster config did not reach a consistent state"
-    }
-}
-
-# Check if cluster's view of hostnames is consistent
-proc are_hostnames_propagated {match_string} {
-    for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
-        set cfg [R $j cluster slots]
-        foreach node $cfg {
-            for {set i 2} {$i < [llength $node]} {incr i} {
-                if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } {
-                    return 0
-                }
-            }
-        }
-    }
-    return 1
-}
[deleted: tests/cluster/run.tcl]

@@ -1,35 +0,0 @@
-# Cluster test suite. Copyright (C) 2014 Redis Ltd.
-# This software is released under the BSD License. See the COPYING file for
-# more information.
-
-# Set the executable paths at project root
-source tests/support/set_executable_path.tcl
-
-cd tests/cluster
-source cluster.tcl
-source ../instances.tcl
-source ../../support/cluster.tcl ; # Cluster client.
-
-set ::instances_count 20 ; # How many instances we use at max.
-set ::tlsdir "../../tls"
-
-proc main {} {
-    parse_options
-    spawn_instance valkey $::valkey_base_port $::instances_count {
-        "cluster-enabled yes"
-        "appendonly yes"
-        "enable-protected-configs yes"
-        "enable-debug-command yes"
-        "save ''"
-    }
-    run_tests
-    cleanup
-    end_tests
-}
-
-if {[catch main e]} {
-    puts $::errorInfo
-    if {$::pause_on_error} pause_on_error
-    cleanup
-    exit 1
-}
[deleted: 03-failover-loop.tcl, merged into the existing failover.tcl]

@@ -1,117 +0,0 @@
-# Failover stress test.
-# In this test a different node is killed in a loop for N
-# iterations. The test checks that certain properties
-# are preserved across iterations.
-
-source "../tests/includes/init-tests.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-set iterations 20
-set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
-
-while {[incr iterations -1]} {
-    set tokill [randomInt 10]
-    set other [expr {($tokill+1)%10}] ; # Some other instance.
-    set key [randstring 20 20 alpha]
-    set val [randstring 20 20 alpha]
-    set role [RI $tokill role]
-    if {$role eq {master}} {
-        set slave {}
-        set myid [dict get [get_myself $tokill] id]
-        foreach_valkey_id id {
-            if {$id == $tokill} continue
-            if {[dict get [get_myself $id] slaveof] eq $myid} {
-                set slave $id
-            }
-        }
-        if {$slave eq {}} {
-            fail "Unable to retrieve slave's ID for master #$tokill"
-        }
-    }
-
-    puts "--- Iteration $iterations ---"
-
-    if {$role eq {master}} {
-        test "Wait for slave of #$tokill to sync" {
-            wait_for_condition 1000 50 {
-                [string match {*state=online*} [RI $tokill slave0]]
-            } else {
-                fail "Slave of node #$tokill is not ok"
-            }
-        }
-        set slave_config_epoch [CI $slave cluster_my_epoch]
-    }
-
-    test "Cluster is writable before failover" {
-        for {set i 0} {$i < 100} {incr i} {
-            catch {$cluster set $key:$i $val:$i} err
-            assert {$err eq {OK}}
-        }
-        # Wait for the write to propagate to the slave if we
-        # are going to kill a master.
-        if {$role eq {master}} {
-            R $tokill wait 1 20000
-        }
-    }
-
-    test "Terminating node #$tokill" {
-        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
-        R $tokill config set appendonly no
-        kill_instance valkey $tokill
-    }
-
-    if {$role eq {master}} {
-        test "Wait failover by #$slave with old epoch $slave_config_epoch" {
-            wait_for_condition 1000 50 {
-                [CI $slave cluster_my_epoch] > $slave_config_epoch
-            } else {
-                fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
-            }
-        }
-    }
-
-    test "Cluster should eventually be up again" {
-        assert_cluster_state ok
-    }
-
-    test "Cluster is writable again" {
-        for {set i 0} {$i < 100} {incr i} {
-            catch {$cluster set $key:$i:2 $val:$i:2} err
-            assert {$err eq {OK}}
-        }
-    }
-
-    test "Restarting node #$tokill" {
-        restart_instance valkey $tokill
-    }
-
-    test "Instance #$tokill is now a slave" {
-        wait_for_condition 1000 50 {
-            [RI $tokill role] eq {slave}
-        } else {
-            fail "Restarted instance is not a slave"
-        }
-    }
-
-    test "We can read back the value we set before" {
-        for {set i 0} {$i < 100} {incr i} {
-            catch {$cluster get $key:$i} err
-            assert {$err eq "$val:$i"}
-            catch {$cluster get $key:$i:2} err
-            assert {$err eq "$val:$i:2"}
-        }
-    }
-}
-
-test "Post condition: current_epoch >= my_epoch everywhere" {
-    foreach_valkey_id id {
-        assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
-    }
-}
[deleted: 04-resharding.tcl, moved to resharding.tcl]

@@ -1,196 +0,0 @@
-# Failover stress test.
-# In this test a different node is killed in a loop for N
-# iterations. The test checks that certain properties
-# are preserved across iterations.
-
-source "../tests/includes/init-tests.tcl"
-source "../../../tests/support/cli.tcl"
-
-test "Create a 5 nodes cluster" {
-    create_cluster 5 5
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Enable AOF in all the instances" {
-    foreach_valkey_id id {
-        R $id config set appendonly yes
-        # We use "appendfsync no" because it's fast but also guarantees that
-        # write(2) is performed before replying to the client.
-        R $id config set appendfsync no
-    }
-
-    foreach_valkey_id id {
-        wait_for_condition 1000 500 {
-            [RI $id aof_rewrite_in_progress] == 0 &&
-            [RI $id aof_enabled] == 1
-        } else {
-            fail "Failed to enable AOF on instance #$id"
-        }
-    }
-}
-
-# Return non-zero if the specified PID refers to a process still in execution,
-# otherwise 0 is returned.
-proc process_is_running {pid} {
-    # ps returns with an error if the PID does not exist, and catch will
-    # return non-zero. We want to return non-zero if the PID exists,
-    # so we invert the return value with the expr not operator.
-    expr {![catch {exec ps -p $pid}]}
-}
-
-# Our resharding test performs the following actions:
-#
-# - N commands are sent to the cluster in the course of the test.
-# - Every command selects a random key from key:0 to key:MAX-1.
-# - The operation RPUSH key <randomvalue> is performed.
-# - Tcl remembers into an array all the values pushed to each list.
-# - After N/2 commands, the resharding process is started in background.
-# - The test continues while the resharding is in progress.
-# - At the end of the test, we wait for the resharding process to stop.
-# - Finally the keys are checked to see if they contain the value they should.
-
-set numkeys 50000
-set numops 200000
-set start_node_port [get_instance_attrib valkey 0 port]
-set cluster [valkey_cluster 127.0.0.1:$start_node_port]
-if {$::tls} {
-    # set up a non-TLS cluster client to the TLS cluster
-    set plaintext_port [get_instance_attrib valkey 0 plaintext-port]
-    set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
-    puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
-} else {
-    set cluster_plaintext $cluster
-    puts "Testing using non-TLS cluster"
-}
-catch {unset content}
-array set content {}
-set tribpid {}
-
-test "Cluster consistency during live resharding" {
-    set ele 0
-    for {set j 0} {$j < $numops} {incr j} {
-        # Trigger the resharding once we execute half the ops.
-        if {$tribpid ne {} &&
-            ($j % 10000) == 0 &&
-            ![process_is_running $tribpid]} {
-            set tribpid {}
-        }
-
-        if {$j >= $numops/2 && $tribpid eq {}} {
-            puts -nonewline "...Starting resharding..."
-            flush stdout
-            set target [dict get [get_myself [randomInt 5]] id]
-            set tribpid [lindex [exec \
-                $::VALKEY_CLI_BIN --cluster reshard \
-                127.0.0.1:[get_instance_attrib valkey 0 port] \
-                --cluster-from all \
-                --cluster-to $target \
-                --cluster-slots 100 \
-                --cluster-yes \
-                {*}[valkeycli_tls_config "../../../tests"] \
-                | [info nameofexecutable] \
-                ../tests/helpers/onlydots.tcl \
-                &] 0]
-        }
-
-        # Write random data to random list.
-        set listid [randomInt $numkeys]
-        set key "key:$listid"
-        incr ele
-        # We write both with Lua scripts and with plain commands.
-        # This way we are able to stress Lua -> server command invocation
-        # as well, which has tests to prevent Lua from writing into wrong
-        # hash slots.
-        # We also use both TLS and plaintext connections.
-        if {$listid % 3 == 0} {
-            $cluster rpush $key $ele
-        } elseif {$listid % 3 == 1} {
-            $cluster_plaintext rpush $key $ele
-        } else {
-            $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
-        }
-        lappend content($key) $ele
-
-        if {($j % 1000) == 0} {
-            puts -nonewline W; flush stdout
-        }
-    }
-
-    # Wait for the resharding process to end
-    wait_for_condition 1000 500 {
-        [process_is_running $tribpid] == 0
-    } else {
-        fail "Resharding is not terminating after some time."
-    }
-    wait_for_cluster_propagation
-}
-
-test "Verify $numkeys keys for consistency with logical content" {
-    # Check that the Cluster content matches our logical content.
-    foreach {key value} [array get content] {
-        if {[$cluster lrange $key 0 -1] ne $value} {
-            fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
-        }
-    }
-}
-
-test "Terminate and restart all the instances" {
-    foreach_valkey_id id {
-        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
-        R $id config set appendonly no
-        kill_instance valkey $id
-        restart_instance valkey $id
-    }
-}
-
-test "Cluster should eventually be up again" {
-    assert_cluster_state ok
-}
-
-test "Verify $numkeys keys after the restart" {
-    # Check that the Cluster content matches our logical content.
-    foreach {key value} [array get content] {
-        if {[$cluster lrange $key 0 -1] ne $value} {
-            fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
-        }
-    }
-}
-
-test "Disable AOF in all the instances" {
-    foreach_valkey_id id {
-        R $id config set appendonly no
-    }
-}
-
-test "Verify slaves consistency" {
-    set verified_masters 0
-    foreach_valkey_id id {
-        set role [R $id role]
-        lassign $role myrole myoffset slaves
-        if {$myrole eq {slave}} continue
-        set masterport [get_instance_attrib valkey $id port]
-        set masterdigest [R $id debug digest]
-        foreach_valkey_id sid {
-            set srole [R $sid role]
-            if {[lindex $srole 0] eq {master}} continue
-            if {[lindex $srole 2] != $masterport} continue
-            wait_for_condition 1000 500 {
-                [R $sid debug digest] eq $masterdigest
-            } else {
-                fail "Master and slave data digest are different"
-            }
-            incr verified_masters
-        }
-    }
-    assert {$verified_masters >= 5}
-}
-
-test "Dump sanitization was skipped for migrations" {
-    set verified_masters 0
-    foreach_valkey_id id {
-        assert {[RI $id dump_payload_sanitizations] == 0}
-    }
-}
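Per the commit message, this test now lives in tests/unit/cluster/resharding.tcl,
tagged slow and wrapped in run_solo so it does not compete for resources with
other units in sanitizer environments. A minimal sketch of what such a wrapper
looks like in the new framework (the wrapper name and tag list here are
illustrative, not copied from the migrated file):

    run_solo {cluster-resharding} {
        start_cluster 5 5 {tags {external:skip cluster slow}} {
            # ... migrated resharding test body ...
        }
    }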
[deleted: 07-replica-migration.tcl, merged into the existing replica-migration.tcl]

@@ -1,103 +0,0 @@
-# Replica migration test.
-# Check that orphaned masters are joined by replicas of masters having
-# multiple replicas attached, according to the migration barrier settings.
-
-source "../tests/includes/init-tests.tcl"
-
-# Create a cluster with 5 masters and 10 slaves, so that we have 2
-# slaves for each master.
-test "Create a 5 nodes cluster" {
-    create_cluster 5 10
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Each master should have two replicas attached" {
-    foreach_valkey_id id {
-        if {$id < 5} {
-            wait_for_condition 1000 50 {
-                [llength [lindex [R $id role] 2]] == 2
-            } else {
-                fail "Master #$id does not have 2 slaves as expected"
-            }
-        }
-    }
-}
-
-test "Killing all the slaves of master #0 and #1" {
-    kill_instance valkey 5
-    kill_instance valkey 10
-    kill_instance valkey 6
-    kill_instance valkey 11
-    after 4000
-}
-
-foreach_valkey_id id {
-    if {$id < 5} {
-        test "Master #$id should have at least one replica" {
-            wait_for_condition 1000 50 {
-                [llength [lindex [R $id role] 2]] >= 1
-            } else {
-                fail "Master #$id has no replicas"
-            }
-        }
-    }
-}
-
-# Now test the migration to a master which used to be a slave, after
-# a failover.
-
-source "../tests/includes/init-tests.tcl"
-
-# Create a cluster with 5 masters and 10 slaves, so that we have 2
-# slaves for each master.
-test "Create a 5 nodes cluster" {
-    create_cluster 5 10
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Kill slave #7 of master #2. Only slave left is #12 now" {
-    kill_instance valkey 7
-}
-
-set current_epoch [CI 1 cluster_current_epoch]
-
-test "Killing master node #2, #12 should failover" {
-    kill_instance valkey 2
-}
-
-test "Wait for failover" {
-    wait_for_condition 1000 50 {
-        [CI 1 cluster_current_epoch] > $current_epoch
-    } else {
-        fail "No failover detected"
-    }
-}
-
-test "Cluster should eventually be up again" {
-    assert_cluster_state ok
-}
-
-test "Cluster is writable" {
-    cluster_write_test 1
-}
-
-test "Instance 12 is now a master without slaves" {
-    assert {[RI 12 role] eq {master}}
-}
-
-# The remaining instance is now without slaves. Some other slave
-# should migrate to it.
-
-test "Master #12 should get at least one migrated replica" {
-    wait_for_condition 1000 50 {
-        [llength [lindex [R 12 role] 2]] >= 1
-    } else {
-        fail "Master #12 has no replicas"
-    }
-}
[deleted: 12-replica-migration-2.tcl, merged into replica-migration-slow.tcl]

@@ -1,75 +0,0 @@
-# Replica migration test #2.
-#
-# Check that the status of a master that can be targeted by replica migration
-# is acquired again after it gets slots again, in a cluster where the
-# other masters have slaves.
-
-source "../tests/includes/init-tests.tcl"
-source "../../../tests/support/cli.tcl"
-
-# Create a cluster with 5 masters and 15 slaves, to make sure there are no
-# empty masters and make rebalancing simpler to handle during the test.
-test "Create a 5 nodes cluster" {
-    cluster_create_with_continuous_slots 5 15
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Each master should have at least two replicas attached" {
-    foreach_valkey_id id {
-        if {$id < 5} {
-            wait_for_condition 1000 50 {
-                [llength [lindex [R $id role] 2]] >= 2
-            } else {
-                fail "Master #$id does not have 2 slaves as expected"
-            }
-        }
-    }
-}
-
-test "Set allow-replica-migration yes" {
-    foreach_valkey_id id {
-        R $id CONFIG SET cluster-allow-replica-migration yes
-    }
-}
-
-set master0_id [dict get [get_myself 0] id]
-test "Resharding all the master #0 slots away from it" {
-    set output [exec \
-        $::VALKEY_CLI_BIN --cluster rebalance \
-        127.0.0.1:[get_instance_attrib valkey 0 port] \
-        {*}[valkeycli_tls_config "../../../tests"] \
-        --cluster-weight ${master0_id}=0 >@ stdout ]
-}
-
-test "Master #0 who lost all slots should turn into a replica without replicas" {
-    wait_for_condition 1000 50 {
-        [RI 0 role] == "slave" && [RI 0 connected_slaves] == 0
-    } else {
-        puts [R 0 info replication]
-        fail "Master #0 didn't turn itself into a replica"
-    }
-}
-
-test "Resharding back some slot to master #0" {
-    # Wait for the cluster config to propagate before attempting a
-    # new resharding.
-    after 10000
-    set output [exec \
-        $::VALKEY_CLI_BIN --cluster rebalance \
-        127.0.0.1:[get_instance_attrib valkey 0 port] \
-        {*}[valkeycli_tls_config "../../../tests"] \
-        --cluster-weight ${master0_id}=.01 \
-        --cluster-use-empty-masters >@ stdout]
-}
-
-test "Master #0 should re-acquire one or more replicas" {
-    wait_for_condition 1000 50 {
-        [llength [lindex [R 0 role] 2]] >= 1
-    } else {
-        fail "Master #0 has no replicas"
-    }
-}
[deleted: 12.1-replica-migration-3.tcl, merged into replica-migration-slow.tcl]

@@ -1,65 +0,0 @@
-# Replica migration test #3.
-#
-# Check that if 'cluster-allow-replica-migration' is set to 'no', slaves do not
-# migrate when the master becomes empty.
-
-source "../tests/includes/init-tests.tcl"
-source "../tests/includes/utils.tcl"
-
-# Create a cluster with 5 masters and 15 slaves, to make sure there are no
-# empty masters and make rebalancing simpler to handle during the test.
-test "Create a 5 nodes cluster" {
-    cluster_create_with_continuous_slots 5 15
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Each master should have at least two replicas attached" {
-    foreach_valkey_id id {
-        if {$id < 5} {
-            wait_for_condition 1000 50 {
-                [llength [lindex [R $id role] 2]] >= 2
-            } else {
-                fail "Master #$id does not have 2 slaves as expected"
-            }
-        }
-    }
-}
-
-test "Set allow-replica-migration no" {
-    foreach_valkey_id id {
-        R $id CONFIG SET cluster-allow-replica-migration no
-    }
-}
-
-set master0_id [dict get [get_myself 0] id]
-test "Resharding all the master #0 slots away from it" {
-    set output [exec \
-        $::VALKEY_CLI_BIN --cluster rebalance \
-        127.0.0.1:[get_instance_attrib valkey 0 port] \
-        {*}[valkeycli_tls_config "../../../tests"] \
-        --cluster-weight ${master0_id}=0 >@ stdout ]
-}
-
-test "Wait cluster to be stable" {
-    wait_cluster_stable
-}
-
-test "Master #0 still should have its replicas" {
-    assert { [llength [lindex [R 0 role] 2]] >= 2 }
-}
-
-test "Each master should have at least two replicas attached" {
-    foreach_valkey_id id {
-        if {$id < 5} {
-            wait_for_condition 1000 50 {
-                [llength [lindex [R $id role] 2]] >= 2
-            } else {
-                fail "Master #$id does not have 2 slaves as expected"
-            }
-        }
-    }
-}
[deleted: 28-cluster-shards.tcl, merged into the existing cluster-shards.tcl]

@@ -1,312 +0,0 @@
-source "../tests/includes/init-tests.tcl"
-
-# Initial slot distribution.
-set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
-set ::slot1 [list 5460 5460 5462 10922 10925 10925]
-set ::slot2 [list 10923 10924 10927 16383]
-set ::slot3 [list 1001 1001]
-
-proc cluster_create_with_split_slots {masters replicas} {
-    for {set j 0} {$j < $masters} {incr j} {
-        R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
-    }
-    if {$replicas} {
-        cluster_allocate_slaves $masters $replicas
-    }
-    set ::cluster_master_nodes $masters
-    set ::cluster_replica_nodes $replicas
-}
-
-# Get the node info with the specific node_id from the
-# given reference node. Valid type options are "node" and "shard"
-proc get_node_info_from_shard {id reference {type node}} {
-    set shards_response [R $reference CLUSTER SHARDS]
-    foreach shard_response $shards_response {
-        set nodes [dict get $shard_response nodes]
-        foreach node $nodes {
-            if {[dict get $node id] eq $id} {
-                if {$type eq "node"} {
-                    return $node
-                } elseif {$type eq "shard"} {
-                    return $shard_response
-                } else {
-                    return {}
-                }
-            }
-        }
-    }
-    # No shard found, return nothing
-    return {}
-}
-
-proc cluster_ensure_master {id} {
-    if { [regexp "master" [R $id role]] == 0 } {
-        assert_equal {OK} [R $id CLUSTER FAILOVER]
-        wait_for_condition 50 100 {
-            [regexp "master" [R $id role]] == 1
-        } else {
-            fail "instance $id is not master"
-        }
-    }
-}
-
-test "Create a 8 nodes cluster with 4 shards" {
-    cluster_create_with_split_slots 4 4
-}
-
-test "Cluster should start ok" {
-    assert_cluster_state ok
-}
-
-test "Set cluster hostnames and verify they are propagated" {
-    for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
-        R $j config set cluster-announce-hostname "host-$j.com"
-    }
-
-    # Wait for everyone to agree about the state
-    wait_for_cluster_propagation
-}
-
-test "Verify information about the shards" {
-    set ids {}
-    for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
-        lappend ids [R $j CLUSTER MYID]
-    }
-    set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]
-
-    # Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent.
-    for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
-        for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
-            assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
-            assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
-            assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
-            # Default value of 'cluster-preferred-endpoint-type' is ip.
-            assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]
-
-            if {$::tls} {
-                assert_equal [get_instance_attrib valkey $i plaintext-port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
-                assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
-            } else {
-                assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
-            }
-
-            if {$i < 4} {
-                assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
-                assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
-            } else {
-                assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
-                # Replica could be in online or loading
-            }
-        }
-    }
-}
-
-test "Verify no slot shard" {
-    # Node 8 has no slots assigned
-    set node_8_id [R 8 CLUSTER MYID]
-    assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
-    assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
-}
-
-set node_0_id [R 0 CLUSTER MYID]
-
-test "Kill a node and tell the replica to immediately takeover" {
-    kill_instance valkey 0
-    R 4 cluster failover force
-}
-
-# Primary 0 should report as fail; wait until the new primary acknowledges it.
-test "Verify health as fail for killed node" {
-    wait_for_condition 1000 50 {
-        "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
-    } else {
-        fail "New primary never detected the node failed"
-    }
-}
-
-set primary_id 4
-set replica_id 0
-
-test "Restarting primary node" {
-    restart_instance valkey $replica_id
-}
-
-test "Instance #0 gets converted into a replica" {
-    wait_for_condition 1000 50 {
-        [RI $replica_id role] eq {slave}
-    } else {
-        fail "Old primary was not converted into replica"
-    }
-}
-
-test "Test the replica reports a loading state while it's loading" {
-    # Test the command is good for verifying everything moves to a happy state
-    set replica_cluster_id [R $replica_id CLUSTER MYID]
-    wait_for_condition 50 1000 {
-        [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
-    } else {
-        fail "Replica never transitioned to online"
-    }
-
-    # Set 1 MB of data, so there is something to load on full sync
-    R $primary_id debug populate 1000 key 1000
-
-    # Kill replica client for primary and load new data to the primary
-    R $primary_id config set repl-backlog-size 100
-
-    # Set the key load delay so that it will take at least
-    # 2 seconds to fully load the data.
-    R $replica_id config set key-load-delay 4000
-
-    # Trigger event loop processing every 1024 bytes; this trigger
-    # allows us to send and receive cluster messages, so we are setting
-    # it low so that the cluster messages are sent more frequently.
-    R $replica_id config set loading-process-events-interval-bytes 1024
-
-    R $primary_id multi
-    R $primary_id client kill type replica
-    # populate the correct data
-    set num 100
-    set value [string repeat A 1024]
-    for {set j 0} {$j < $num} {incr j} {
-        # Use hashtag valid for shard #0
-        set key "{ch3}$j"
-        R $primary_id set $key $value
-    }
-    R $primary_id exec
-
-    # The replica should reconnect and start a full sync; it will gossip about its health to the primary.
-    wait_for_condition 50 1000 {
-        "loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
-    } else {
-        fail "Replica never transitioned to loading"
-    }
-
-    # Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data.
-    R $replica_id CLUSTER SHARDS
-    R $replica_id CLUSTER SLOTS
-
-    # Speed up the key loading and verify everything resumes
-    R $replica_id config set key-load-delay 0
-
-    wait_for_condition 50 1000 {
-        "online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
-    } else {
-        fail "Replica never transitioned to online"
-    }
-
-    # Final sanity, the replica agrees it is online.
-    assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
-}
-
-test "Regression test for a crash when calling SHARDS during handshake" {
-    # Reset and forget a node, so we can use it to establish handshaking connections
-    set id [R 19 CLUSTER MYID]
-    R 19 CLUSTER RESET HARD
-    for {set i 0} {$i < 19} {incr i} {
-        R $i CLUSTER FORGET $id
-    }
-    R 19 cluster meet 127.0.0.1 [get_instance_attrib valkey 0 port]
-    # This line would previously crash, since all the outbound
-    # connections were in handshake state.
-    R 19 CLUSTER SHARDS
-}
-
-test "Cluster is up" {
-    assert_cluster_state ok
-}
-
-test "Shard ids are unique" {
-    set shard_ids {}
-    for {set i 0} {$i < 4} {incr i} {
-        set shard_id [R $i cluster myshardid]
-        assert_equal [dict exists $shard_ids $shard_id] 0
-        dict set shard_ids $shard_id 1
-    }
-}
-
-test "CLUSTER MYSHARDID reports same id for both primary and replica" {
-    for {set i 0} {$i < 4} {incr i} {
-        assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
-        assert_equal [string length [R $i cluster myshardid]] 40
-    }
-}
-
-test "New replica receives primary's shard id" {
-    # find a primary
-    set id 0
-    for {} {$id < 8} {incr id} {
-        if {[regexp "master" [R $id role]]} {
-            break
-        }
-    }
-    assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
-    assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
-    assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
-}
-
-test "CLUSTER MYSHARDID reports same shard id after shard restart" {
-    set node_ids {}
-    for {set i 0} {$i < 8} {incr i 4} {
-        dict set node_ids $i [R $i cluster myshardid]
-        kill_instance valkey $i
-        wait_for_condition 50 100 {
-            [instance_is_killed valkey $i]
-        } else {
-            fail "instance $i is not killed"
-        }
-    }
-    for {set i 0} {$i < 8} {incr i 4} {
-        restart_instance valkey $i
-    }
-    assert_cluster_state ok
-    for {set i 0} {$i < 8} {incr i 4} {
-        assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
-    }
-}
-
-test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
-    set node_ids {}
-    for {set i 0} {$i < 8} {incr i} {
-        dict set node_ids $i [R $i cluster myshardid]
-    }
-    for {set i 0} {$i < 8} {incr i} {
-        kill_instance valkey $i
-        wait_for_condition 50 100 {
-            [instance_is_killed valkey $i]
-        } else {
-            fail "instance $i is not killed"
-        }
-    }
-    for {set i 0} {$i < 8} {incr i} {
-        restart_instance valkey $i
-    }
-    assert_cluster_state ok
-    for {set i 0} {$i < 8} {incr i} {
-        assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
-    }
-}
-
-test "CLUSTER SHARDS id response validation" {
-    # For each node in the cluster
-    for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
-        # Get the CLUSTER SHARDS output from this node
-        set shards [R $i CLUSTER SHARDS]
-        set seen_shard_ids {}
-
-        # For each shard in the output
-        foreach shard $shards {
-            set shard_dict [dict create {*}$shard]
-
-            # 1. Verify 'id' key exists
-            assert {[dict exists $shard_dict id]}
-            set shard_id [dict get $shard_dict id]
-
-            # 2. Verify shard_id is a 40-char string
-            assert {[string length $shard_id] == 40}
-
-            # 3. Verify that for a given node's output, all shard IDs are unique
-            assert {[dict exists $seen_shard_ids $shard_id] == 0}
-            dict set seen_shard_ids $shard_id 1
-        }
-    }
-}
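The helpers deleted here are not lost with the file: the migrated
tests/unit/cluster/cluster-shards.tcl in the final hunk below defines
cluster_ensure_master again and keeps using get_node_info_from_shard, with the
instance-framework lookups (get_instance_attrib valkey $i port) swapped for
srv-based ones (srv [expr -1*$i] port).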
[deleted: tests/cluster/tests/includes/init-tests.tcl]

@@ -1,91 +0,0 @@
-# Initialization tests -- most units will start including this.
-
-test "(init) Restart killed instances" {
-    foreach type {valkey} {
-        foreach_${type}_id id {
-            if {[get_instance_attrib $type $id pid] == -1} {
-                puts -nonewline "$type/$id "
-                flush stdout
-                restart_instance $type $id
-            }
-        }
-    }
-}
-
-test "Cluster nodes are reachable" {
-    foreach_valkey_id id {
-        # Every node should be reachable.
-        wait_for_condition 1000 50 {
-            ([catch {R $id ping} ping_reply] == 0) &&
-            ($ping_reply eq {PONG})
-        } else {
-            catch {R $id ping} err
-            fail "Node #$id keeps replying '$err' to PING."
-        }
-    }
-}
-
-test "Cluster nodes hard reset" {
-    foreach_valkey_id id {
-        if {$::valgrind} {
-            set node_timeout 10000
-        } else {
-            set node_timeout 3000
-        }
-        catch {R $id flushall} ; # May fail for readonly slaves.
-        R $id MULTI
-        R $id cluster reset hard
-        R $id cluster set-config-epoch [expr {$id+1}]
-        R $id EXEC
-        R $id config set cluster-node-timeout $node_timeout
-        R $id config set cluster-slave-validity-factor 10
-        R $id config set loading-process-events-interval-bytes 2097152
-        R $id config set key-load-delay 0
-        R $id config set repl-diskless-load disabled
-        R $id config set cluster-announce-hostname ""
-        R $id DEBUG DROP-CLUSTER-PACKET-FILTER -1
-        R $id config rewrite
-    }
-}
-
-# Helper function to attempt to have each node in a cluster
-# meet each other.
-proc join_nodes_in_cluster {} {
-    # Join node 0 with 1, 1 with 2, ... and so forth.
-    # If auto-discovery works all nodes will know every other node
-    # eventually.
-    set ids {}
-    foreach_valkey_id id {lappend ids $id}
-    for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} {
-        set a [lindex $ids $j]
-        set b [lindex $ids [expr $j+1]]
-        set b_port [get_instance_attrib valkey $b port]
-        R $a cluster meet 127.0.0.1 $b_port
-    }
-
-    foreach_valkey_id id {
-        wait_for_condition 1000 50 {
-            [llength [get_cluster_nodes $id connected]] == [llength $ids]
-        } else {
-            return 0
-        }
-    }
-    return 1
-}
-
-test "Cluster Join and auto-discovery test" {
-    # Use multiple attempts since sometimes nodes timeout
-    # while attempting to connect.
-    for {set attempts 3} {$attempts > 0} {incr attempts -1} {
-        if {[join_nodes_in_cluster] == 1} {
-            break
-        }
-    }
-    if {$attempts == 0} {
-        fail "Cluster failed to form full mesh"
-    }
-}
-
-test "Before slots allocation, all nodes report cluster failure" {
-    assert_cluster_state fail
-}
[deleted: tests/cluster/tests/includes/utils.tcl]

@@ -1,36 +0,0 @@
-source "../../../tests/support/cli.tcl"
-
-proc config_set_all_nodes {keyword value} {
-    foreach_valkey_id id {
-        R $id config set $keyword $value
-    }
-}
-
-proc fix_cluster {addr} {
-    set code [catch {
-        exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster fix $addr << yes
-    } result]
-    if {$code != 0} {
-        puts "valkey-cli --cluster fix returns non-zero exit code, output below:\n$result"
-    }
-    # Note: valkey-cli --cluster fix may return a non-zero exit code if nodes don't agree,
-    # but we can ignore that and rely on the check below.
-    assert_cluster_state ok
-    wait_for_condition 100 100 {
-        [catch {exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster check $addr} result] == 0
-    } else {
-        puts "valkey-cli --cluster check returns non-zero exit code, output below:\n$result"
-        fail "Cluster could not settle with configuration"
-    }
-}
-
-proc wait_cluster_stable {} {
-    wait_for_condition 1000 50 {
-        [catch {exec $::VALKEY_CLI_BIN --cluster \
-            check 127.0.0.1:[get_instance_attrib valkey 0 port] \
-            {*}[valkeycli_tls_config "../../../tests"] \
-        }] == 0
-    } else {
-        fail "Cluster doesn't stabilize"
-    }
-}
@@ -1 +0,0 @@
|
||||
*
|
||||
[modified: cluster_util.tcl]

@@ -148,6 +148,7 @@ proc wait_for_cluster_size {cluster_size} {
 # Check that cluster nodes agree about "state", or raise an error.
 proc wait_for_cluster_state {state} {
     for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {![process_is_alive [srv -$j pid]]} continue
         if {[process_is_paused [srv -$j pid]]} continue
         wait_for_condition 1000 50 {
            [CI $j cluster_state] eq $state
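This one-line guard is the fix called out in the commit message: the framework
would otherwise probe nodes that a failover test had already killed, and, per
the commit message, executing ps on those dead processes raised an exception
that aborted the test. The check follows the same pattern as the deleted
process_is_running helper from 04-resharding.tcl above; a minimal sketch,
assuming a ps-based liveness probe (the real process_is_alive helper may
differ in detail):

    # catch turns the error that `ps -p` raises for a dead PID into a plain
    # 0/1 result instead of an exception that would abort the test.
    proc process_is_alive_sketch {pid} {
        expr {![catch {exec ps -p $pid}]}
    }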
@@ -53,3 +53,289 @@ start_cluster 3 3 {tags {external:skip cluster}} {
|
||||
assert_equal $shard_0_slot_coverage [dict get [get_node_info_from_shard $node_0_id $validation_node "shard"] "slots"]
|
||||
}
|
||||
}
|
||||
# Initial slot distribution for split-slot cluster tests.
|
||||
set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
|
||||
set ::slot1 [list 5460 5460 5462 10922 10925 10925]
|
||||
set ::slot2 [list 10923 10924 10927 16383]
|
||||
set ::slot3 [list 1001 1001]
|
||||
|
||||
# Slot allocator: assigns split slots to each master.
|
||||
proc split_slot_allocation {masters replicas} {
|
||||
for {set j 0} {$j < $masters} {incr j} {
|
||||
R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
|
||||
}
|
||||
}
|
||||
|
||||
# Replica allocator: allocates replicas only for the masters, leaving the last
# server (R 8) as a standalone no-slot node for testing purposes.
proc split_slot_replica_allocation {masters replicas} {
    cluster_allocate_replicas $masters [expr {$replicas - 1}]
}

proc cluster_ensure_master {id} {
    if {[regexp "master" [R $id role]] == 0} {
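        # CLUSTER FAILOVER is issued on the replica: it asks the replica to
        # take over from its current primary, so $id becomes a primary.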
        assert_equal {OK} [R $id CLUSTER FAILOVER]
        wait_for_condition 50 100 {
            [regexp "master" [R $id role]] == 1
        } else {
            fail "instance $id is not master"
        }
    }
}

# start_cluster 4 masters + 5 nodes (4 replicas + 1 standalone R8)
start_cluster 4 5 {tags {external:skip cluster}} {

    # cluster_master_nodes and cluster_replica_nodes refer to the active cluster members.
    set ::cluster_master_nodes 4
    set ::cluster_replica_nodes 4

    test "Cluster should start ok" {
        wait_for_cluster_state ok
    }

    test "Set cluster hostnames and verify they are propagated" {
        for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
            R $j config set cluster-announce-hostname "host-$j.com"
        }

        # Wait for everyone to agree about the state
        wait_for_cluster_propagation
    }

    test "Verify information about the shards" {
        set ids {}
        for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
            lappend ids [R $j CLUSTER MYID]
        }
        set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]

        # Verify that on each node (primary and replica) the response of the `CLUSTER SHARDS` command is consistent.
        for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
            for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
                assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
                assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
                assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
                # Default value of 'cluster-preferred-endpoint-type' is ip.
                assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]

                if {$::tls} {
                    assert_equal [srv [expr -1*$i] pport] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
                    assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
                } else {
                    assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
                }

                if {$i < 4} {
                    assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
                    assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
                } else {
                    assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
                    # A replica could be either online or loading.
                }
            }
        }
    }

    test "Verify no slot shard" {
        # R 8 is a standalone node with no slots assigned (left standalone by split_slot_replica_allocation)
        set node_8_id [R 8 CLUSTER MYID]
        assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
        assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
    }

    set node_0_id [R 0 CLUSTER MYID]

    test "Kill a node and tell the replica to immediately takeover" {
        pause_process [srv 0 pid]
        R 4 cluster failover force
    }

    # Primary #0 should report as fail; wait until the new primary acknowledges it.
    test "Verify health as fail for killed node" {
        wait_for_condition 1000 50 {
            "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
        } else {
            fail "New primary never detected the node failed"
        }
    }

    set primary_id 4
    set replica_id 0

    test "Restarting primary node" {
        restart_server [expr -1*$replica_id] true false
    }

    test "Instance #0 gets converted into a replica" {
        wait_for_condition 1000 50 {
            [s [expr -1*$replica_id] role] eq {slave}
        } else {
            fail "Old primary was not converted into replica"
        }
    }

    test "Test the replica reports a loading state while it's loading" {
        # Sanity check that the command reports a happy state first.
        set replica_cluster_id [R $replica_id CLUSTER MYID]
        wait_for_condition 50 1000 {
            [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
        } else {
            fail "Replica never transitioned to online"
        }

        # Set 1 MB of data, so there is something to load on full sync
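        # (debug populate <count> <prefix> <size>: 1000 keys of ~1000 bytes each)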
        R $primary_id debug populate 1000 key 1000

        # Kill replica client for primary and load new data to the primary
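        # A 100-byte backlog cannot hold the writes below, so the killed
        # replica cannot partially resync and is forced into a full sync.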
        R $primary_id config set repl-backlog-size 100

        # Set the key load delay so that it will take at least
        # 2 seconds to fully load the data.
        R $replica_id config set key-load-delay 4000

        # Trigger event loop processing every 1024 bytes; this allows us to
        # send and receive cluster messages, so we set it low so that the
        # cluster messages are sent more frequently.
        R $replica_id config set loading-process-events-interval-bytes 1024

        R $primary_id multi
        R $primary_id client kill type replica
        # Populate the correct data
        set num 100
        set value [string repeat A 1024]
        for {set j 0} {$j < $num} {incr j} {
            # Use a hashtag valid for shard #0
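            # (Only the text inside the braces is hashed, so every "{ch3}..."
            # key maps to the single slot that {ch3} hashes to.)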
            set key "{ch3}$j"
            R $primary_id set $key $value
        }
        R $primary_id exec

        # The replica should reconnect and start a full sync; it will gossip its health to the primary.
        wait_for_condition 50 1000 {
            "loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
        } else {
            fail "Replica never transitioned to loading"
        }

        # Verify that the cluster shards and cluster slots (deprecated) APIs respond while the node is loading data.
        R $replica_id CLUSTER SHARDS
        R $replica_id CLUSTER SLOTS

        # Speed up the key loading and verify everything resumes
        R $replica_id config set key-load-delay 0

        wait_for_condition 50 1000 {
            "online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
        } else {
            fail "Replica never transitioned to online"
        }

        # Final sanity check: the replica agrees it is online.
        assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
    }

    test "Regression test for a crash when calling SHARDS during handshake" {
        # Use R 8 (standalone node) to establish handshaking connections
        set id [R 8 CLUSTER MYID]
        R 8 CLUSTER RESET HARD
        for {set i 0} {$i < 8} {incr i} {
            R $i CLUSTER FORGET $id
        }
        R 8 cluster meet 127.0.0.1 [srv 0 port]
        # This line would previously crash, since all the outbound
        # connections were in handshake state.
        R 8 CLUSTER SHARDS
    }

    test "Cluster is up" {
        wait_for_cluster_state ok
    }

    test "Shard ids are unique" {
        set shard_ids {}
        for {set i 0} {$i < 4} {incr i} {
            set shard_id [R $i cluster myshardid]
            assert_equal [dict exists $shard_ids $shard_id] 0
            dict set shard_ids $shard_id 1
        }
    }

    test "CLUSTER MYSHARDID reports same id for both primary and replica" {
        for {set i 0} {$i < 4} {incr i} {
            assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
            assert_equal [string length [R $i cluster myshardid]] 40
        }
    }

    test "New replica receives primary's shard id" {
        # Find a primary
        set id 0
        for {} {$id < 8} {incr id} {
            if {[regexp "master" [R $id role]]} {
                break
            }
        }
        assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
        assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
        assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
    }

    test "CLUSTER MYSHARDID reports same shard id after shard restart" {
        set node_ids {}
        for {set i 0} {$i < 8} {incr i 4} {
            dict set node_ids $i [R $i cluster myshardid]
            pause_process [srv [expr -1*$i] pid]
        }
        for {set i 0} {$i < 8} {incr i 4} {
            restart_server [expr -1*$i] true false
        }
        wait_for_cluster_state ok
        for {set i 0} {$i < 8} {incr i 4} {
            assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
        }
    }

    test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
        set node_ids {}
        for {set i 0} {$i < 8} {incr i} {
            dict set node_ids $i [R $i cluster myshardid]
        }
        for {set i 0} {$i < 8} {incr i} {
            pause_process [srv [expr -1*$i] pid]
        }
        for {set i 0} {$i < 8} {incr i} {
            restart_server [expr -1*$i] true false
        }
        wait_for_cluster_state ok
        for {set i 0} {$i < 8} {incr i} {
            assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
        }
    }

    test "CLUSTER SHARDS id response validation" {
        # For each node in the cluster
        for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
            # Get the CLUSTER SHARDS output from this node
            set shards [R $i CLUSTER SHARDS]
            set seen_shard_ids {}

            # For each shard in the output
            foreach shard $shards {
                set shard_dict [dict create {*}$shard]

                # 1. Verify the 'id' key exists
                assert {[dict exists $shard_dict id]}
                set shard_id [dict get $shard_dict id]

                # 2. Verify shard_id is a 40-char string
                assert {[string length $shard_id] == 40}

                # 3. Verify that for a given node's output, all shard IDs are unique
                assert {[dict exists $seen_shard_ids $shard_id] == 0}
                dict set seen_shard_ids $shard_id 1
            }
        }
    }

} split_slot_allocation split_slot_replica_allocation

@@ -152,3 +152,108 @@ start_cluster 3 1 {tags {external:skip cluster}} {
        }
    }
} ;# start_cluster

# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
start_cluster 5 5 {tags {external:skip cluster}} {
    set iterations 10
    set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]

    while {[incr iterations -1]} {
        set tokill [randomInt 10]
        set other [expr {($tokill+1)%10}] ; # Some other instance.
        set key [randstring 20 20 alpha]
        set val [randstring 20 20 alpha]
        set role [s [expr -1*$tokill] role]
        if {$role eq {master}} {
            set slave {}
            set myid [dict get [cluster_get_myself $tokill] id]
            for {set id 0} {$id < [llength $::servers]} {incr id} {
                if {$id == $tokill} continue
                if {[dict get [cluster_get_myself $id] slaveof] eq $myid} {
                    set slave $id
                }
            }
            if {$slave eq {}} {
                fail "Unable to retrieve slave's ID for master #$tokill"
            }
        }

        if {$role eq {master}} {
            test "Wait for slave of #$tokill to sync" {
                wait_for_condition 1000 50 {
                    [string match {*state=online*} [s [expr -1*$tokill] slave0]]
                } else {
                    fail "Slave of node #$tokill is not ok"
                }
            }
            set slave_config_epoch [CI $slave cluster_my_epoch]
        }

        test "Cluster is writable before failover" {
            for {set i 0} {$i < 100} {incr i} {
                catch {$cluster set $key:$i $val:$i} err
                assert {$err eq {OK}}
            }
            # Wait for the write to propagate to the slave if we
            # are going to kill a master.
            if {$role eq {master}} {
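                # WAIT returns once at least 1 replica has acknowledged
                # the preceding writes (with a 20000 ms timeout).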
                R $tokill wait 1 20000
            }
        }

        test "Terminating node #$tokill" {
            catch {R $tokill shutdown nosave}
        }

        if {$role eq {master}} {
            test "Wait failover by #$slave with old epoch $slave_config_epoch" {
                wait_for_condition 1000 50 {
                    [CI $slave cluster_my_epoch] > $slave_config_epoch
                } else {
                    fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
                }
            }
        }

        test "Cluster should eventually be up again" {
            wait_for_cluster_state ok
        }

        test "Cluster is writable again" {
            for {set i 0} {$i < 100} {incr i} {
                catch {$cluster set $key:$i:2 $val:$i:2} err
                assert {$err eq {OK}}
            }
        }

        test "Restarting node #$tokill" {
            restart_server [expr -1*$tokill] true false
        }

        test "Instance #$tokill is now a slave" {
            wait_for_condition 1000 50 {
                [s [expr -1*$tokill] role] eq {slave}
            } else {
                fail "Restarted instance is not a slave"
            }
        }

        test "We can read back the value we set before" {
            for {set i 0} {$i < 100} {incr i} {
                catch {$cluster get $key:$i} err
                assert {$err eq "$val:$i"}
                catch {$cluster get $key:$i:2} err
                assert {$err eq "$val:$i:2"}
            }
        }
    }

    test "Post condition: current_epoch >= my_epoch everywhere" {
        for {set id 0} {$id < [llength $::servers]} {incr id} {
            assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
        }
    }
} ;# start_cluster

@@ -0,0 +1,76 @@
# Check that a primary that can be targeted by replica migration re-acquires
# replica coverage after getting slots again, in a cluster where the other
# primaries have replicas.
tags {"slow valgrind:skip"} {
    run_solo {cluster-replica-migration-slow} {
        start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration yes}} {
            test "Primary #0 should re-acquire one or more replicas" {
                # Reshard all the primary #0 slots away from it
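                # (A weight of 0 tells the rebalancer to leave this node with
                # no slots at all, so everything it owns is moved away.)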
                set primary0_id [dict get [cluster_get_myself 0] id]
                set output [exec \
                    $::VALKEY_CLI_BIN --cluster rebalance \
                    127.0.0.1:[srv 0 port] \
                    {*}[valkeycli_tls_config "./tests"] \
                    --cluster-weight ${primary0_id}=0 >@ stdout]

                # Primary #0, which lost all its slots, should turn into a replica without replicas
                wait_for_condition 1000 50 {
                    [s 0 role] eq "slave" && [s 0 connected_slaves] == 0
                } else {
                    puts [R 0 info replication]
                    fail "Primary #0 didn't turn itself into a replica"
                }

                # Reshard some slots back to primary #0.
                # Wait for the cluster config to propagate before attempting a
                # new resharding.
                wait_for_cluster_propagation
                set output [exec \
                    $::VALKEY_CLI_BIN --cluster rebalance \
                    127.0.0.1:[srv 0 port] \
                    {*}[valkeycli_tls_config "./tests"] \
                    --cluster-weight ${primary0_id}=.01 \
                    --cluster-use-empty-masters >@ stdout]

                wait_for_condition 1000 50 {
                    [llength [lindex [R 0 role] 2]] >= 1
                } else {
                    fail "Primary #0 has no replicas"
                }
            }
        } ;# start_cluster
    } ;# run_solo
} ;# tag

# Check that if 'cluster-allow-replica-migration' is set to 'no', replicas do not
# migrate when the primary becomes empty.
tags {"slow valgrind:skip"} {
    run_solo {cluster-replica-migration-slow} {
        start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no}} {
            test "Each primary should have at least two replicas attached" {
                # Reshard all the primary #0 slots away from it
                set primary0_id [dict get [cluster_get_myself 0] id]
                set output [exec \
                    $::VALKEY_CLI_BIN --cluster rebalance \
                    127.0.0.1:[srv 0 port] \
                    {*}[valkeycli_tls_config "./tests"] \
                    --cluster-weight ${primary0_id}=0 >@ stdout]

                wait_for_cluster_propagation
                wait_for_cluster_state "ok"

                # Primary #0 should still have its replicas
                assert {[llength [lindex [R 0 role] 2]] >= 2}

                # Each primary should have at least two replicas attached
                for {set id 0} {$id < 5} {incr id} {
                    wait_for_condition 1000 50 {
                        [llength [lindex [R $id role] 2]] >= 2
                    } else {
                        fail "Primary #$id does not have 2 replicas as expected"
                    }
                }
            }
        } ;# start_cluster
    } ;# run_solo
} ;# tag
@@ -433,3 +433,73 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
    assert_equal [R 2 dbsize] 0
}
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

# Replica migration test.
# Check that orphaned primaries are joined by replicas of primaries having
# multiple replicas attached, according to the migration barrier settings.
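# (The barrier is the cluster-migration-barrier setting: a primary donates a
# replica only while it keeps at least that many working replicas for itself;
# the default barrier is 1.)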
start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
    # Kill all the replicas of primaries #0 and #1
    pause_process [srv -5 pid]
    pause_process [srv -10 pid]
    pause_process [srv -6 pid]
    pause_process [srv -11 pid]

    # R0 and R1 will trigger replica migration.
    wait_for_condition 1000 50 {
        [expr {[count_log_message -7 "Migrating to orphaned primary"] +
               [count_log_message -8 "Migrating to orphaned primary"] +
               [count_log_message -9 "Migrating to orphaned primary"] +
               [count_log_message -12 "Migrating to orphaned primary"] +
               [count_log_message -13 "Migrating to orphaned primary"] +
               [count_log_message -14 "Migrating to orphaned primary"]}] == 2
    } else {
        fail "Replicas did not trigger replica migration"
    }

    # Each primary should have at least one replica
    for {set id 0} {$id < 5} {incr id} {
        wait_for_condition 1000 50 {
            [llength [lindex [R $id role] 2]] >= 1
        } else {
            fail "Primary #$id has no replicas"
        }
    }

    resume_process [srv -5 pid]
    resume_process [srv -10 pid]
    resume_process [srv -6 pid]
    resume_process [srv -11 pid]
} ;# start_cluster

# Now test the migration to a primary which used to be a replica, after
# a failover.
start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
    test "Primary #12 should get at least one migrated replica" {
        # Kill replica #7 of primary #2. The only replica left is #12 now.
        pause_process [srv -7 pid]

        # Kill primary node #2; #12 should failover.
        set current_epoch [CI 1 cluster_current_epoch]
        pause_process [srv -2 pid]
        wait_for_condition 1000 50 {
            [CI 1 cluster_current_epoch] > $current_epoch
        } else {
            fail "No failover detected"
        }

        wait_for_cluster_state ok
        cluster_write_test [srv -1 port]
        assert {[s -12 role] eq {master}}

        # The remaining instance is now without replicas. Some other replica
        # should migrate to it.
        wait_for_condition 1000 50 {
            [llength [lindex [R 12 role] 2]] >= 1
        } else {
            fail "Primary #12 has no replicas"
        }

        resume_process [srv -7 pid]
        resume_process [srv -2 pid]
    }
} ;# start_cluster

@@ -0,0 +1,185 @@
# Resharding test.
# In this test a live resharding is performed and the test checks
# that certain properties are preserved across the operation.
tags {"slow"} {
    run_solo {cluster-resharding} {
        start_cluster 5 5 {tags {external:skip cluster}} {
            test "Enable AOF in all the instances" {
                for {set id 0} {$id < [llength $::servers]} {incr id} {
                    R $id config set appendonly yes
                    # We use "appendfsync no" because it's fast but also guarantees that
                    # write(2) is performed before replying to the client.
                    R $id config set appendfsync no
                }

                for {set id 0} {$id < [llength $::servers]} {incr id} {
                    wait_for_condition 1000 500 {
                        [s [expr -1*$id] aof_rewrite_in_progress] == 0 &&
                        [s [expr -1*$id] aof_enabled] == 1
                    } else {
                        fail "Failed to enable AOF on instance #$id"
                    }
                }
            }

            # Our resharding test performs the following actions:
            #
            # - N commands are sent to the cluster in the course of the test.
            # - Every command selects a random key from key:0 to key:MAX-1.
            # - The operation RPUSH key <randomvalue> is performed.
            # - Tcl remembers into an array all the values pushed to each list.
            # - After N/2 commands, the resharding process is started in the background.
            # - The test continues while the resharding is in progress.
            # - At the end of the test, we wait for the resharding process to stop.
            # - Finally the keys are checked to see if they contain the values they should.

            set numkeys 50000
            set numops 200000
            set start_node_port [srv 0 port]
            set cluster [valkey_cluster 127.0.0.1:$start_node_port]
            if {$::tls} {
                # Set up a non-TLS cluster client to the TLS cluster
                set plaintext_port [srv 0 pport]
                set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
                puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
            } else {
                set cluster_plaintext $cluster
                puts "Testing using non-TLS cluster"
            }
            catch {unset content}
            array set content {}
            set tribpid {}

            test "Cluster consistency during live resharding" {
                set ele 0
                for {set j 0} {$j < $numops} {incr j} {
                    # Clear tribpid once the background resharding has finished.
                    if {$tribpid ne {} &&
                        ($j % 10000) == 0 &&
                        ![process_is_alive $tribpid]} {
                        set tribpid {}
                    }

                    # Trigger the resharding once we have executed half the ops.
                    if {$j >= $numops/2 && $tribpid eq {}} {
                        puts -nonewline "...Starting resharding..."
                        flush stdout
                        set target [dict get [cluster_get_myself [randomInt 5]] id]
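                        # The reshard child runs in the background ("&"); its
                        # verbose output is piped through the onlydots helper,
                        # and [lindex ... 0] captures the background pid.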
                        set tribpid [lindex [exec \
                            $::VALKEY_CLI_BIN --cluster reshard \
                            127.0.0.1:[srv 0 port] \
                            --cluster-from all \
                            --cluster-to $target \
                            --cluster-slots 100 \
                            --cluster-yes \
                            {*}[valkeycli_tls_config "./tests"] \
                            | [info nameofexecutable] \
                            tests/helpers/onlydots.tcl \
                            &] 0]
                    }

                    # Write random data to a random list.
                    set listid [randomInt $numkeys]
                    set key "key:$listid"
                    incr ele
                    # We write both with Lua scripts and with plain commands.
                    # This way we are able to stress the Lua -> server command
                    # invocation as well, which has checks that prevent Lua from
                    # writing into the wrong hash slots.
                    # We also use both TLS and plaintext connections.
                    if {$listid % 3 == 0} {
                        $cluster rpush $key $ele
                    } elseif {$listid % 3 == 1} {
                        $cluster_plaintext rpush $key $ele
                    } else {
                        $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
                    }
                    lappend content($key) $ele

                    if {($j % 1000) == 0} {
                        puts -nonewline W; flush stdout
                    }
                }

                # Wait for the resharding process to end
                wait_for_condition 1000 500 {
                    [process_is_alive $tribpid] == 0
                } else {
                    fail "Resharding did not terminate in time."
                }
                wait_for_cluster_propagation
            }

            test "Verify $numkeys keys for consistency with logical content" {
                # Check that the cluster content matches our logical content.
                foreach {key value} [array get content] {
                    if {[$cluster lrange $key 0 -1] ne $value} {
                        fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
                    }
                }
            }

            test "Terminate and restart all the instances" {
                for {set id 0} {$id < [llength $::servers]} {incr id} {
                    # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
                    R $id config set appendonly no
                    R $id bgsave
                    wait_for_condition 1000 50 {
                        [s [expr -1*$id] rdb_bgsave_in_progress] == 0
                    } else {
                        fail "bgsave didn't finish for instance #$id"
                    }
                    restart_server [expr -1*$id] true false
                }
            }

            test "Cluster should eventually be up again" {
                wait_for_cluster_state ok
            }

            test "Verify $numkeys keys after the restart" {
                # Check that the cluster content matches our logical content.
                foreach {key value} [array get content] {
                    if {[$cluster lrange $key 0 -1] ne $value} {
                        fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
                    }
                }
            }

            test "Disable AOF in all the instances" {
                for {set id 0} {$id < [llength $::servers]} {incr id} {
                    R $id config set appendonly no
                }
            }

            test "Verify slaves consistency" {
                set verified_masters 0
                for {set id 0} {$id < [llength $::servers]} {incr id} {
                    set role [R $id role]
                    lassign $role myrole myoffset slaves
                    if {$myrole eq {slave}} continue
                    set masterport [srv [expr -1*$id] port]
                    set masterdigest [R $id debug digest]
                    for {set sid 0} {$sid < [llength $::servers]} {incr sid} {
                        set srole [R $sid role]
                        if {[lindex $srole 0] eq {master}} continue
                        if {[lindex $srole 2] != $masterport} continue
                        wait_for_condition 1000 500 {
                            [R $sid debug digest] eq $masterdigest
                        } else {
                            fail "Master and slave data digests are different"
                        }
                        incr verified_masters
                    }
                }
                assert {$verified_masters >= 5}
            }

            test "Dump sanitization was skipped for migrations" {
                for {set id 0} {$id < [llength $::servers]} {incr id} {
                    assert {[s [expr -1*$id] dump_payload_sanitizations] == 0}
                }
            }

        } ;# start_cluster
    } ;# run_solo
} ;# tag