Migrate the remaining cluster tests to the new framework and remove legacy files (#2297) (#3382)

Migrated the remaining cluster tests to tests/unit/cluster/ so that all
cluster tests use the same framework. Cleaned up the obsolete cluster test
framework files and updated the CI workflows to use the new unified test
runner.
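
For illustration, tests under the new framework follow the start_cluster
pattern used throughout tests/unit/cluster/; a minimal sketch (the node
counts and test body below are placeholders, not code from this commit):

    # start_cluster spawns the requested number of primaries and replicas,
    # runs the body against them, and tears the cluster down afterwards.
    start_cluster 3 3 {tags {external:skip cluster}} {
        test "Cluster is up" {
            wait_for_cluster_state ok
        }
    }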

Changes:
  Moved and mapped 6 test files:
  - 03-failover-loop.tcl → Merged into existing failover.tcl
  - 04-resharding.tcl → resharding.tcl
  - 12-replica-migration-2.tcl + 12.1-replica-migration-3.tcl →
  replica-migration-slow.tcl
  - 07-replica-migration.tcl → Merged into existing replica-migration.tcl
  - 28-cluster-shards.tcl → Merged into existing cluster-shards.tcl

Other changes:
  - Converted old framework APIs (e.g., K, RI) to new framework APIs (e.g., R,
  srv); see the conversion sketch after this list
  - Added a process_is_alive check in cluster_util.tcl to fix an exception in
  failover tests caused by running ps against dead processes (a sketch of the
  helper follows this list)
  - Heavy tests (resharding, replica-migration-slow) are marked with the slow
  tag and wrapped in run_solo to prevent resource contention in sanitizer
  environments (see replica-migration-slow.tcl below)
  - replica-migration-slow is also marked with the valgrind:skip tag since it
  is very slow
  - Removed the entire tests/cluster/ directory including run.tcl, cluster.tcl,
  includes/, and helpers/
  - Kept runtest-cluster as a wrapper script (exec ./runtest --cluster "$@")
  - Removed ./runtest-cluster calls from .github/workflows/daily.yml as cluster
  tests are now included in ./runtest
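
The API conversion is largely mechanical. A sketch of the typical mappings,
taken from the migrated failover test below ($tokill is assumed to hold a
node index, so this is a fragment rather than a standalone script):

    # Old framework (tests/cluster):
    #   set role [RI $tokill role]
    #   set port [get_instance_attrib valkey 0 port]
    #   kill_instance valkey $tokill
    #   restart_instance valkey $tokill
    # New framework (tests/unit/cluster):
    set role [s [expr -1*$tokill] role]          ;# INFO fields via negative srv offsets
    set port [srv 0 port]                        ;# instance attributes via srv
    catch {R $tokill shutdown nosave}            ;# terminate a node
    restart_server [expr -1*$tokill] true false  ;# bring it back up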
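
The process_is_alive helper itself is not part of this diff; a plausible
minimal implementation, mirroring the process_is_running helper from the
deleted 04-resharding.tcl below (the actual helper in the shared test
utilities may additionally handle zombie processes):

    # Return 1 if the given PID refers to a live process, otherwise 0.
    # ps -p exits non-zero when the PID does not exist; catch maps that
    # exit status to 0/1, and ! inverts it.
    proc process_is_alive {pid} {
        expr {![catch {exec ps -p $pid}]}
    }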

Closes #2297.

Signed-off-by: Jun Yeong Kim <junyeonggim5@gmail.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Author: Jun Yeong Kim
Date: 2026-04-27 18:31:37 +09:00
Committed by: GitHub
Parent: 6dbb7f81a9
Commit: ff80b2d1dc
20 changed files with 724 additions and 1350 deletions
.github/workflows/daily.yml (-68)
@@ -125,9 +125,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -211,9 +208,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -276,9 +270,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -334,9 +325,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-ubuntu-no-malloc-usable-size:
runs-on: ubuntu-latest
if: |
@@ -384,9 +372,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-ubuntu-32bit:
runs-on: ubuntu-latest
if: |
@@ -457,9 +442,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -522,10 +504,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel --tls ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster --tls ${{github.event.inputs.cluster_test_args}}
test-ubuntu-tls-no-tls:
runs-on: ubuntu-latest
if: |
@@ -578,10 +556,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-ubuntu-io-threads:
runs-on: ubuntu-latest
if: |
@@ -623,9 +597,6 @@ jobs:
- name: test
if: true && !contains(github.event.inputs.skiptests, 'valkey')
run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}
test-ubuntu-tls-io-threads:
runs-on: ubuntu-latest
if: |
@@ -670,10 +641,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'valkey')
run: |
./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}}
test-ubuntu-reclaim-cache:
runs-on: ubuntu-latest
if: |
@@ -991,9 +958,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -1129,9 +1093,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -1263,9 +1224,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
- name: unittest
if: true && !contains(github.event.inputs.skiptests, 'unittest')
run: |
@@ -1324,9 +1282,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-rpm-distros-jemalloc:
if: |
(github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1399,9 +1354,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-rpm-distros-tls-module:
if: |
(github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1479,10 +1431,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster --tls-module ${{github.event.inputs.cluster_test_args}}
test-rpm-distros-tls-module-no-tls:
if: |
(github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' ||
@@ -1555,10 +1503,6 @@ jobs:
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: |
./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: |
./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-macos-latest:
runs-on: macos-latest
if: |
@@ -1676,9 +1620,6 @@ jobs:
- run: cd libbacktrace && ./configure && make && sudo make install
- name: make
run: make SERVER_CFLAGS='-Werror' USE_LIBBACKTRACE=yes
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
build-old-macos-versions:
strategy:
fail-fast: false
@@ -1806,9 +1747,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
test-alpine-libc-malloc:
runs-on: ubuntu-latest
if: |
@@ -1859,9 +1797,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
reply-schemas-validator:
runs-on: ubuntu-latest
timeout-minutes: 1440
@@ -1909,9 +1844,6 @@ jobs:
- name: sentinel tests
if: true && !contains(github.event.inputs.skiptests, 'sentinel')
run: ./runtest-sentinel --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster --log-req-res --dont-clean --force-resp3 ${{github.event.inputs.cluster_test_args}}
- name: Install Python dependencies
uses: py-actions/py-dependency-install@30aa0023464ed4b5b116bd9fbdab87acf01a484e # v4.1.0
with:
runtest-cluster (+1 -13)
@@ -1,14 +1,2 @@
#!/bin/sh
TCL_VERSIONS="8.5 8.6 9.0"
TCLSH=""
for VERSION in $TCL_VERSIONS; do
TCL=`which tclsh$VERSION 2>/dev/null` && TCLSH=$TCL
done
if [ -z $TCLSH ]
then
echo "You need tcl 8.5 or newer in order to run the Valkey Cluster test"
exit 1
fi
$TCLSH tests/cluster/run.tcl $*
exec ./runtest --cluster "$@"
tests/cluster/cluster.tcl (-238)
@@ -1,238 +0,0 @@
# Cluster-specific test functions.
#
# Copyright (C) 2014 Redis Ltd.
# This software is released under the BSD License. See the COPYING file for
# more information.
# Track cluster configuration as created by create_cluster below
set ::cluster_master_nodes 0
set ::cluster_replica_nodes 0
# Returns a parsed CLUSTER NODES output as a list of dictionaries. Optional status field
# can be specified to only returns entries that match the provided status.
proc get_cluster_nodes {id {status "*"}} {
set lines [split [R $id cluster nodes] "\r\n"]
set nodes {}
foreach l $lines {
set l [string trim $l]
if {$l eq {}} continue
set args [split $l]
set node [dict create \
id [lindex $args 0] \
addr [lindex $args 1] \
flags [split [lindex $args 2] ,] \
slaveof [lindex $args 3] \
ping_sent [lindex $args 4] \
pong_recv [lindex $args 5] \
config_epoch [lindex $args 6] \
linkstate [lindex $args 7] \
slots [lrange $args 8 end] \
]
if {[string match $status [lindex $args 7]]} {
lappend nodes $node
}
}
return $nodes
}
# Test node for flag.
proc has_flag {node flag} {
expr {[lsearch -exact [dict get $node flags] $flag] != -1}
}
# Returns the parsed myself node entry as a dictionary.
proc get_myself id {
set nodes [get_cluster_nodes $id]
foreach n $nodes {
if {[has_flag $n myself]} {return $n}
}
return {}
}
# Get a specific node by ID by parsing the CLUSTER NODES output
# of the instance Number 'instance_id'
proc get_node_by_id {instance_id node_id} {
set nodes [get_cluster_nodes $instance_id]
foreach n $nodes {
if {[dict get $n id] eq $node_id} {return $n}
}
return {}
}
# Return the value of the specified CLUSTER INFO field.
proc CI {n field} {
get_info_field [R $n cluster info] $field
}
# Return the value of the specified INFO field.
proc s {n field} {
get_info_field [R $n info] $field
}
# Assuming nodes are reset, this function performs slots allocation.
# Only the first 'n' nodes are used.
proc cluster_allocate_slots {n} {
set slot 16383
while {$slot >= 0} {
# Allocate successive slots to random nodes.
set node [randomInt $n]
lappend slots_$node $slot
incr slot -1
}
for {set j 0} {$j < $n} {incr j} {
R $j cluster addslots {*}[set slots_${j}]
}
}
# Check that cluster nodes agree about "state", or raise an error.
proc assert_cluster_state {state} {
foreach_valkey_id id {
if {[instance_is_killed valkey $id]} continue
wait_for_condition 1000 50 {
[CI $id cluster_state] eq $state
} else {
fail "Cluster node $id cluster_state:[CI $id cluster_state]"
}
}
}
# Search the first node starting from ID $first that is not
# already configured as a slave.
proc cluster_find_available_slave {first} {
foreach_valkey_id id {
if {$id < $first} continue
if {[instance_is_killed valkey $id]} continue
set me [get_myself $id]
if {[dict get $me slaveof] eq {-}} {return $id}
}
fail "No available slaves"
}
# Add 'slaves' slaves to a cluster composed of 'masters' masters.
# It assumes that masters are allocated sequentially from instance ID 0
# to N-1.
proc cluster_allocate_slaves {masters slaves} {
for {set j 0} {$j < $slaves} {incr j} {
set master_id [expr {$j % $masters}]
set slave_id [cluster_find_available_slave $masters]
set master_myself [get_myself $master_id]
R $slave_id cluster replicate [dict get $master_myself id]
}
}
# Create a cluster composed of the specified number of masters and slaves.
proc create_cluster {masters slaves} {
cluster_allocate_slots $masters
if {$slaves} {
cluster_allocate_slaves $masters $slaves
}
assert_cluster_state ok
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $slaves
}
proc cluster_allocate_with_continuous_slots {n} {
set slot 16383
set avg [expr ($slot+1) / $n]
while {$slot >= 0} {
set node [expr $slot/$avg >= $n ? $n-1 : $slot/$avg]
lappend slots_$node $slot
incr slot -1
}
for {set j 0} {$j < $n} {incr j} {
R $j cluster addslots {*}[set slots_${j}]
}
}
# Create a cluster composed of the specified number of masters and slaves,
# but with a continuous slot range.
proc cluster_create_with_continuous_slots {masters slaves} {
cluster_allocate_with_continuous_slots $masters
if {$slaves} {
cluster_allocate_slaves $masters $slaves
}
assert_cluster_state ok
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $slaves
}
# Set the cluster node-timeout to all the reachalbe nodes.
proc set_cluster_node_timeout {to} {
foreach_valkey_id id {
catch {R $id CONFIG SET cluster-node-timeout $to}
}
}
# Check if the cluster is writable and readable. Use node "id"
# as a starting point to talk with the cluster.
proc cluster_write_test {id} {
set prefix [randstring 20 20 alpha]
set port [get_instance_attrib valkey $id port]
set cluster [valkey_cluster 127.0.0.1:$port]
for {set j 0} {$j < 100} {incr j} {
$cluster set key.$j $prefix.$j
}
for {set j 0} {$j < 100} {incr j} {
assert {[$cluster get key.$j] eq "$prefix.$j"}
}
$cluster close
}
# Check if cluster configuration is consistent.
# All the nodes in the cluster should show same slots configuration and have health
# state "online" to be considered as consistent.
proc cluster_config_consistent {} {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
# Check if all the nodes are online
set shards_cfg [R $j CLUSTER SHARDS]
foreach shard_cfg $shards_cfg {
set nodes [dict get $shard_cfg nodes]
foreach node $nodes {
if {[dict get $node health] ne "online"} {
return 0
}
}
}
if {$j == 0} {
set base_cfg [R $j cluster slots]
} else {
set cfg [R $j cluster slots]
if {$cfg != $base_cfg} {
return 0
}
}
}
return 1
}
# Wait for cluster configuration to propagate and be consistent across nodes.
proc wait_for_cluster_propagation {} {
wait_for_condition 1000 50 {
[cluster_config_consistent] eq 1
} else {
for {set j 0} {$j < [llength $::servers]} {incr j} {
puts "R $j cluster slots output: [R $j cluster slots]"
}
fail "cluster config did not reach a consistent state"
}
}
# Check if cluster's view of hostnames is consistent
proc are_hostnames_propagated {match_string} {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
set cfg [R $j cluster slots]
foreach node $cfg {
for {set i 2} {$i < [llength $node]} {incr i} {
if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } {
return 0
}
}
}
}
return 1
}
tests/cluster/run.tcl (-35)
@@ -1,35 +0,0 @@
# Cluster test suite. Copyright (C) 2014 Redis Ltd.
# This software is released under the BSD License. See the COPYING file for
# more information.
# Set the executable paths at project root
source tests/support/set_executable_path.tcl
cd tests/cluster
source cluster.tcl
source ../instances.tcl
source ../../support/cluster.tcl ; # Cluster client.
set ::instances_count 20 ; # How many instances we use at max.
set ::tlsdir "../../tls"
proc main {} {
parse_options
spawn_instance valkey $::valkey_base_port $::instances_count {
"cluster-enabled yes"
"appendonly yes"
"enable-protected-configs yes"
"enable-debug-command yes"
"save ''"
}
run_tests
cleanup
end_tests
}
if {[catch main e]} {
puts $::errorInfo
if {$::pause_on_error} pause_on_error
cleanup
exit 1
}
tests/cluster/tests/03-failover-loop.tcl (-117)
@@ -1,117 +0,0 @@
# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
source "../tests/includes/init-tests.tcl"
test "Create a 5 nodes cluster" {
create_cluster 5 5
}
test "Cluster is up" {
assert_cluster_state ok
}
set iterations 20
set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]]
while {[incr iterations -1]} {
set tokill [randomInt 10]
set other [expr {($tokill+1)%10}] ; # Some other instance.
set key [randstring 20 20 alpha]
set val [randstring 20 20 alpha]
set role [RI $tokill role]
if {$role eq {master}} {
set slave {}
set myid [dict get [get_myself $tokill] id]
foreach_valkey_id id {
if {$id == $tokill} continue
if {[dict get [get_myself $id] slaveof] eq $myid} {
set slave $id
}
}
if {$slave eq {}} {
fail "Unable to retrieve slave's ID for master #$tokill"
}
}
puts "--- Iteration $iterations ---"
if {$role eq {master}} {
test "Wait for slave of #$tokill to sync" {
wait_for_condition 1000 50 {
[string match {*state=online*} [RI $tokill slave0]]
} else {
fail "Slave of node #$tokill is not ok"
}
}
set slave_config_epoch [CI $slave cluster_my_epoch]
}
test "Cluster is writable before failover" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i $val:$i} err
assert {$err eq {OK}}
}
# Wait for the write to propagate to the slave if we
# are going to kill a master.
if {$role eq {master}} {
R $tokill wait 1 20000
}
}
test "Terminating node #$tokill" {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $tokill config set appendonly no
kill_instance valkey $tokill
}
if {$role eq {master}} {
test "Wait failover by #$slave with old epoch $slave_config_epoch" {
wait_for_condition 1000 50 {
[CI $slave cluster_my_epoch] > $slave_config_epoch
} else {
fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
}
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Cluster is writable again" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i:2 $val:$i:2} err
assert {$err eq {OK}}
}
}
test "Restarting node #$tokill" {
restart_instance valkey $tokill
}
test "Instance #$tokill is now a slave" {
wait_for_condition 1000 50 {
[RI $tokill role] eq {slave}
} else {
fail "Restarted instance is not a slave"
}
}
test "We can read back the value we set before" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster get $key:$i} err
assert {$err eq "$val:$i"}
catch {$cluster get $key:$i:2} err
assert {$err eq "$val:$i:2"}
}
}
}
test "Post condition: current_epoch >= my_epoch everywhere" {
foreach_valkey_id id {
assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
}
}
tests/cluster/tests/04-resharding.tcl (-196)
@@ -1,196 +0,0 @@
# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
source "../tests/includes/init-tests.tcl"
source "../../../tests/support/cli.tcl"
test "Create a 5 nodes cluster" {
create_cluster 5 5
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Enable AOF in all the instances" {
foreach_valkey_id id {
R $id config set appendonly yes
# We use "appendfsync no" because it's fast but also guarantees that
# write(2) is performed before replying to client.
R $id config set appendfsync no
}
foreach_valkey_id id {
wait_for_condition 1000 500 {
[RI $id aof_rewrite_in_progress] == 0 &&
[RI $id aof_enabled] == 1
} else {
fail "Failed to enable AOF on instance #$id"
}
}
}
# Return non-zero if the specified PID is about a process still in execution,
# otherwise 0 is returned.
proc process_is_running {pid} {
# PS should return with an error if PID is non existing,
# and catch will return non-zero. We want to return non-zero if
# the PID exists, so we invert the return value with expr not operator.
expr {![catch {exec ps -p $pid}]}
}
# Our resharding test performs the following actions:
#
# - N commands are sent to the cluster in the course of the test.
# - Every command selects a random key from key:0 to key:MAX-1.
# - The operation RPUSH key <randomvalue> is performed.
# - Tcl remembers into an array all the values pushed to each list.
# - After N/2 commands, the resharding process is started in background.
# - The test continues while the resharding is in progress.
# - At the end of the test, we wait for the resharding process to stop.
# - Finally the keys are checked to see if they contain the value they should.
set numkeys 50000
set numops 200000
set start_node_port [get_instance_attrib valkey 0 port]
set cluster [valkey_cluster 127.0.0.1:$start_node_port]
if {$::tls} {
# setup a non-TLS cluster client to the TLS cluster
set plaintext_port [get_instance_attrib valkey 0 plaintext-port]
set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
} else {
set cluster_plaintext $cluster
puts "Testing using non-TLS cluster"
}
catch {unset content}
array set content {}
set tribpid {}
test "Cluster consistency during live resharding" {
set ele 0
for {set j 0} {$j < $numops} {incr j} {
# Trigger the resharding once we execute half the ops.
if {$tribpid ne {} &&
($j % 10000) == 0 &&
![process_is_running $tribpid]} {
set tribpid {}
}
if {$j >= $numops/2 && $tribpid eq {}} {
puts -nonewline "...Starting resharding..."
flush stdout
set target [dict get [get_myself [randomInt 5]] id]
set tribpid [lindex [exec \
$::VALKEY_CLI_BIN --cluster reshard \
127.0.0.1:[get_instance_attrib valkey 0 port] \
--cluster-from all \
--cluster-to $target \
--cluster-slots 100 \
--cluster-yes \
{*}[valkeycli_tls_config "../../../tests"] \
| [info nameofexecutable] \
../tests/helpers/onlydots.tcl \
&] 0]
}
# Write random data to random list.
set listid [randomInt $numkeys]
set key "key:$listid"
incr ele
# We write both with Lua scripts and with plain commands.
# This way we are able to stress Lua -> server command invocation
# as well, that has tests to prevent Lua to write into wrong
# hash slots.
# We also use both TLS and plaintext connections.
if {$listid % 3 == 0} {
$cluster rpush $key $ele
} elseif {$listid % 3 == 1} {
$cluster_plaintext rpush $key $ele
} else {
$cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
}
lappend content($key) $ele
if {($j % 1000) == 0} {
puts -nonewline W; flush stdout
}
}
# Wait for the resharding process to end
wait_for_condition 1000 500 {
[process_is_running $tribpid] == 0
} else {
fail "Resharding is not terminating after some time."
}
wait_for_cluster_propagation
}
test "Verify $numkeys keys for consistency with logical content" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Terminate and restart all the instances" {
foreach_valkey_id id {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $id config set appendonly no
kill_instance valkey $id
restart_instance valkey $id
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Verify $numkeys keys after the restart" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Disable AOF in all the instances" {
foreach_valkey_id id {
R $id config set appendonly no
}
}
test "Verify slaves consistency" {
set verified_masters 0
foreach_valkey_id id {
set role [R $id role]
lassign $role myrole myoffset slaves
if {$myrole eq {slave}} continue
set masterport [get_instance_attrib valkey $id port]
set masterdigest [R $id debug digest]
foreach_valkey_id sid {
set srole [R $sid role]
if {[lindex $srole 0] eq {master}} continue
if {[lindex $srole 2] != $masterport} continue
wait_for_condition 1000 500 {
[R $sid debug digest] eq $masterdigest
} else {
fail "Master and slave data digest are different"
}
incr verified_masters
}
}
assert {$verified_masters >= 5}
}
test "Dump sanitization was skipped for migrations" {
set verified_masters 0
foreach_valkey_id id {
assert {[RI $id dump_payload_sanitizations] == 0}
}
}
tests/cluster/tests/07-replica-migration.tcl (-103)
@@ -1,103 +0,0 @@
# Replica migration test.
# Check that orphaned masters are joined by replicas of masters having
# multiple replicas attached, according to the migration barrier settings.
source "../tests/includes/init-tests.tcl"
# Create a cluster with 5 master and 10 slaves, so that we have 2
# slaves for each master.
test "Create a 5 nodes cluster" {
create_cluster 5 10
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Each master should have two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] == 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
test "Killing all the slaves of master #0 and #1" {
kill_instance valkey 5
kill_instance valkey 10
kill_instance valkey 6
kill_instance valkey 11
after 4000
}
foreach_valkey_id id {
if {$id < 5} {
test "Master #$id should have at least one replica" {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 1
} else {
fail "Master #$id has no replicas"
}
}
}
}
# Now test the migration to a master which used to be a slave, after
# a failver.
source "../tests/includes/init-tests.tcl"
# Create a cluster with 5 master and 10 slaves, so that we have 2
# slaves for each master.
test "Create a 5 nodes cluster" {
create_cluster 5 10
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Kill slave #7 of master #2. Only slave left is #12 now" {
kill_instance valkey 7
}
set current_epoch [CI 1 cluster_current_epoch]
test "Killing master node #2, #12 should failover" {
kill_instance valkey 2
}
test "Wait for failover" {
wait_for_condition 1000 50 {
[CI 1 cluster_current_epoch] > $current_epoch
} else {
fail "No failover detected"
}
}
test "Cluster should eventually be up again" {
assert_cluster_state ok
}
test "Cluster is writable" {
cluster_write_test 1
}
test "Instance 12 is now a master without slaves" {
assert {[RI 12 role] eq {master}}
}
# The remaining instance is now without slaves. Some other slave
# should migrate to it.
test "Master #12 should get at least one migrated replica" {
wait_for_condition 1000 50 {
[llength [lindex [R 12 role] 2]] >= 1
} else {
fail "Master #12 has no replicas"
}
}
tests/cluster/tests/12-replica-migration-2.tcl (-75)
@@ -1,75 +0,0 @@
# Replica migration test #2.
#
# Check that the status of master that can be targeted by replica migration
# is acquired again, after being getting slots again, in a cluster where the
# other masters have slaves.
source "../tests/includes/init-tests.tcl"
source "../../../tests/support/cli.tcl"
# Create a cluster with 5 master and 15 slaves, to make sure there are no
# empty masters and make rebalancing simpler to handle during the test.
test "Create a 5 nodes cluster" {
cluster_create_with_continuous_slots 5 15
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Each master should have at least two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
test "Set allow-replica-migration yes" {
foreach_valkey_id id {
R $id CONFIG SET cluster-allow-replica-migration yes
}
}
set master0_id [dict get [get_myself 0] id]
test "Resharding all the master #0 slots away from it" {
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=0 >@ stdout ]
}
test "Master #0 who lost all slots should turn into a replica without replicas" {
wait_for_condition 1000 50 {
[RI 0 role] == "slave" && [RI 0 connected_slaves] == 0
} else {
puts [R 0 info replication]
fail "Master #0 didn't turn itself into a replica"
}
}
test "Resharding back some slot to master #0" {
# Wait for the cluster config to propagate before attempting a
# new resharding.
after 10000
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=.01 \
--cluster-use-empty-masters >@ stdout]
}
test "Master #0 should re-acquire one or more replicas" {
wait_for_condition 1000 50 {
[llength [lindex [R 0 role] 2]] >= 1
} else {
fail "Master #0 has no has replicas"
}
}
tests/cluster/tests/12.1-replica-migration-3.tcl (-65)
@@ -1,65 +0,0 @@
# Replica migration test #2.
#
# Check that if 'cluster-allow-replica-migration' is set to 'no', slaves do not
# migrate when master becomes empty.
source "../tests/includes/init-tests.tcl"
source "../tests/includes/utils.tcl"
# Create a cluster with 5 master and 15 slaves, to make sure there are no
# empty masters and make rebalancing simpler to handle during the test.
test "Create a 5 nodes cluster" {
cluster_create_with_continuous_slots 5 15
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Each master should have at least two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
test "Set allow-replica-migration no" {
foreach_valkey_id id {
R $id CONFIG SET cluster-allow-replica-migration no
}
}
set master0_id [dict get [get_myself 0] id]
test "Resharding all the master #0 slots away from it" {
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
--cluster-weight ${master0_id}=0 >@ stdout ]
}
test "Wait cluster to be stable" {
wait_cluster_stable
}
test "Master #0 still should have its replicas" {
assert { [llength [lindex [R 0 role] 2]] >= 2 }
}
test "Each master should have at least two replicas attached" {
foreach_valkey_id id {
if {$id < 5} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Master #$id does not have 2 slaves as expected"
}
}
}
}
tests/cluster/tests/28-cluster-shards.tcl (-312)
@@ -1,312 +0,0 @@
source "../tests/includes/init-tests.tcl"
# Initial slot distribution.
set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
set ::slot1 [list 5460 5460 5462 10922 10925 10925]
set ::slot2 [list 10923 10924 10927 16383]
set ::slot3 [list 1001 1001]
proc cluster_create_with_split_slots {masters replicas} {
for {set j 0} {$j < $masters} {incr j} {
R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
}
if {$replicas} {
cluster_allocate_slaves $masters $replicas
}
set ::cluster_master_nodes $masters
set ::cluster_replica_nodes $replicas
}
# Get the node info with the specific node_id from the
# given reference node. Valid type options are "node" and "shard"
proc get_node_info_from_shard {id reference {type node}} {
set shards_response [R $reference CLUSTER SHARDS]
foreach shard_response $shards_response {
set nodes [dict get $shard_response nodes]
foreach node $nodes {
if {[dict get $node id] eq $id} {
if {$type eq "node"} {
return $node
} elseif {$type eq "shard"} {
return $shard_response
} else {
return {}
}
}
}
}
# No shard found, return nothing
return {}
}
proc cluster_ensure_master {id} {
if { [regexp "master" [R $id role]] == 0 } {
assert_equal {OK} [R $id CLUSTER FAILOVER]
wait_for_condition 50 100 {
[regexp "master" [R $id role]] == 1
} else {
fail "instance $id is not master"
}
}
}
test "Create a 8 nodes cluster with 4 shards" {
cluster_create_with_split_slots 4 4
}
test "Cluster should start ok" {
assert_cluster_state ok
}
test "Set cluster hostnames and verify they are propagated" {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
R $j config set cluster-announce-hostname "host-$j.com"
}
# Wait for everyone to agree about the state
wait_for_cluster_propagation
}
test "Verify information about the shards" {
set ids {}
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
lappend ids [R $j CLUSTER MYID]
}
set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]
# Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent.
for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
# Default value of 'cluster-preferred-endpoint-type' is ip.
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]
if {$::tls} {
assert_equal [get_instance_attrib valkey $i plaintext-port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
} else {
assert_equal [get_instance_attrib valkey $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
}
if {$i < 4} {
assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
} else {
assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
# Replica could be in online or loading
}
}
}
}
test "Verify no slot shard" {
# Node 8 has no slots assigned
set node_8_id [R 8 CLUSTER MYID]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
}
set node_0_id [R 0 CLUSTER MYID]
test "Kill a node and tell the replica to immediately takeover" {
kill_instance valkey 0
R 4 cluster failover force
}
# Primary 0 node should report as fail, wait until the new primary acknowledges it.
test "Verify health as fail for killed node" {
wait_for_condition 1000 50 {
"fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
} else {
fail "New primary never detected the node failed"
}
}
set primary_id 4
set replica_id 0
test "Restarting primary node" {
restart_instance valkey $replica_id
}
test "Instance #0 gets converted into a replica" {
wait_for_condition 1000 50 {
[RI $replica_id role] eq {slave}
} else {
fail "Old primary was not converted into replica"
}
}
test "Test the replica reports a loading state while it's loading" {
# Test the command is good for verifying everything moves to a happy state
set replica_cluster_id [R $replica_id CLUSTER MYID]
wait_for_condition 50 1000 {
[dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
} else {
fail "Replica never transitioned to online"
}
# Set 1 MB of data, so there is something to load on full sync
R $primary_id debug populate 1000 key 1000
# Kill replica client for primary and load new data to the primary
R $primary_id config set repl-backlog-size 100
# Set the key load delay so that it will take at least
# 2 seconds to fully load the data.
R $replica_id config set key-load-delay 4000
# Trigger event loop processing every 1024 bytes, this trigger
# allows us to send and receive cluster messages, so we are setting
# it low so that the cluster messages are sent more frequently.
R $replica_id config set loading-process-events-interval-bytes 1024
R $primary_id multi
R $primary_id client kill type replica
# populate the correct data
set num 100
set value [string repeat A 1024]
for {set j 0} {$j < $num} {incr j} {
# Use hashtag valid for shard #0
set key "{ch3}$j"
R $primary_id set $key $value
}
R $primary_id exec
# The replica should reconnect and start a full sync, it will gossip about it's health to the primary.
wait_for_condition 50 1000 {
"loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to loading"
}
# Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data.
R $replica_id CLUSTER SHARDS
R $replica_id CLUSTER SLOTS
# Speed up the key loading and verify everything resumes
R $replica_id config set key-load-delay 0
wait_for_condition 50 1000 {
"online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to online"
}
# Final sanity, the replica agrees it is online.
assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
}
test "Regression test for a crash when calling SHARDS during handshake" {
# Reset forget a node, so we can use it to establish handshaking connections
set id [R 19 CLUSTER MYID]
R 19 CLUSTER RESET HARD
for {set i 0} {$i < 19} {incr i} {
R $i CLUSTER FORGET $id
}
R 19 cluster meet 127.0.0.1 [get_instance_attrib valkey 0 port]
# This should line would previously crash, since all the outbound
# connections were in handshake state.
R 19 CLUSTER SHARDS
}
test "Cluster is up" {
assert_cluster_state ok
}
test "Shard ids are unique" {
set shard_ids {}
for {set i 0} {$i < 4} {incr i} {
set shard_id [R $i cluster myshardid]
assert_equal [dict exists $shard_ids $shard_id] 0
dict set shard_ids $shard_id 1
}
}
test "CLUSTER MYSHARDID reports same id for both primary and replica" {
for {set i 0} {$i < 4} {incr i} {
assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
assert_equal [string length [R $i cluster myshardid]] 40
}
}
test "New replica receives primary's shard id" {
#find a primary
set id 0
for {} {$id < 8} {incr id} {
if {[regexp "master" [R $id role]]} {
break
}
}
assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
}
test "CLUSTER MYSHARDID reports same shard id after shard restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i 4} {
dict set node_ids $i [R $i cluster myshardid]
kill_instance valkey $i
wait_for_condition 50 100 {
[instance_is_killed valkey $i]
} else {
fail "instance $i is not killed"
}
}
for {set i 0} {$i < 8} {incr i 4} {
restart_instance valkey $i
}
assert_cluster_state ok
for {set i 0} {$i < 8} {incr i 4} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i} {
dict set node_ids $i [R $i cluster myshardid]
}
for {set i 0} {$i < 8} {incr i} {
kill_instance valkey $i
wait_for_condition 50 100 {
[instance_is_killed valkey $i]
} else {
fail "instance $i is not killed"
}
}
for {set i 0} {$i < 8} {incr i} {
restart_instance valkey $i
}
assert_cluster_state ok
for {set i 0} {$i < 8} {incr i} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER SHARDS id response validation" {
# For each node in the cluster
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
# Get the CLUSTER SHARDS output from this node
set shards [R $i CLUSTER SHARDS]
set seen_shard_ids {}
# For each shard in the output
foreach shard $shards {
set shard_dict [dict create {*}$shard]
# 1. Verify 'id' key exists
assert {[dict exists $shard_dict id]}
set shard_id [dict get $shard_dict id]
# 2. Verify shard_id is a 40-char string
assert {[string length $shard_id] == 40}
# 3. Verify that for a given node's output, all shard IDs are unique
assert {[dict exists $seen_shard_ids $shard_id] == 0}
dict set seen_shard_ids $shard_id 1
}
}
}
tests/cluster/tests/includes/init-tests.tcl (-91)
@@ -1,91 +0,0 @@
# Initialization tests -- most units will start including this.
test "(init) Restart killed instances" {
foreach type {valkey} {
foreach_${type}_id id {
if {[get_instance_attrib $type $id pid] == -1} {
puts -nonewline "$type/$id "
flush stdout
restart_instance $type $id
}
}
}
}
test "Cluster nodes are reachable" {
foreach_valkey_id id {
# Every node should be reachable.
wait_for_condition 1000 50 {
([catch {R $id ping} ping_reply] == 0) &&
($ping_reply eq {PONG})
} else {
catch {R $id ping} err
fail "Node #$id keeps replying '$err' to PING."
}
}
}
test "Cluster nodes hard reset" {
foreach_valkey_id id {
if {$::valgrind} {
set node_timeout 10000
} else {
set node_timeout 3000
}
catch {R $id flushall} ; # May fail for readonly slaves.
R $id MULTI
R $id cluster reset hard
R $id cluster set-config-epoch [expr {$id+1}]
R $id EXEC
R $id config set cluster-node-timeout $node_timeout
R $id config set cluster-slave-validity-factor 10
R $id config set loading-process-events-interval-bytes 2097152
R $id config set key-load-delay 0
R $id config set repl-diskless-load disabled
R $id config set cluster-announce-hostname ""
R $id DEBUG DROP-CLUSTER-PACKET-FILTER -1
R $id config rewrite
}
}
# Helper function to attempt to have each node in a cluster
# meet each other.
proc join_nodes_in_cluster {} {
# Join node 0 with 1, 1 with 2, ... and so forth.
# If auto-discovery works all nodes will know every other node
# eventually.
set ids {}
foreach_valkey_id id {lappend ids $id}
for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} {
set a [lindex $ids $j]
set b [lindex $ids [expr $j+1]]
set b_port [get_instance_attrib valkey $b port]
R $a cluster meet 127.0.0.1 $b_port
}
foreach_valkey_id id {
wait_for_condition 1000 50 {
[llength [get_cluster_nodes $id connected]] == [llength $ids]
} else {
return 0
}
}
return 1
}
test "Cluster Join and auto-discovery test" {
# Use multiple attempts since sometimes nodes timeout
# while attempting to connect.
for {set attempts 3} {$attempts > 0} {incr attempts -1} {
if {[join_nodes_in_cluster] == 1} {
break
}
}
if {$attempts == 0} {
fail "Cluster failed to form full mesh"
}
}
test "Before slots allocation, all nodes report cluster failure" {
assert_cluster_state fail
}
tests/cluster/tests/includes/utils.tcl (-36)
@@ -1,36 +0,0 @@
source "../../../tests/support/cli.tcl"
proc config_set_all_nodes {keyword value} {
foreach_valkey_id id {
R $id config set $keyword $value
}
}
proc fix_cluster {addr} {
set code [catch {
exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster fix $addr << yes
} result]
if {$code != 0} {
puts "valkey-cli --cluster fix returns non-zero exit code, output below:\n$result"
}
# Note: valkey-cli --cluster fix may return a non-zero exit code if nodes don't agree,
# but we can ignore that and rely on the check below.
assert_cluster_state ok
wait_for_condition 100 100 {
[catch {exec $::VALKEY_CLI_BIN {*}[valkeycli_tls_config "../../../tests"] --cluster check $addr} result] == 0
} else {
puts "valkey-cli --cluster check returns non-zero exit code, output below:\n$result"
fail "Cluster could not settle with configuration"
}
}
proc wait_cluster_stable {} {
wait_for_condition 1000 50 {
[catch {exec $::VALKEY_CLI_BIN --cluster \
check 127.0.0.1:[get_instance_attrib valkey 0 port] \
{*}[valkeycli_tls_config "../../../tests"] \
}] == 0
} else {
fail "Cluster doesn't stabilize"
}
}
.gitignore (-1)
@@ -1 +0,0 @@
*
tests/support/cluster_util.tcl (+1)
@@ -148,6 +148,7 @@ proc wait_for_cluster_size {cluster_size} {
# Check that cluster nodes agree about "state", or raise an error.
proc wait_for_cluster_state {state} {
for {set j 0} {$j < [llength $::servers]} {incr j} {
if {![process_is_alive [srv -$j pid]]} continue
if {[process_is_paused [srv -$j pid]]} continue
wait_for_condition 1000 50 {
[CI $j cluster_state] eq $state
tests/unit/cluster/cluster-shards.tcl (+286)
@@ -53,3 +53,289 @@ start_cluster 3 3 {tags {external:skip cluster}} {
assert_equal $shard_0_slot_coverage [dict get [get_node_info_from_shard $node_0_id $validation_node "shard"] "slots"]
}
}
# Initial slot distribution for split-slot cluster tests.
set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926]
set ::slot1 [list 5460 5460 5462 10922 10925 10925]
set ::slot2 [list 10923 10924 10927 16383]
set ::slot3 [list 1001 1001]
# Slot allocator: assigns split slots to each master.
proc split_slot_allocation {masters replicas} {
for {set j 0} {$j < $masters} {incr j} {
R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}]
}
}
# Replica allocator: allocates only masters replicas, leaving the last server
# (R 8) as a standalone no-slot node for testing purposes.
proc split_slot_replica_allocation {masters replicas} {
cluster_allocate_replicas $masters [expr {$replicas - 1}]
}
proc cluster_ensure_master {id} {
if { [regexp "master" [R $id role]] == 0 } {
assert_equal {OK} [R $id CLUSTER FAILOVER]
wait_for_condition 50 100 {
[regexp "master" [R $id role]] == 1
} else {
fail "instance $id is not master"
}
}
}
# start_cluster 4 masters + 5 nodes (4 replicas + 1 standalone R8)
start_cluster 4 5 {tags {external:skip cluster}} {
# cluster_master_nodes and cluster_replica_nodes refer to the active cluster members.
set ::cluster_master_nodes 4
set ::cluster_replica_nodes 4
test "Cluster should start ok" {
wait_for_cluster_state ok
}
test "Set cluster hostnames and verify they are propagated" {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
R $j config set cluster-announce-hostname "host-$j.com"
}
# Wait for everyone to agree about the state
wait_for_cluster_propagation
}
test "Verify information about the shards" {
set ids {}
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
lappend ids [R $j CLUSTER MYID]
}
set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3]
# Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent.
for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} {
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots]
assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname]
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip]
# Default value of 'cluster-preferred-endpoint-type' is ip.
assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint]
if {$::tls} {
assert_equal [srv [expr -1*$i] pport] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port]
} else {
assert_equal [srv [expr -1*$i] port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port]
}
if {$i < 4} {
assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health]
} else {
assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role]
# Replica could be in online or loading
}
}
}
}
test "Verify no slot shard" {
# R 8 is a standalone node with no slots assigned (left standalone by split_slot_replica_allocation)
set node_8_id [R 8 CLUSTER MYID]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots]
assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots]
}
set node_0_id [R 0 CLUSTER MYID]
test "Kill a node and tell the replica to immediately takeover" {
pause_process [srv 0 pid]
R 4 cluster failover force
}
# Primary 0 node should report as fail, wait until the new primary acknowledges it.
test "Verify health as fail for killed node" {
wait_for_condition 1000 50 {
"fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"]
} else {
fail "New primary never detected the node failed"
}
}
set primary_id 4
set replica_id 0
test "Restarting primary node" {
restart_server [expr -1*$replica_id] true false
}
test "Instance #0 gets converted into a replica" {
wait_for_condition 1000 50 {
[s [expr -1*$replica_id] role] eq {slave}
} else {
fail "Old primary was not converted into replica"
}
}
test "Test the replica reports a loading state while it's loading" {
# Test the command is good for verifying everything moves to a happy state
set replica_cluster_id [R $replica_id CLUSTER MYID]
wait_for_condition 50 1000 {
[dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online"
} else {
fail "Replica never transitioned to online"
}
# Set 1 MB of data, so there is something to load on full sync
R $primary_id debug populate 1000 key 1000
# Kill replica client for primary and load new data to the primary
R $primary_id config set repl-backlog-size 100
# Set the key load delay so that it will take at least
# 2 seconds to fully load the data.
R $replica_id config set key-load-delay 4000
# Trigger event loop processing every 1024 bytes, this trigger
# allows us to send and receive cluster messages, so we are setting
# it low so that the cluster messages are sent more frequently.
R $replica_id config set loading-process-events-interval-bytes 1024
R $primary_id multi
R $primary_id client kill type replica
# populate the correct data
set num 100
set value [string repeat A 1024]
for {set j 0} {$j < $num} {incr j} {
# Use hashtag valid for shard #0
set key "{ch3}$j"
R $primary_id set $key $value
}
R $primary_id exec
# The replica should reconnect and start a full sync, it will gossip about it's health to the primary.
wait_for_condition 50 1000 {
"loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to loading"
}
# Verify cluster shards and cluster slots (deprecated) API responds while the node is loading data.
R $replica_id CLUSTER SHARDS
R $replica_id CLUSTER SLOTS
# Speed up the key loading and verify everything resumes
R $replica_id config set key-load-delay 0
wait_for_condition 50 1000 {
"online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health]
} else {
fail "Replica never transitioned to online"
}
# Final sanity, the replica agrees it is online.
assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health]
}
test "Regression test for a crash when calling SHARDS during handshake" {
# Use R 8 (standalone node) to establish handshaking connections
set id [R 8 CLUSTER MYID]
R 8 CLUSTER RESET HARD
for {set i 0} {$i < 8} {incr i} {
R $i CLUSTER FORGET $id
}
R 8 cluster meet 127.0.0.1 [srv 0 port]
# This line would previously crash, since all the outbound
# connections were in handshake state.
R 8 CLUSTER SHARDS
}
test "Cluster is up" {
wait_for_cluster_state ok
}
test "Shard ids are unique" {
set shard_ids {}
for {set i 0} {$i < 4} {incr i} {
set shard_id [R $i cluster myshardid]
assert_equal [dict exists $shard_ids $shard_id] 0
dict set shard_ids $shard_id 1
}
}
test "CLUSTER MYSHARDID reports same id for both primary and replica" {
for {set i 0} {$i < 4} {incr i} {
assert_equal [R $i cluster myshardid] [R [expr $i+4] cluster myshardid]
assert_equal [string length [R $i cluster myshardid]] 40
}
}
test "New replica receives primary's shard id" {
# find a primary
set id 0
for {} {$id < 8} {incr id} {
if {[regexp "master" [R $id role]]} {
break
}
}
assert_not_equal [R 8 cluster myshardid] [R $id cluster myshardid]
assert_equal {OK} [R 8 cluster replicate [R $id cluster myid]]
assert_equal [R 8 cluster myshardid] [R $id cluster myshardid]
}
test "CLUSTER MYSHARDID reports same shard id after shard restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i 4} {
dict set node_ids $i [R $i cluster myshardid]
pause_process [srv [expr -1*$i] pid]
}
for {set i 0} {$i < 8} {incr i 4} {
restart_server [expr -1*$i] true false
}
wait_for_cluster_state ok
for {set i 0} {$i < 8} {incr i 4} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER MYSHARDID reports same shard id after cluster restart" {
set node_ids {}
for {set i 0} {$i < 8} {incr i} {
dict set node_ids $i [R $i cluster myshardid]
}
for {set i 0} {$i < 8} {incr i} {
pause_process [srv [expr -1*$i] pid]
}
for {set i 0} {$i < 8} {incr i} {
restart_server [expr -1*$i] true false
}
wait_for_cluster_state ok
for {set i 0} {$i < 8} {incr i} {
assert_equal [dict get $node_ids $i] [R $i cluster myshardid]
}
}
test "CLUSTER SHARDS id response validation" {
# For each node in the cluster
for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} {
# Get the CLUSTER SHARDS output from this node
set shards [R $i CLUSTER SHARDS]
set seen_shard_ids {}
# For each shard in the output
foreach shard $shards {
set shard_dict [dict create {*}$shard]
# 1. Verify 'id' key exists
assert {[dict exists $shard_dict id]}
set shard_id [dict get $shard_dict id]
# 2. Verify shard_id is a 40-char string
assert {[string length $shard_id] == 40}
# 3. Verify that for a given node's output, all shard IDs are unique
assert {[dict exists $seen_shard_ids $shard_id] == 0}
dict set seen_shard_ids $shard_id 1
}
}
}
} split_slot_allocation split_slot_replica_allocation
tests/unit/cluster/failover.tcl (+105)
@@ -152,3 +152,108 @@ start_cluster 3 1 {tags {external:skip cluster}} {
}
}
} ;# start_cluster
# Failover stress test.
# In this test a different node is killed in a loop for N
# iterations. The test checks that certain properties
# are preserved across iterations.
start_cluster 5 5 {tags {external:skip cluster}} {
set iterations 10
set cluster [valkey_cluster 127.0.0.1:[srv 0 port]]
while {[incr iterations -1]} {
set tokill [randomInt 10]
set other [expr {($tokill+1)%10}] ; # Some other instance.
set key [randstring 20 20 alpha]
set val [randstring 20 20 alpha]
set role [s [expr -1*$tokill] role]
if {$role eq {master}} {
set slave {}
set myid [dict get [cluster_get_myself $tokill] id]
for {set id 0} {$id < [llength $::servers]} {incr id} {
if {$id == $tokill} continue
if {[dict get [cluster_get_myself $id] slaveof] eq $myid} {
set slave $id
}
}
if {$slave eq {}} {
fail "Unable to retrieve slave's ID for master #$tokill"
}
}
if {$role eq {master}} {
test "Wait for slave of #$tokill to sync" {
wait_for_condition 1000 50 {
[string match {*state=online*} [s [expr -1*$tokill] slave0]]
} else {
fail "Slave of node #$tokill is not ok"
}
}
set slave_config_epoch [CI $slave cluster_my_epoch]
}
test "Cluster is writable before failover" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i $val:$i} err
assert {$err eq {OK}}
}
# Wait for the write to propagate to the slave if we
# are going to kill a master.
if {$role eq {master}} {
R $tokill wait 1 20000
}
}
test "Terminating node #$tokill" {
catch {R $tokill shutdown nosave}
}
if {$role eq {master}} {
test "Wait failover by #$slave with old epoch $slave_config_epoch" {
wait_for_condition 1000 50 {
[CI $slave cluster_my_epoch] > $slave_config_epoch
} else {
fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
}
}
}
test "Cluster should eventually be up again" {
wait_for_cluster_state ok
}
test "Cluster is writable again" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster set $key:$i:2 $val:$i:2} err
assert {$err eq {OK}}
}
}
test "Restarting node #$tokill" {
restart_server [expr -1*$tokill] true false
}
test "Instance #$tokill is now a slave" {
wait_for_condition 1000 50 {
[s [expr -1*$tokill] role] eq {slave}
} else {
fail "Restarted instance is not a slave"
}
}
test "We can read back the value we set before" {
for {set i 0} {$i < 100} {incr i} {
catch {$cluster get $key:$i} err
assert {$err eq "$val:$i"}
catch {$cluster get $key:$i:2} err
assert {$err eq "$val:$i:2"}
}
}
}
test "Post condition: current_epoch >= my_epoch everywhere" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
}
}
} ;# start_cluster
@@ -0,0 +1,76 @@
# Check that a primary that has lost all its slots becomes a valid target
# for replica migration again once it re-acquires slots, in a cluster where
# the other primaries have replicas.
tags {"slow valgrind:skip"} {
run_solo {cluster-replica-migration-slow} {
start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration yes}} {
test "Primary #0 should re-acquire one or more replicas" {
# Reshard all of primary #0's slots away from it
set primary0_id [dict get [cluster_get_myself 0] id]
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[srv 0 port] \
{*}[valkeycli_tls_config "./tests"] \
--cluster-weight ${primary0_id}=0 >@ stdout]
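# (A --cluster-weight of 0 tells the rebalance to move every slot off
# primary #0.)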
# Primary #0, having lost all its slots, should turn itself into a
# replica with no replicas of its own
wait_for_condition 1000 50 {
[s 0 role] eq "slave" && [s 0 connected_slaves] == 0
} else {
puts [R 0 info replication]
fail "Primary #0 didn't turn itself into a replica"
}
# Reshard some slots back to primary #0.
# Wait for the cluster config to propagate before attempting a
# new resharding.
wait_for_cluster_propagation
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[srv 0 port] \
{*}[valkeycli_tls_config "./tests"] \
--cluster-weight ${primary0_id}=.01 \
--cluster-use-empty-masters >@ stdout]
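# (--cluster-use-empty-masters lets the rebalance assign slots to
# primaries that currently own none, which is primary #0's state here.)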
wait_for_condition 1000 50 {
[llength [lindex [R 0 role] 2]] >= 1
} else {
fail "Primary #0 has no has replicas"
}
}
} ;# start_cluster
} ;# run_solo
} ;# tag
# Check that if 'cluster-allow-replica-migration' is set to 'no', replicas do not
# migrate when primary becomes empty.
tags {"slow valgrind:skip"} {
run_solo {cluster-replica-migration-slow} {
start_cluster 5 15 {tags {external:skip cluster} overrides {cluster-allow-replica-migration no}} {
test "Each primary should have at least two replicas attached" {
# Reshard all of primary #0's slots away from it
set primary0_id [dict get [cluster_get_myself 0] id]
set output [exec \
$::VALKEY_CLI_BIN --cluster rebalance \
127.0.0.1:[srv 0 port] \
{*}[valkeycli_tls_config "./tests"] \
--cluster-weight ${primary0_id}=0 >@ stdout]
wait_for_cluster_propagation
wait_for_cluster_state "ok"
# Primary #0 should still have its replicas
assert { [llength [lindex [R 0 role] 2]] >= 2 }
# Each primary should have at least two replicas attached
for {set id 0} {$id < 5} {incr id} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 2
} else {
fail "Primary #$id does not have 2 replicas as expected"
}
}
}
} ;# start_cluster
} ;# run_solo
} ;# tag
+70
View File
@@ -433,3 +433,73 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
assert_equal [R 2 dbsize] 0
}
} my_slot_allocation cluster_allocate_replicas ;# start_cluster
# Replica migration test.
# Check that orphaned primaries are joined by replicas of primaries having
# multiple replicas attached, according to the migration barrier settings.
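# (The relevant setting is cluster-migration-barrier: a replica migrates
# away only if its own primary would still keep at least that many replicas.)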
start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
# Killing all the replicas of primary #0 and #1
pause_process [srv -5 pid]
pause_process [srv -10 pid]
pause_process [srv -6 pid]
pause_process [srv -11 pid]
# Primaries #0 and #1 are now orphaned and should attract replica migration.
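# The six replicas polled below (srv indices -7..-9 and -12..-14) are the
# ones left running; exactly two of them, one per orphaned primary, are
# expected to log the migration message.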
wait_for_condition 1000 50 {
[expr {[count_log_message -7 "Migrating to orphaned primary"] +
[count_log_message -8 "Migrating to orphaned primary"] +
[count_log_message -9 "Migrating to orphaned primary"] +
[count_log_message -12 "Migrating to orphaned primary"] +
[count_log_message -13 "Migrating to orphaned primary"] +
[count_log_message -14 "Migrating to orphaned primary"]}] == 2
} else {
fail "Replicas did not trigger replica migration "
}
# Primary should have at least one replica"
for {set id 0} {$id < 5} {incr id} {
wait_for_condition 1000 50 {
[llength [lindex [R $id role] 2]] >= 1
} else {
fail "Primary #$id has no replicas"
}
}
resume_process [srv -5 pid]
resume_process [srv -10 pid]
resume_process [srv -6 pid]
resume_process [srv -11 pid]
} ;# start_cluster
# Now test the migration to a primary which used to be a replica, after
# a failover.
start_cluster 5 10 {tags {external:skip cluster} overrides {shutdown-timeout 0}} {
test "Primary #12 should get at least one migrated replica" {
# Kill replica #7 of primary #2. The only replica left is #12 now.
pause_process [srv -7 pid]
# Killing primary node #2, #12 should failover
set current_epoch [CI 1 cluster_current_epoch]
pause_process [srv -2 pid]
wait_for_condition 1000 50 {
[CI 1 cluster_current_epoch] > $current_epoch
} else {
fail "No failover detected"
}
wait_for_cluster_state ok
cluster_write_test [srv -1 port]
assert {[s -12 role] eq {master}}
# The remaining instance is now without replicas. Some other replica
# should migrate to it.
wait_for_condition 1000 50 {
[llength [lindex [R 12 role] 2]] >= 1
} else {
fail "Primary #12 has no replicas"
}
resume_process [srv -7 pid]
resume_process [srv -2 pid]
}
} ;# start_cluster
@@ -0,0 +1,185 @@
# Resharding test.
# In this test a live resharding is performed and the test checks
# that certain properties are preserved across the operation.
tags {"slow"} {
run_solo {cluster-resharding} {
start_cluster 5 5 {tags {external:skip cluster}} {
test "Enable AOF in all the instances" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
R $id config set appendonly yes
# We use "appendfsync no" because it's fast but also guarantees that
# write(2) is performed before replying to client.
R $id config set appendfsync no
}
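# Enabling appendonly at runtime kicks off an initial AOF rewrite;
# wait for it to finish on every instance before moving on.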
for {set id 0} {$id < [llength $::servers]} {incr id} {
wait_for_condition 1000 500 {
[s [expr -1*$id] aof_rewrite_in_progress] == 0 &&
[s [expr -1*$id] aof_enabled] == 1
} else {
fail "Failed to enable AOF on instance #$id"
}
}
}
# Our resharding test performs the following actions:
#
# - N commands are sent to the cluster in the course of the test.
# - Every command selects a random key from key:0 to key:MAX-1.
# - The operation RPUSH key <randomvalue> is performed.
# - Tcl remembers into an array all the values pushed to each list.
# - After N/2 commands, the resharding process is started in background.
# - The test continues while the resharding is in progress.
# - At the end of the test, we wait for the resharding process to stop.
# - Finally the keys are checked to see if they contain the value they should.
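# A single iteration of the consistency protocol boils down to the
# following (a hypothetical fragment; key and value are made up):
#   $cluster rpush key:7 $ele    ;# write through the cluster client
#   lappend content(key:7) $ele  ;# mirror the write in the local Tcl array
#   # ... and at the end, [$cluster lrange key:7 0 -1] must equal $content(key:7)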
set numkeys 50000
set numops 200000
set start_node_port [srv 0 port]
set cluster [valkey_cluster 127.0.0.1:$start_node_port]
if {$::tls} {
# set up a non-TLS cluster client to the TLS cluster
set plaintext_port [srv 0 pport]
set cluster_plaintext [valkey_cluster 127.0.0.1:$plaintext_port 0]
puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
} else {
set cluster_plaintext $cluster
puts "Testing using non-TLS cluster"
}
catch {unset content}
array set content {}
set tribpid {}
test "Cluster consistency during live resharding" {
set ele 0
for {set j 0} {$j < $numops} {incr j} {
# Clear tribpid once a previously started resharding child has
# exited; the resharding itself is triggered at half the ops.
if {$tribpid ne {} &&
($j % 10000) == 0 &&
![process_is_alive $tribpid]} {
set tribpid {}
}
if {$j >= $numops/2 && $tribpid eq {}} {
puts -nonewline "...Starting resharding..."
flush stdout
set target [dict get [cluster_get_myself [randomInt 5]] id]
set tribpid [lindex [exec \
$::VALKEY_CLI_BIN --cluster reshard \
127.0.0.1:[srv 0 port] \
--cluster-from all \
--cluster-to $target \
--cluster-slots 100 \
--cluster-yes \
{*}[valkeycli_tls_config "./tests"] \
| [info nameofexecutable] \
tests/helpers/onlydots.tcl \
&] 0]
}
# Write random data to random list.
set listid [randomInt $numkeys]
set key "key:$listid"
incr ele
# We write both with Lua scripts and with plain commands.
# This way we also stress the Lua -> server command invocation
# path, which has checks that prevent Lua from writing into the
# wrong hash slots.
# We also use both TLS and plaintext connections.
if {$listid % 3 == 0} {
$cluster rpush $key $ele
} elseif {$listid % 3 == 1} {
$cluster_plaintext rpush $key $ele
} else {
$cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
}
lappend content($key) $ele
if {($j % 1000) == 0} {
puts -nonewline W; flush stdout
}
}
# Wait for the resharding process to end
wait_for_condition 1000 500 {
[process_is_alive $tribpid] == 0
} else {
fail "Resharding is not terminating after some time."
}
wait_for_cluster_propagation
}
test "Verify $numkeys keys for consistency with logical content" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Terminate and restart all the instances" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
# Stop AOF so that an initial AOFRW won't prevent the instance from terminating
R $id config set appendonly no
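# With AOF now off, take an RDB snapshot so the dataset survives the restart.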
R $id bgsave
wait_for_condition 1000 50 {
[s [expr -1*$id] rdb_bgsave_in_progress] == 0
} else {
fail "bgsave didn't finish for instance #$id"
}
restart_server [expr -1*$id] true false
}
}
test "Cluster should eventually be up again" {
wait_for_cluster_state ok
}
test "Verify $numkeys keys after the restart" {
# Check that the Cluster content matches our logical content.
foreach {key value} [array get content] {
if {[$cluster lrange $key 0 -1] ne $value} {
fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
}
}
}
test "Disable AOF in all the instances" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
R $id config set appendonly no
}
}
test "Verify slaves consistency" {
set verified_masters 0
for {set id 0} {$id < [llength $::servers]} {incr id} {
set role [R $id role]
lassign $role myrole myoffset slaves
if {$myrole eq {slave}} continue
set masterport [srv [expr -1*$id] port]
set masterdigest [R $id debug digest]
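# DEBUG DIGEST computes a digest over the whole dataset, so a replica is
# consistent with its primary exactly when the two digests match.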
for {set sid 0} {$sid < [llength $::servers]} {incr sid} {
set srole [R $sid role]
if {[lindex $srole 0] eq {master}} continue
if {[lindex $srole 2] != $masterport} continue
wait_for_condition 1000 500 {
[R $sid debug digest] eq $masterdigest
} else {
fail "Master and slave data digest are different"
}
incr verified_masters
}
}
assert {$verified_masters >= 5}
}
test "Dump sanitization was skipped for migrations" {
for {set id 0} {$id < [llength $::servers]} {incr id} {
assert {[s [expr -1*$id] dump_payload_sanitizations] == 0}
}
}
} ;# start_cluster
} ;# run_solo
} ;# tag