Fix some flaky tests (#3430)

Fixing multiple flaky tests.

    slave buffer are counted correctly in tests/unit/maxmemory.tcl
    Memory efficiency with values in range * in tests/unit/memefficiency.tcl

These tests send large numbers of pipelined commands using deferring
clients without reading replies, causing the server's client output
buffer to grow. On slow CI runners, this leads to TCP backpressure and
I/O errors that crash the test runner. Fix: Use CLIENT REPLY OFF to
suppress reply generation, matching the pattern from commit 87d2330c22.

---

    Sub-replica reports zero repl offset and rank, and fails to win election
    in tests/unit/cluster/replica-migration.tcl
    New non-empty replica reports zero repl offset and rank, and fails to
    win election in tests/unit/cluster/replica-migration.tcl

In the replica-migration tests, a MOVED error results in a Tcl
exception. After failover, the wait_for_condition blocks issue GET
commands to cluster nodes that may not have fully updated their slot
routing. An unhandled MOVED exception crashes the test runner. Fix: Wrap
the condition in catch so MOVED errors are retried. Also wrap the debug
prints in the else clause in catch. Fixes the tests listed above.

---

    Replica can update the config epoch when trigger the failover -
    automatic in tests/unit/cluster/failover2.tcl

Increase wait timeout for failover expiry. The test waits 10 seconds for
"Failover attempt expired", but the default cluster-node-timeout in
start_cluster is 3000ms, making auth_timeout 6 seconds plus ~3
seconds for failure detection — barely fitting in 10 seconds and failing
on slow CI runners. Fix: Increase wait from 1000×10ms (10 seconds) to
1200×50ms (60 seconds).

---

    dual-channel-replication lazyfree test

The test looks up the replica's main-channel connection id after writing
50MB of data. On slow CI runners, the replica connection may have been
disconnected by the output buffer soft limit (64MB/60s) before the
lookup, causing get_client_id_by_last_cmd to return empty. Two changes:

1. Move the connection id lookup before the write loop, while the sync
   is known to be in progress.
2. Reduce writes from 50 x 1MB to 10 x 1MB. The test only needs enough
   data to exceed the lazyfree threshold (64 blocks ~= 1MB). 10MB is
   sufficient and avoids approaching the output buffer limit.

---------

Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
Viktor Söderqvist
2026-04-02 10:51:26 +02:00
committed by GitHub
parent 8bb8d9168f
commit f3b6470502
5 changed files with 49 additions and 34 deletions
+10 -6
View File
@@ -1404,15 +1404,17 @@ test "Test dual-channel-replication replica can lazyfree the local buffer" {
fail "replica didn't start sync session in time"
}
# Get the main channel connection id while sync is still in progress.
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
assert_not_equal $replica_main_conn_id ""
# Adding more data to replica local buffer
set bigstr [string repeat x 1000000]
for {set j 0} {$j < 50} {incr j} {
for {set j 0} {$j < 10} {incr j} {
$primary set key $bigstr
}
# Kill the main channel so that the replica will abort the sync
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
assert_not_equal $replica_main_conn_id ""
$primary client kill id $replica_main_conn_id
# Wait for replica to abort the sync and lazyfree the local buffer.
@@ -1453,15 +1455,17 @@ test "Test dual-channel-replication replica can lazyfree the local buffer" {
fail "replica didn't start sync session in time"
}
# Get the main channel connection id while sync is still in progress.
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
assert_not_equal $replica_main_conn_id ""
# Adding more data to replica local buffer
set bigstr [string repeat x 1000000]
for {set j 0} {$j < 50} {incr j} {
for {set j 0} {$j < 10} {incr j} {
$primary set key $bigstr
}
# Kill the main channel so that the replica will abort the sync
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
assert_not_equal $replica_main_conn_id ""
$primary client kill id $replica_main_conn_id
# Wait for replica to abort the sync and lazyfree the local buffer.
+6 -6
View File
@@ -181,17 +181,17 @@ proc test_replica_config_epoch_failover {type} {
# Make sure both the automatic and the manual failover will fail in the first time.
if {$type == "automatic"} {
wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1000 10
wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1200 50
} elseif {$type == "manual"} {
R 3 cluster failover force
wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 10
wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1200 50
}
# Make sure the primaries prints the relevant logs.
wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1200 50
wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1200 50
wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1200 50
wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1200 50
# Make sure the replica has updated the config epoch.
wait_for_condition 1000 10 {
+22 -16
View File
@@ -106,13 +106,15 @@ proc test_migrated_replica {type} {
R 3 readonly
R 7 readonly
wait_for_condition 1000 50 {
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
[catch {expr {
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
}} result] == 0 && $result
} else {
puts "R 3: [R 3 keys *]"
puts "R 4: [R 4 keys *]"
puts "R 7: [R 7 keys *]"
catch {puts "R 3: [R 3 keys *]"}
catch {puts "R 4: [R 4 keys *]"}
catch {puts "R 7: [R 7 keys *]"}
fail "Key not consistent"
}
@@ -203,11 +205,13 @@ proc test_nonempty_replica {type} {
# Make sure the key exists and is consistent.
R 7 readonly
wait_for_condition 1000 50 {
[R 4 get key_991803] == 1024 &&
[R 7 get key_991803] == 1024
[catch {expr {
[R 4 get key_991803] == 1024 &&
[R 7 get key_991803] == 1024
}} result] == 0 && $result
} else {
puts "R 4: [R 4 get key_991803]"
puts "R 7: [R 7 get key_991803]"
catch {puts "R 4: [R 4 get key_991803]"}
catch {puts "R 7: [R 7 get key_991803]"}
fail "Key not consistent"
}
@@ -327,13 +331,15 @@ proc test_sub_replica {type} {
R 3 readonly
R 7 readonly
wait_for_condition 1000 50 {
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
[catch {expr {
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
}} result] == 0 && $result
} else {
puts "R 3: [R 3 keys *]"
puts "R 4: [R 4 keys *]"
puts "R 7: [R 7 keys *]"
catch {puts "R 3: [R 3 keys *]"}
catch {puts "R 4: [R 4 keys *]"}
catch {puts "R 7: [R 7 keys *]"}
fail "Key not consistent"
}
+6 -3
View File
@@ -336,12 +336,15 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
# send some 10mb worth of commands that don't increase the memory usage
if {$pipeline == 1} {
set rd_master [valkey_deferring_client -1]
$rd_master client reply off
$rd_master flush
for {set k 0} {$k < $cmd_count} {incr k} {
$rd_master setrange key:0 0 [string repeat A $payload_len]
if {$k % 10000 == 0} {$rd_master flush}
}
for {set k 0} {$k < $cmd_count} {incr k} {
$rd_master read
}
$rd_master client reply on
$rd_master flush
$rd_master read ;# read the +OK from CLIENT REPLY ON
} else {
for {set k 0} {$k < $cmd_count} {incr k} {
$master setrange key:0 0 [string repeat A $payload_len]
+5 -3
View File
@@ -1,6 +1,8 @@
proc test_memory_efficiency {range} {
r flushall
set rd [valkey_deferring_client]
$rd client reply off
$rd flush
set base_mem [s used_memory]
set written 0
for {set j 0} {$j < 10000} {incr j} {
@@ -11,9 +13,9 @@ proc test_memory_efficiency {range} {
incr written [string length $val]
incr written 2 ;# A separator is the minimum to store key-value data.
}
for {set j 0} {$j < 10000} {incr j} {
$rd read ; # Discard replies
}
$rd client reply on
$rd flush
$rd read ;# read the +OK from CLIENT REPLY ON
set current_mem [s used_memory]
set used [expr {$current_mem-$base_mem}]