mirror of
https://github.com/valkey-io/valkey.git
synced 2026-05-06 13:36:47 -04:00
Fix some flaky tests (#3430)
Fixing multiple flaky tests.
slave buffer are counted correctly in tests/unit/maxmemory.tcl
Memory efficiency with values in range * in tests/unit/memefficiency.tcl
These tests send large numbers of pipelined commands using deferring
clients without reading replies, causing the server's client output
buffer to grow. On slow CI runners, this leads to TCP backpressure and
I/O errors that crash the test runner. Fix: Use CLIENT REPLY OFF to
suppress reply generation, matching the pattern from commit 87d2330c22.
---
Sub-replica reports zero repl offset and rank, and fails to win election
in tests/unit/cluster/replica-migration.tcl
New non-empty replica reports zero repl offset and rank, and fails to
win election in tests/unit/cluster/replica-migration.tcl
In the replica-migration tests, a MOVED error results in a Tcl
exception. After failover, wait_for_condition blocks issue GET commands
to cluster nodes that may not have fully updated their slot routing. An
unhandled MOVED exception crashes the test runner. Fix: Wrap the
condition in catch so MOVED errors are retried. Also wrap debug prints
in the else clause. This fixes the tests listed above.
---
Replica can update the config epoch when trigger the failover -
automatic in tests/unit/cluster/failover2.tcl
Increase wait timeout for failover expiry. The test waits 10 seconds for
"Failover attempt expired", but the default cluster-node-timeout in
start_cluster is 3000ms, making auth_timeout 6 seconds plus ~3
seconds for failure detection — barely fitting in 10 seconds and failing
on slow CI runners. Fix: Increase wait from 1000×10ms (10 seconds) to
1200×50ms (60 seconds).
---
dual-channel-replication lazyfree test
The test looks up the replica's main-channel connection id after writing
50MB of data. On slow CI runners, the replica connection may have been
disconnected by the output buffer soft limit (64MB/60s) before the
lookup, causing get_client_id_by_last_cmd to return empty. Two changes:
1. Move the connection id lookup before the write loop, while the sync
is known to be in progress.
2. Reduce writes from 50 x 1MB to 10 x 1MB. The test only needs enough
data to exceed the lazyfree threshold (64 blocks ~= 1MB). 10MB is
sufficient and avoids approaching the output buffer limit.
---------
Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
committed by
GitHub
parent
8bb8d9168f
commit
f3b6470502
@@ -1404,15 +1404,17 @@ test "Test dual-channel-replication replica can lazyfree the local buffer" {
|
||||
fail "replica didn't start sync session in time"
|
||||
}
|
||||
|
||||
# Get the main channel connection id while sync is still in progress.
|
||||
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
|
||||
assert_not_equal $replica_main_conn_id ""
|
||||
|
||||
# Adding more data to replica local buffer
|
||||
set bigstr [string repeat x 1000000]
|
||||
for {set j 0} {$j < 50} {incr j} {
|
||||
for {set j 0} {$j < 10} {incr j} {
|
||||
$primary set key $bigstr
|
||||
}
|
||||
|
||||
# Kill the main channel so that the replica will abort the sync
|
||||
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
|
||||
assert_not_equal $replica_main_conn_id ""
|
||||
$primary client kill id $replica_main_conn_id
|
||||
|
||||
# Wait for replica to abort the sync and lazyfree the local buffer.
|
||||
@@ -1453,15 +1455,17 @@ test "Test dual-channel-replication replica can lazyfree the local buffer" {
|
||||
fail "replica didn't start sync session in time"
|
||||
}
|
||||
|
||||
# Get the main channel connection id while sync is still in progress.
|
||||
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
|
||||
assert_not_equal $replica_main_conn_id ""
|
||||
|
||||
# Adding more data to replica local buffer
|
||||
set bigstr [string repeat x 1000000]
|
||||
for {set j 0} {$j < 50} {incr j} {
|
||||
for {set j 0} {$j < 10} {incr j} {
|
||||
$primary set key $bigstr
|
||||
}
|
||||
|
||||
# Kill the main channel so that the replica will abort the sync
|
||||
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
|
||||
assert_not_equal $replica_main_conn_id ""
|
||||
$primary client kill id $replica_main_conn_id
|
||||
|
||||
# Wait for replica to abort the sync and lazyfree the local buffer.
|
||||
|
||||
@@ -181,17 +181,17 @@ proc test_replica_config_epoch_failover {type} {
|
||||
|
||||
# Make sure both the automatic and the manual failover will fail in the first time.
|
||||
if {$type == "automatic"} {
|
||||
wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1000 10
|
||||
wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1200 50
|
||||
} elseif {$type == "manual"} {
|
||||
R 3 cluster failover force
|
||||
wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 10
|
||||
wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1200 50
|
||||
}
|
||||
|
||||
# Make sure the primaries prints the relevant logs.
|
||||
wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
|
||||
wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
|
||||
wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
|
||||
wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
|
||||
wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1200 50
|
||||
wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1200 50
|
||||
wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1200 50
|
||||
wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1200 50
|
||||
|
||||
# Make sure the replica has updated the config epoch.
|
||||
wait_for_condition 1000 10 {
|
||||
|
||||
@@ -106,13 +106,15 @@ proc test_migrated_replica {type} {
|
||||
R 3 readonly
|
||||
R 7 readonly
|
||||
wait_for_condition 1000 50 {
|
||||
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
|
||||
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
|
||||
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
|
||||
[catch {expr {
|
||||
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
|
||||
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
|
||||
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
|
||||
}} result] == 0 && $result
|
||||
} else {
|
||||
puts "R 3: [R 3 keys *]"
|
||||
puts "R 4: [R 4 keys *]"
|
||||
puts "R 7: [R 7 keys *]"
|
||||
catch {puts "R 3: [R 3 keys *]"}
|
||||
catch {puts "R 4: [R 4 keys *]"}
|
||||
catch {puts "R 7: [R 7 keys *]"}
|
||||
fail "Key not consistent"
|
||||
}
|
||||
|
||||
@@ -203,11 +205,13 @@ proc test_nonempty_replica {type} {
|
||||
# Make sure the key exists and is consistent.
|
||||
R 7 readonly
|
||||
wait_for_condition 1000 50 {
|
||||
[R 4 get key_991803] == 1024 &&
|
||||
[R 7 get key_991803] == 1024
|
||||
[catch {expr {
|
||||
[R 4 get key_991803] == 1024 &&
|
||||
[R 7 get key_991803] == 1024
|
||||
}} result] == 0 && $result
|
||||
} else {
|
||||
puts "R 4: [R 4 get key_991803]"
|
||||
puts "R 7: [R 7 get key_991803]"
|
||||
catch {puts "R 4: [R 4 get key_991803]"}
|
||||
catch {puts "R 7: [R 7 get key_991803]"}
|
||||
fail "Key not consistent"
|
||||
}
|
||||
|
||||
@@ -327,13 +331,15 @@ proc test_sub_replica {type} {
|
||||
R 3 readonly
|
||||
R 7 readonly
|
||||
wait_for_condition 1000 50 {
|
||||
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
|
||||
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
|
||||
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
|
||||
[catch {expr {
|
||||
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
|
||||
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
|
||||
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
|
||||
}} result] == 0 && $result
|
||||
} else {
|
||||
puts "R 3: [R 3 keys *]"
|
||||
puts "R 4: [R 4 keys *]"
|
||||
puts "R 7: [R 7 keys *]"
|
||||
catch {puts "R 3: [R 3 keys *]"}
|
||||
catch {puts "R 4: [R 4 keys *]"}
|
||||
catch {puts "R 7: [R 7 keys *]"}
|
||||
fail "Key not consistent"
|
||||
}
|
||||
|
||||
|
||||
@@ -336,12 +336,15 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
|
||||
# send some 10mb worth of commands that don't increase the memory usage
|
||||
if {$pipeline == 1} {
|
||||
set rd_master [valkey_deferring_client -1]
|
||||
$rd_master client reply off
|
||||
$rd_master flush
|
||||
for {set k 0} {$k < $cmd_count} {incr k} {
|
||||
$rd_master setrange key:0 0 [string repeat A $payload_len]
|
||||
if {$k % 10000 == 0} {$rd_master flush}
|
||||
}
|
||||
for {set k 0} {$k < $cmd_count} {incr k} {
|
||||
$rd_master read
|
||||
}
|
||||
$rd_master client reply on
|
||||
$rd_master flush
|
||||
$rd_master read ;# read the +OK from CLIENT REPLY ON
|
||||
} else {
|
||||
for {set k 0} {$k < $cmd_count} {incr k} {
|
||||
$master setrange key:0 0 [string repeat A $payload_len]
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
proc test_memory_efficiency {range} {
|
||||
r flushall
|
||||
set rd [valkey_deferring_client]
|
||||
$rd client reply off
|
||||
$rd flush
|
||||
set base_mem [s used_memory]
|
||||
set written 0
|
||||
for {set j 0} {$j < 10000} {incr j} {
|
||||
@@ -11,9 +13,9 @@ proc test_memory_efficiency {range} {
|
||||
incr written [string length $val]
|
||||
incr written 2 ;# A separator is the minimum to store key-value data.
|
||||
}
|
||||
for {set j 0} {$j < 10000} {incr j} {
|
||||
$rd read ; # Discard replies
|
||||
}
|
||||
$rd client reply on
|
||||
$rd flush
|
||||
$rd read ;# read the +OK from CLIENT REPLY ON
|
||||
|
||||
set current_mem [s used_memory]
|
||||
set used [expr {$current_mem-$base_mem}]
|
||||
|
||||
Reference in New Issue
Block a user