valkey/tests/unit/cluster/misc.tcl
Binbin 2871efd436 Add cluster-config-save-behavior option to control nodes.conf save behavior (#3372)
This commit introduces a new configuration option `cluster-config-save-behavior`
that controls how the cluster handles nodes.conf file save failures.

The option supports two modes:
- `sync` (default): Synchronously save the config file. If the save fails,
  the process exits. This maintains backward compatibility with the old
  behavior (before 9.1).
- `best-effort`: Synchronously save the config file. If the save fails,
  only log a warning and continue running. This allows the node to survive
  disk failures (e.g., disk full, read-only filesystem) without exiting,
  giving administrators time to address the issue.
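
A minimal sketch of selecting either mode from a cluster test, mirroring the
overrides used by the tests in this file (the test body is illustrative only
and assumes the option is readable via CONFIG GET like other server configs):

    start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-config-save-behavior best-effort}} {
        test {Illustration: read back the configured save behavior} {
            assert_equal {best-effort} [lindex [R 0 CONFIG GET cluster-config-save-behavior] 1]
        }
    }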

Note that this modifies the behavior introduced in #1032: whereas #1032 made
the save "best-effort" unconditionally, we now introduce a configuration
option that defaults to "sync."
See #1032 discussion for more details.

Background:
When a disk becomes read-only or full, any cluster metadata change would
trigger a nodes.conf save attempt. With the old behavior, the node would
immediately exit via clusterSaveConfigOrDie(), potentially causing multiple
nodes on the same machine to crash simultaneously, leading to cluster
unavailability.
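
To make the save path concrete, the rewrite of nodes.conf can also be forced
on demand. A hypothetical sketch (not part of this test file) that checks the
file's modification time advances after CLUSTER SAVECONFIG:

    start_cluster 1 0 {tags {external:skip cluster}} {
        test {Illustration: nodes.conf is rewritten on demand} {
            set dir [lindex [R 0 config get dir] 1]
            set conf [lindex [R 0 config get cluster-config-file] 1]
            set path [file join $dir $conf]
            set before [file mtime $path]
            after 1100 ;# mtime granularity can be as coarse as one second
            assert_equal {OK} [R 0 CLUSTER SAVECONFIG]
            assert_morethan [file mtime $path] $before
        }
    }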

The new `best-effort` mode addresses this by allowing nodes to continue
operating even when disk writes fail. This is particularly useful in cloud
environments where disk failures are more common due to scale.

Note: Startup-time config saves (in clusterInit and verifyClusterConfigWithData)
still use clusterSaveConfigOrDie() since disk issues at startup should cause
immediate failure.

Signed-off-by: Binbin <binloveplay1314@qq.com>
2026-04-10 10:40:44 +08:00

start_cluster 2 2 {tags {external:skip cluster}} {
    test {Key lazy expires during key migration} {
        # Disable active expiration so the key can only be expired lazily on access.
        R 0 DEBUG SET-ACTIVE-EXPIRE 0

        set key_slot [R 0 CLUSTER KEYSLOT FOO]
        R 0 set FOO BAR PX 10
        set src_id [R 0 CLUSTER MYID]
        set trg_id [R 1 CLUSTER MYID]
        R 0 CLUSTER SETSLOT $key_slot MIGRATING $trg_id
        R 1 CLUSTER SETSLOT $key_slot IMPORTING $src_id

        # Once the TTL has passed, the lookup in the migrating slot lazily
        # expires the key and redirects the client with ASK.
        after 11
        assert_error {ASK*} {R 0 GET FOO}
        R 0 ping
    } {PONG}

    test "Coverage: Basic cluster commands" {
        assert_equal {OK} [R 0 CLUSTER saveconfig]
        set id [R 0 CLUSTER MYID]
        assert_equal {0} [R 0 CLUSTER count-failure-reports $id]
        R 0 flushall
        assert_equal {OK} [R 0 CLUSTER flushslots]
    }
}
start_cluster 1 1 {tags {external:skip cluster}} {
    test {Cross-slot transaction} {
        assert_equal OK [R 0 multi]
        assert_equal QUEUED [r get foo]
        assert_equal QUEUED [r get bar]
        assert_error {CROSSSLOT *} {r exec}
    }
}
# Create a folder at the cluster config file path (nodes.conf) so that the
# temp nodes.conf cannot be renamed over it, causing the cluster config file
# save to fail at the rename step.
proc create_nodes_conf_folder {srv_idx} {
    set dir [lindex [R $srv_idx config get dir] 1]
    set cluster_conf [lindex [R $srv_idx config get cluster-config-file] 1]
    set cluster_conf_path [file join $dir $cluster_conf]
    if {[file exists $cluster_conf_path]} { exec rm -f $cluster_conf_path }
    exec mkdir -p $cluster_conf_path
}
start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-config-save-behavior sync}} {
    test {cluster-config-save-behavior sync mode - node exits when config save fails} {
        # Create a folder that will make the rename fail.
        create_nodes_conf_folder 1

        # Trigger a takeover so that the cluster needs to update the config
        # file (catch, because the node may exit before replying).
        catch {R 1 cluster failover takeover}

        # Wait for R1 to exit due to the config save failure.
        wait_for_condition 1000 50 {
            [process_is_alive [srv -1 pid]] == 0
        } else {
            fail "R1 did not exit"
        }

        # Verify that the save failure and the fatal exit were logged.
        verify_log_message -1 "*Could not rename tmp cluster config file*" 0
        verify_log_message -1 "*Fatal: can't update cluster config file*" 0
    }
}
start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-config-save-behavior best-effort}} {
    test {cluster-config-save-behavior best-effort mode - node continues running when config save fails} {
        # Create folders that will make the rename fail on both nodes.
        create_nodes_conf_folder 0
        create_nodes_conf_folder 1

        # Trigger a takeover so that the cluster needs to update the config file.
        R 1 cluster failover takeover
        wait_for_condition 1000 50 {
            [s 0 role] eq {slave} &&
            [s -1 role] eq {master}
        } else {
            fail "The failover did not happen"
        }

        # Make sure both processes are still alive: we do not exit when saving
        # the config file fails.
        assert_equal {PONG} [R 0 ping]
        assert_equal {PONG} [R 1 ping]
        assert_equal 1 [process_is_alive [srv 0 pid]]
        assert_equal 1 [process_is_alive [srv -1 pid]]

        # Make sure the relevant logs were printed.
        verify_log_message 0 "*Could not rename tmp cluster config file*" 0
        verify_log_message -1 "*Could not rename tmp cluster config file*" 0
        verify_log_message 0 "*Cluster config updated even though writing the cluster config file to disk failed*" 0
        verify_log_message -1 "*Cluster config updated even though writing the cluster config file to disk failed*" 0

        # Trigger another takeover so that the cluster needs to update the
        # config file again. The rename error is logged each time, but the
        # "config updated even though writing failed" warning is not repeated
        # for every failure.
        R 0 cluster failover takeover
        wait_for_condition 1000 50 {
            [s 0 role] eq {master} &&
            [s -1 role] eq {slave}
        } else {
            fail "The failover did not happen"
        }
        assert_morethan_equal [count_log_message 0 "Could not rename tmp cluster config file"] 2
        assert_equal [count_log_message 0 "Cluster config updated even though writing the cluster config file to disk failed"] 1
        assert_morethan_equal [count_log_message -1 "Could not rename tmp cluster config file"] 2
        assert_equal [count_log_message -1 "Cluster config updated even though writing the cluster config file to disk failed"] 1
    }
}