Add cluster-config-save-behavior option to control nodes.conf save behavior (#3372)

This commit introduces a new configuration option `cluster-config-save-behavior` that controls how the cluster handles nodes.conf file save failures. The option supports two modes: - `sync` (default): Synchronously save the config file. If the save fails, the process exits. This maintains backward compatibility with the old behavior (before 9.1). - `best-effort`: Synchronously save the config file. If the save fails, only log a warning and continue running. This allows the node to survive disk failures (e.g., disk full, read-only filesystem) without exiting, giving administrators time to address the issue. Note that this modifies the behavior of #1032: whereas #1032 made the save always "best-effort", we have now introduced a configuration option that defaults to "sync". See the #1032 discussion for more details. Background: When a disk becomes read-only or full, any cluster metadata change would trigger a nodes.conf save attempt. With the old behavior, the node would immediately exit via clusterSaveConfigOrDie(), potentially causing multiple nodes on the same machine to crash simultaneously, leading to cluster unavailability. The new `best-effort` mode addresses this by allowing nodes to continue operating even when disk writes fail. This is particularly useful in cloud environments where disk failures are more common due to scale. Note: Startup-time config saves (in clusterInit and verifyClusterConfigWithData) still use clusterSaveConfigOrDie() since disk issues at startup should cause immediate failure. Signed-off-by: Binbin <binloveplay1314@qq.com>
2026-05-06 05:26:42 -04:00 · 2026-04-10 10:40:44 +08:00
parent c0289c6a72
commit 2871efd436
5 changed files with 69 additions and 3 deletions
@@ -6345,7 +6345,13 @@ void clusterBeforeSleep(void) {
    /* Save the config, possibly using fsync. */
    if (flags & CLUSTER_TODO_SAVE_CONFIG) {
        int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG;
-        clusterSaveConfigOrLog(fsync);
+        if (server.cluster_configfile_save_behavior == CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_SYNC) {
+            /* Sync mode: exit the process if saving fails. */
+            clusterSaveConfigOrDie(fsync);
+        } else if (server.cluster_configfile_save_behavior == CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_BEST_EFFORT) {
+            /* Best-effort mode: log (don't exit) if saving fails and wait for the next retry. */
+            clusterSaveConfigOrLog(fsync);
+        }
    }

    if (flags & CLUSTER_TODO_BROADCAST_ALL) {
@@ -160,6 +160,11 @@ configEnum cluster_preferred_endpoint_type_enum[] = {
    {"unknown-endpoint", CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT},
    {NULL, 0}};

+configEnum cluster_configfile_save_behavior_enum[] = {
+    {"sync", CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_SYNC},
+    {"best-effort", CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_BEST_EFFORT},
+    {NULL, 0}};
+
 configEnum propagation_error_behavior_enum[] = {
    {"ignore", PROPAGATION_ERR_BEHAVIOR_IGNORE},
    {"panic", PROPAGATION_ERR_BEHAVIOR_PANIC},
@@ -3352,6 +3357,7 @@ standardConfig static_configs[] = {
    createEnumConfig("enable-debug-command", NULL, IMMUTABLE_CONFIG, protected_action_enum, server.enable_debug_cmd, PROTECTED_ACTION_ALLOWED_NO, NULL, NULL),
    createEnumConfig("enable-module-command", NULL, IMMUTABLE_CONFIG, protected_action_enum, server.enable_module_cmd, PROTECTED_ACTION_ALLOWED_NO, NULL, NULL),
    createEnumConfig("cluster-preferred-endpoint-type", NULL, MODIFIABLE_CONFIG, cluster_preferred_endpoint_type_enum, server.cluster_preferred_endpoint_type, CLUSTER_ENDPOINT_TYPE_IP, NULL, invalidateClusterSlotsResp),
+    createEnumConfig("cluster-config-save-behavior", NULL, MODIFIABLE_CONFIG, cluster_configfile_save_behavior_enum, server.cluster_configfile_save_behavior, CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_SYNC, NULL, NULL),
    createEnumConfig("propagation-error-behavior", NULL, MODIFIABLE_CONFIG, propagation_error_behavior_enum, server.propagation_error_behavior, PROPAGATION_ERR_BEHAVIOR_IGNORE, NULL, NULL),
    createEnumConfig("shutdown-on-sigint", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, shutdown_on_sig_enum, server.shutdown_on_sigint, 0, isValidShutdownOnSigFlags, NULL),
    createEnumConfig("shutdown-on-sigterm", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, shutdown_on_sig_enum, server.shutdown_on_sigterm, 0, isValidShutdownOnSigFlags, NULL),
@@ -636,6 +636,12 @@ typedef enum {
    CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT /* Show NULL or empty */
 } cluster_endpoint_type;

+/* Cluster persist config mode. */
+typedef enum {
+    CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_SYNC = 0,    /* Perform a synchronous save, exit the process if it fails. */
+    CLUSTER_CONFIGFILE_SAVE_BEHAVIOR_BEST_EFFORT, /* Attempt to save on a "best-effort" basis, process will not exit if it fails. */
+} cluster_persist_config_mode;
+
 /* RDB active child save type. */
 #define RDB_CHILD_TYPE_NONE 0
 #define RDB_CHILD_TYPE_DISK 1   /* RDB is written to disk. */
@@ -2253,6 +2259,7 @@ struct valkeyServer {
    mstime_t cluster_ping_interval;                        /* A debug configuration for setting how often cluster nodes send ping messages. */
    int cluster_message_gossip_perc;                       /* A configuration for setting the percentage of peer nodes to be gossiped in ping/pong messages. */
    char *cluster_configfile;                              /* Cluster auto-generated config file name. */
+    int cluster_configfile_save_behavior;                  /* Cluster config file save behavior. */
    struct clusterState *cluster;                          /* State of the cluster */
    int cluster_migration_barrier;                         /* Cluster replicas migration barrier. */
    int cluster_allow_replica_migration;                   /* Automatic replica migrations to orphaned primaries and from empty primaries */
@@ -43,8 +43,29 @@ proc create_nodes_conf_folder {srv_idx} {
    exec mkdir -p $cluster_conf_path
 }

-start_cluster 1 1 {tags {external:skip cluster}} {
-    test {Fail to save the cluster configuration file will not exit the process} {
+start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-config-save-behavior sync}} {
+    test {cluster-config-save-behavior sync mode - node exits when config save fails} {
+        # Create folder that can cause the rename fail.
+        create_nodes_conf_folder 1
+
+        # Trigger a takeover so that cluster will need to update the config file.
+        catch {R 1 cluster failover takeover}
+
+        # Wait for R1 to exit due to config save failure.
+        wait_for_condition 1000 50 {
+            [process_is_alive [srv -1 pid]] == 0
+        } else {
+            fail "R1 did not exit"
+        }
+
+        # Verify that save failure and fatal exit logs were printed.
+        verify_log_message -1 "*Could not rename tmp cluster config file*" 0
+        verify_log_message -1 "*Fatal: can't update cluster config file*" 0
+    }
+}
+
+start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-config-save-behavior best-effort}} {
+    test {cluster-config-save-behavior best-effort mode - node continues running when config save fails} {
        # Create folder that can cause the rename fail.
        create_nodes_conf_folder 0
        create_nodes_conf_folder 1
@@ -1810,6 +1810,32 @@ aof-timestamp-enabled no
 #
 # cluster-config-file nodes-6379.conf

+# This option controls what happens when saving the "cluster-config-file"
+# file fails.
+#
+# When cluster metadata changes (e.g., node joins/leaves, slot migrations,
+# failovers), the cluster needs to save the updated configuration to the
+# "cluster-config-file" file.
+#
+# Available options:
+#
+# - sync (default): Synchronously save the config file. If the save fails,
+#   the process exits immediately. This is the traditional behavior that
+#   prioritizes configuration consistency.
+#
+# - best-effort: Synchronously save the config file. If the save fails,
+#   only log a warning and continue running. The node will retry saving
+#   on the next configuration change. An involuntary exit can have
+#   unexpected effects, such as the cluster going down, so this mode lets
+#   the node survive temporary disk failures, giving administrators time
+#   to address the issue without causing immediate service disruption.
+#
+# Note: The 'best-effort' mode is most useful where disk failures are common
+# (e.g., at cloud scale). However, if the disk issue persists and the node
+# restarts, it may load stale configuration data. Use with caution and ensure
+# proper monitoring.
+#
+# cluster-config-save-behavior sync
+
 # Cluster node timeout is the amount of milliseconds a node must be unreachable
 # for it to be considered in failure state.
 # Most other internal time limits are a multiple of the node timeout.