Fix: prevent NULL dereference crash in connectSlotExportJob when target node disappears (#3596)

### Summary

This PR fixes a NULL pointer dereference (SIGSEGV) in
`connectSlotExportJob()`
(`src/cluster_migrateslots.c`) that can crash a Valkey cluster node,
causing a
denial-of-service condition.

### Root Cause

When `CLUSTER MIGRATESLOTS` is issued, a migration job is created with
state
`SLOT_EXPORT_CONNECTING`. On the next `clusterCron()` tick,
`proceedWithSlotMigration()` calls `connectSlotExportJob()`, which looks
up the
target node via `clusterLookupNode()`.

`clusterLookupNode()` can legitimately return `NULL` — for example, if
the
target node is removed from the cluster (e.g. via `CLUSTER FORGET`)
between the
time the migration job is created and the time the cron fires. This is a
realistic race condition in any cluster topology change scenario.

The return value was **never checked**, so the subsequent call to
`getNodeDefaultReplicationPort(n)` immediately dereferences the NULL
pointer,
crashing the process:

```c
// Before fix — vulnerable
clusterNode *n = clusterLookupNode(job->target_node_name, CLUSTER_NAMELEN);
int port = getNodeDefaultReplicationPort(n);  // SIGSEGV if n == NULL
serverLog(..., n->ip, port);                  // second dereference

Signed-off-by: chenshi5012 <chenshi5012@163.com>
This commit is contained in:
chenshi
2026-05-01 07:25:58 +08:00
committed by GitHub
parent 72fc5b14b1
commit cba05103de
+9 -1
View File
@@ -1303,6 +1303,13 @@ void slotExportConnectHandler(connection *conn) {
* the job as private data. */
int connectSlotExportJob(slotMigrationJob *job) {
clusterNode *n = clusterLookupNode(job->target_node_name, CLUSTER_NAMELEN);
if (n == NULL) {
serverLog(LL_WARNING,
"Slot migration %s: target node %.40s not found in cluster, "
"aborting connection attempt.",
job->description, job->target_node_name);
return C_ERR;
}
int port = getNodeDefaultReplicationPort(n);
serverLog(LL_NOTICE, "Connecting slot migration %s (ip: %s, port %d)",
job->description,
@@ -1995,10 +2002,11 @@ void proceedWithSlotMigration(slotMigrationJob *job) {
status = proceedWithSlotExportJobConnecting(job, &completed);
}
if (status == C_ERR) {
const char *conn_err = job->conn ? connGetLastError(job->conn) : "target node not found";
sds status_msg =
sdscatfmt(sdsempty(),
"Unable to connect to target node: %s",
connGetLastError(job->conn));
conn_err);
finishSlotMigrationJob(job, SLOT_MIGRATION_JOB_FAILED,
status_msg);
sdsfree(status_msg);