
Commit b2dc84b

murphyjacob4 authored and madolson committed
Fix crash that occurs sometimes when aborting a slot migration while child snapshot is active (#2721)
The race condition causes the client to be used, and subsequently double freed, by the slot migration pipe read handler. The order of events is:

1. We kill the slot migration child process during CANCELSLOTMIGRATIONS.
2. We then free the associated client to the target node.
3. Although we kill the child process, the pipe from child to parent is not guaranteed to be empty.
4. If the pipe is not empty, we later read it out in slotMigrationPipeReadHandler.
5. In the pipe read handler, we attempt to write to the connection. If writing to the connection fails, we attempt to free the client.
6. However, the client was already freed, so this is a double free.

Notably, the slot migration abort doesn't need to be triggered by `CANCELSLOTMIGRATIONS`; it can be any failure. To solve this, we simply:

1. Set the slot migration pipe connection to NULL whenever it is unlinked.
2. Bail out early in the slot migration pipe read handler if the connection is NULL.

I also consolidate the killSlotMigrationChild call to one code path, which is executed on client unlink. Before, there were two code paths that would do this twice (once on slot migration job finish, and once on client unlink). Sending the signal twice is fine, but inefficient.

Also, add a test that cancels during the slot migration snapshot to make sure this case is covered (we previously only caught it during the module test).

---------

Signed-off-by: Jacob Murphy <[email protected]>
(cherry picked from commit 28e5dcc)
Signed-off-by: cherukum-amazon <[email protected]>
1 parent e761433 commit b2dc84b
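For illustration, the sketch below distills the guard pattern the fix relies on: a shared pipe-connection pointer is cleared when its client is unlinked, and the pipe read handler bails out when that pointer is NULL, so leftover pipe data after an abort can no longer reach a freed client. All names and types here (connection, unlink_client, pipe_read_handler) are simplified stand-ins, not the server's actual definitions.

```c
/* Minimal sketch of the double-free guard, using simplified stand-in
 * types rather than the server's real client/connection structs. */
#include <stdio.h>
#include <stdlib.h>

typedef struct client { int fd; } client;
typedef struct connection { client *priv; } connection;

/* Shared state, analogous to server.slot_migration_pipe_conn. */
static connection *pipe_conn = NULL;

static void unlink_client(connection *conn) {
    /* Part one of the fix: clear the shared pointer before the client
     * goes away, so later handler runs see NULL, not a stale conn. */
    if (pipe_conn == conn) pipe_conn = NULL;
    free(conn->priv);
    conn->priv = NULL;
}

static void pipe_read_handler(void) {
    /* Part two of the fix: bail out early if the connection is gone. */
    if (!pipe_conn) {
        printf("leftover pipe data ignored: connection already closed\n");
        return;
    }
    printf("forwarding snapshot bytes to client fd %d\n", pipe_conn->priv->fd);
}

int main(void) {
    client *c = malloc(sizeof(*c));
    c->fd = 42;
    connection conn = { .priv = c };
    pipe_conn = &conn;

    pipe_read_handler();  /* live connection: data is forwarded */
    unlink_client(&conn); /* abort path: client freed, pointer cleared */
    pipe_read_handler();  /* residual pipe data: guard prevents use-after-free */
    return 0;
}
```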

File tree

4 files changed: +38 −22 lines changed


src/cluster_migrateslots.c

Lines changed: 0 additions & 3 deletions
@@ -2309,9 +2309,6 @@ void finishSlotMigrationJob(slotMigrationJob *job,
     /* If we finish the export, we should not remain paused */
     job->mf_end = 0;
     slotExportTryUnpause();
-    /* Fast fail the child process, which will be cleaned up fully in
-     * checkChildrenDone. */
-    if (job->state == SLOT_EXPORT_SNAPSHOTTING) killSlotMigrationChild();
 }
 
 /* Imports that are not successful on primaries need to be cleaned up (if

src/networking.c

Lines changed: 10 additions & 7 deletions
@@ -1883,8 +1883,7 @@ void unlinkClient(client *c) {
      * snapshot child may take some time to die, during which the migration will continue past
      * the snapshot state. */
     if (c->repl_data && server.rdb_pipe_conns &&
-        ((c->flag.replica && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) ||
-         (c->slot_migration_job && !isImportSlotMigrationJob(c->slot_migration_job)))) {
+        ((c->flag.replica && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END))) {
         int i;
         int still_alive = 0;
         for (i = 0; i < server.rdb_pipe_numconns; i++) {
@@ -1895,14 +1894,18 @@ void unlinkClient(client *c) {
             if (server.rdb_pipe_conns[i]) still_alive++;
         }
         if (still_alive == 0) {
-            if (c->slot_migration_job && !isImportSlotMigrationJob(c->slot_migration_job)) {
-                serverLog(LL_NOTICE, "Slot migration snapshot, migration target dropped, killing fork child.");
-            } else {
-                serverLog(LL_NOTICE, "Diskless rdb transfer, last replica dropped, killing fork child.");
-            }
+            serverLog(LL_NOTICE, "Diskless rdb transfer, last replica dropped, killing fork child.");
             killRDBChild();
         }
     }
+    /* Check if this is the slot migration client we are writing to in a
+     * child process. */
+    if (c->slot_migration_job && !isImportSlotMigrationJob(c->slot_migration_job) &&
+        server.slot_migration_pipe_conn == c->conn) {
+        server.slot_migration_pipe_conn = NULL;
+        serverLog(LL_NOTICE, "Slot migration target dropped, killing fork child.");
+        killSlotMigrationChild();
+    }
     /* Only use shutdown when the fork is active and we are the parent. */
     if (server.child_type && !c->flag.repl_rdb_channel) {
         connShutdown(c->conn);
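As an aside on the consolidation: the commit notes that sending the kill signal twice is fine, but inefficient. The hypothetical pid-guarded helper below (not the server's actual killSlotMigrationChild, which leaves reaping to checkChildrenDone) illustrates why a duplicate call is harmless — it becomes a no-op once the pid is cleared.

```c
/* Hypothetical sketch: a pid guard makes repeated kill calls no-ops.
 * The real server clears its child pid when checkChildrenDone reaps it. */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t snapshot_child_pid = -1; /* assumed global child handle */

static void kill_snapshot_child(void) {
    if (snapshot_child_pid == -1) return; /* duplicate call: nothing to do */
    kill(snapshot_child_pid, SIGKILL);    /* harmless even if already dead */
    waitpid(snapshot_child_pid, NULL, 0); /* reap the zombie */
    snapshot_child_pid = -1;
}

int main(void) {
    snapshot_child_pid = fork();
    if (snapshot_child_pid == 0) { /* child: block until killed */
        pause();
        _exit(0);
    }
    kill_snapshot_child(); /* kills and reaps exactly once */
    kill_snapshot_child(); /* second call is a no-op */
    printf("child cleaned up once; duplicate kill was a no-op\n");
    return 0;
}
```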

src/replication.c

Lines changed: 5 additions & 3 deletions
@@ -1849,15 +1849,17 @@ void slotMigrationPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *c
     UNUSED(eventLoop);
     if (!server.slot_migration_pipe_buff) server.slot_migration_pipe_buff = zmalloc(PROTO_IOBUF_LEN);
 
+    /* No work to do if our connection has been closed. */
+    if (!server.slot_migration_pipe_conn) return;
+
     while (1) {
         server.slot_migration_pipe_bufflen = read(fd, server.slot_migration_pipe_buff, PROTO_IOBUF_LEN);
         if (server.slot_migration_pipe_bufflen < 0) {
             if (errno == EAGAIN || errno == EWOULDBLOCK) return;
             serverLog(LL_WARNING, "Slot migration, read error sending snapshot to target: %s", strerror(errno));
             client *target = connGetPrivateData(server.slot_migration_pipe_conn);
-            freeClient(target);
+            freeClient(target); /* Free client will kill the slot migration child */
             server.slot_migration_pipe_conn = NULL;
-            killSlotMigrationChild();
             return;
         }
 
@@ -1879,7 +1881,7 @@ void slotMigrationPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *c
         if (connGetState(conn) != CONN_STATE_CONNECTED) {
             serverLog(LL_WARNING, "Slot migration transfer, write error sending DB to target: %s",
                       connGetLastError(conn));
-            freeClient(target);
+            freeClient(target); /* Free client will kill the slot migration child */
             server.slot_migration_pipe_conn = NULL;
             return;
         }

tests/unit/cluster/cluster-migrateslots.tcl

Lines changed: 23 additions & 9 deletions
@@ -386,6 +386,15 @@ start_cluster 3 3 {tags {logreqres:skip external:skip cluster} overrides {cluste
         set_debug_prevent_pause 0
     }
 
+    set 0_slot_tag "{06S}"
+    set 1_slot_tag "{Qi}"
+    set 5462_slot_tag "{450}"
+    set 16379_slot_tag "{YY}"
+    set 16380_slot_tag "{wu}"
+    set 16381_slot_tag "{0TG}"
+    set 16382_slot_tag "{4oi}"
+    set 16383_slot_tag "{6ZJ}"
+
     test "Test CLUSTER CANCELSLOTMIGRATIONS" {
         set_debug_prevent_pause 1
         assert_match "OK" [R 2 CLUSTER MIGRATESLOTS SLOTSRANGE 16382 16382 NODE $node0_id]
@@ -407,19 +416,24 @@ start_cluster 3 3 {tags {logreqres:skip external:skip cluster} overrides {cluste
         wait_for_migration_field 0 $jobname1 state failed
         wait_for_migration_field 0 $jobname2 state failed
 
+        # Do it again, but during snapshotting
+        # 50 keys * 100ms/key = 5 sec snapshot time
+        R 2 CONFIG SET rdb-key-save-delay 100000
+        populate 50 "$16383_slot_tag:1:" 1000 -2
+        assert_match "OK" [R 2 CLUSTER MIGRATESLOTS SLOTSRANGE 16383 16383 NODE $node0_id]
+        set jobname [get_job_name 2 16383]
+        wait_for_migration_field 2 $jobname state snapshotting
+        assert_match "OK" [R 2 CLUSTER CANCELSLOTMIGRATIONS]
+        R 2 CONFIG SET rdb-key-save-delay 0
+
+        # Jobs are no longer up, migration logs say cancelled
+        assert {[dict get [get_migration_by_name 2 $jobname] state] eq "cancelled"}
+        wait_for_migration_field 0 $jobname state failed
+
         # Cleanup
         set_debug_prevent_pause 0
     }
 
-    set 0_slot_tag "{06S}"
-    set 1_slot_tag "{Qi}"
-    set 5462_slot_tag "{450}"
-    set 16379_slot_tag "{YY}"
-    set 16380_slot_tag "{wu}"
-    set 16381_slot_tag "{0TG}"
-    set 16382_slot_tag "{4oi}"
-    set 16383_slot_tag "{6ZJ}"
-
     test "Slot migration won't migrate the functions" {
         assert_does_not_resync {
             # R 2 load a function then trigger a slot migration to R 0