Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 0 additions & 28 deletions .github/workflows/cron-job-its.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,34 +56,6 @@ jobs:
run: |
./it.sh ci

integration-index-tests-middleManager:
strategy:
fail-fast: false
matrix:
testing_group: [kafka-index, kafka-index-slow, kafka-transactional-index, kafka-transactional-index-slow, kafka-data-format, realtime-index]
uses: ./.github/workflows/reusable-standard-its.yml
needs: build
with:
build_jdk: 17
runtime_jdk: 17
testing_groups: -Dgroups=${{ matrix.testing_group }}
use_indexer: middleManager
group: ${{ matrix.testing_group }}

integration-index-tests-indexer:
strategy:
fail-fast: false
matrix:
testing_group: [ kafka-index, kafka-transactional-index, kafka-index-slow, kafka-transactional-index-slow, kafka-data-format ]
uses: ./.github/workflows/reusable-standard-its.yml
needs: build
with:
build_jdk: 17
runtime_jdk: 17
testing_groups: -Dgroups=${{ matrix.testing_group }}
use_indexer: indexer
group: ${{ matrix.testing_group }}

integration-query-tests-middleManager:
strategy:
fail-fast: false
Expand Down
31 changes: 0 additions & 31 deletions .github/workflows/standard-its.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,37 +42,6 @@ jobs:
core:
- '!extension*/**'

integration-index-tests-middleManager:
needs: changes
strategy:
fail-fast: false
matrix:
testing_group: [kafka-index, kafka-index-slow, kafka-transactional-index, kafka-transactional-index-slow, realtime-index]
uses: ./.github/workflows/reusable-standard-its.yml
if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
with:
build_jdk: 17
runtime_jdk: 17
testing_groups: -Dgroups=${{ matrix.testing_group }}
override_config_path: ./environment-configs/test-groups/prepopulated-data
use_indexer: middleManager
group: ${{ matrix.testing_group }}

integration-index-tests-indexer:
needs: changes
strategy:
fail-fast: false
matrix:
testing_group: [kafka-index]
uses: ./.github/workflows/reusable-standard-its.yml
if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
with:
build_jdk: 17
runtime_jdk: 17
testing_groups: -Dgroups=${{ matrix.testing_group }}
use_indexer: indexer
group: ${{ matrix.testing_group }}

integration-query-tests-middleManager:
needs: changes
strategy:
Expand Down
30 changes: 27 additions & 3 deletions docs/api-reference/supervisor-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -3598,12 +3598,33 @@ Host: http://ROUTER_IP:ROUTER_PORT

### Handoff task groups for a supervisor early

Trigger handoff for specified task groups of a supervisor early. This is a best effort API and makes no guarantees of handoff execution
Trigger handoff for specified task groups of a supervisor early. This is a best-effort API and makes no guarantees of handoff execution.

#### URL

`POST` `/druid/indexer/v1/supervisor/{supervisorId}/taskGroups/handoff`

#### Responses

<Tabs>

<TabItem value="1" label="202 ACCEPTED">

*Request has been accepted and handoff will be initiated in the background.*

</TabItem>
<TabItem value="2" label="404 NOT FOUND">

*Invalid supervisor ID or the supervisor is not running.*

</TabItem>
<TabItem value="3" label="400 BAD REQUEST">

*Supervisor does not support early handoff.*

</TabItem>
</Tabs>

#### Sample request

The following example shows how to hand off task groups for a supervisor named `social_media` that has the task groups `1`, `2`, and `3`.
Expand Down Expand Up @@ -3639,8 +3660,11 @@ Content-Type: application/json
#### Sample response

<details>
<summary>View the response</summary>
(empty response)
<summary>202 Accepted</summary>

```json
{}
```
</details>

### Shut down a supervisor
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.testing.embedded.indexing;

import com.fasterxml.jackson.core.type.TypeReference;
import org.apache.druid.indexer.TaskStatusPlus;
import org.apache.druid.indexing.overlord.supervisor.SupervisorSpec;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.rpc.RequestBuilder;
import org.jboss.netty.handler.codec.http.HttpMethod;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

public class KafkaFaultToleranceTest extends KafkaTestBase
{
// Spec of the supervisor launched for the current test; re-created in @BeforeEach
// and suspended again in @AfterEach.
private SupervisorSpec supervisorSpec = null;
// Kafka topic dedicated to the current test; created in @BeforeEach, deleted in @AfterEach.
private String topic = null;
// Running count of records published to the topic so far; @AfterEach verifies that
// exactly this many rows were ingested.
private int totalRecords = 0;

@BeforeEach
public void setupTopicAndSupervisor()
{
  // Give each test a fresh record counter, a dedicated two-partition topic,
  // and a running supervisor that reads from that topic.
  totalRecords = 0;
  topic = "topic_" + dataSource;
  kafkaServer.createTopicWithPartitions(topic, 2);

  final String supervisorId = "supe_" + dataSource;
  supervisorSpec = createSupervisor().withId(supervisorId).build(dataSource, topic);
  cluster.callApi().postSupervisor(supervisorSpec);
}

@AfterEach
public void verifyAndTearDown()
{
// First verify the test's outcome: everything published so far must have been
// ingested and the supervisor must still be healthy.
waitUntilPublishedRecordsAreIngested(totalRecords);
verifySupervisorIsRunningHealthy(supervisorSpec.getId());
// Then tear down: suspend the supervisor, drop the topic, and confirm the
// datasource row count matches the number of published records.
cluster.callApi().postSupervisor(supervisorSpec.createSuspendedSpec());
kafkaServer.deleteTopic(topic);
verifyRowCount(totalRecords);
}

@ParameterizedTest
@ValueSource(booleans = {true, false})
public void test_supervisorRecovers_afterOverlordRestart(boolean useTransactions) throws Exception
{
// Ingest an initial batch (with and without Kafka transactions) and wait until
// it is fully ingested before disrupting the cluster.
totalRecords = publish1kRecords(topic, useTransactions);
waitUntilPublishedRecordsAreIngested(totalRecords);

// Publish while the Overlord is down; these records should be picked up once it recovers.
overlord.stop();
totalRecords += publish1kRecords(topic, useTransactions);

// Restart and publish again; @AfterEach verifies that all published records were ingested.
overlord.start();
totalRecords += publish1kRecords(topic, useTransactions);
}

@Test
public void test_supervisorRecovers_afterCoordinatorRestart() throws Exception
{
// Transactional publishing only; the non-transactional path is covered by the
// Overlord and Historical restart tests.
final boolean useTransactions = true;
totalRecords = publish1kRecords(topic, useTransactions);
waitUntilPublishedRecordsAreIngested(totalRecords);

// Publish while the Coordinator is down; ingestion should continue regardless.
coordinator.stop();
totalRecords += publish1kRecords(topic, useTransactions);

// Restart and publish again; @AfterEach verifies that all published records were ingested.
coordinator.start();
totalRecords += publish1kRecords(topic, useTransactions);
}

@Test
public void test_supervisorRecovers_afterHistoricalRestart() throws Exception
{
// Non-transactional publishing only; the transactional path is covered by the
// Coordinator restart test.
final boolean useTransactions = false;
totalRecords = publish1kRecords(topic, useTransactions);
waitUntilPublishedRecordsAreIngested(totalRecords);

// Publish while the Historical is down; streaming ingestion should continue.
historical.stop();
totalRecords += publish1kRecords(topic, useTransactions);

// Restart and publish again; @AfterEach verifies that all published records were ingested.
historical.start();
totalRecords += publish1kRecords(topic, useTransactions);
}

@ParameterizedTest
@ValueSource(booleans = {true, false})
public void test_supervisorRecovers_afterSuspendResume(boolean useTransactions)
{
// Baseline ingestion before suspending the supervisor.
totalRecords = publish1kRecords(topic, useTransactions);
waitUntilPublishedRecordsAreIngested(totalRecords);

// Publish while the supervisor is suspended; the records accumulate in Kafka.
cluster.callApi().postSupervisor(supervisorSpec.createSuspendedSpec());
totalRecords += publish1kRecords(topic, useTransactions);

// Resume and publish more; @AfterEach verifies that the full record count was ingested.
cluster.callApi().postSupervisor(supervisorSpec.createRunningSpec());
totalRecords += publish1kRecords(topic, useTransactions);
}

@ParameterizedTest
@ValueSource(booleans = {true, false})
public void test_supervisorRecovers_afterChangeInTopicPartitions(boolean useTransactions)
{
totalRecords = publish1kRecords(topic, useTransactions);

kafkaServer.increasePartitionsInTopic(topic, 4);
totalRecords += publish1kRecords(topic, useTransactions);
Comment on lines +126 to +131
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test failed locally for me with transactions disabled: org.apache.druid.java.util.common.ISE: Timed out waiting for event after [60,000]ms

It seems like it's stuck in some loop from these logs:

2026-01-02T20:18:32,119 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:32,225 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.
2026-01-02T20:18:32,331 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.
2026-01-02T20:18:32,387 WARN [KafkaSupervisor-supe_datasource_nhjeoeib] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Configured task count[2] for supervisor[supe_datasource_nhjeoeib] is greater than the number of partitions[1].
2026-01-02T20:18:32,388 INFO [KafkaSupervisor-supe_datasource_nhjeoeib] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Early stop requested for supervisor[supe_datasource_nhjeoeib], signalling tasks to complete.
2026-01-02T20:18:32,438 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:32,509 WARN [KafkaSupervisor-supe_datasource_nfgacpoa] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Configured task count[2] for supervisor[supe_datasource_nfgacpoa] is greater than the number of partitions[1].
2026-01-02T20:18:32,510 INFO [KafkaSupervisor-supe_datasource_nfgacpoa] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Early stop requested for supervisor[supe_datasource_nfgacpoa], signalling tasks to complete.
2026-01-02T20:18:32,542 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.
2026-01-02T20:18:32,648 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.
2026-01-02T20:18:32,754 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.
2026-01-02T20:18:32,861 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:32,962 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:33,035 WARN [KafkaSupervisor-supe_datasource_lnedibpp] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Configured task count[2] for supervisor[supe_datasource_lnedibpp] is greater than the number of partitions[1].
2026-01-02T20:18:33,035 INFO [KafkaSupervisor-supe_datasource_lnedibpp] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Early stop requested for supervisor[supe_datasource_lnedibpp], signalling tasks to complete.
2026-01-02T20:18:33,066 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:33,169 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:33,275 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:33,378 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.
2026-01-02T20:18:33,413 WARN [KafkaSupervisor-supe_datasource_abegdknn] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Configured task count[2] for supervisor[supe_datasource_abegdknn] is greater than the number of partitions[1].
2026-01-02T20:18:33,414 INFO [KafkaSupervisor-supe_datasource_abegdknn] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Early stop requested for supervisor[supe_datasource_abegdknn], signalling tasks to complete.
2026-01-02T20:18:33,481 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [0]ms.
2026-01-02T20:18:33,577 WARN [KafkaSupervisor-supe_datasource_dpcjklpc] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Configured task count[2] for supervisor[supe_datasource_dpcjklpc] is greater than the number of partitions[1].
2026-01-02T20:18:33,578 INFO [KafkaSupervisor-supe_datasource_dpcjklpc] org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor - Early stop requested for supervisor[supe_datasource_dpcjklpc], signalling tasks to complete.
2026-01-02T20:18:33,584 INFO [org.apache.druid.metadata.SqlSegmentsMetadataManager-Exec--0] org.apache.druid.metadata.SqlSegmentsMetadataManager - Polled and found [57] segments in the database in [1]ms.

org.apache.druid.java.util.common.ISE: Timed out waiting for event after [60,000]ms

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I see, let me check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@abhishekrb19 , I tried to debug this test. The logs that you have shared occur even in success scenarios.

Could you try running with an increased timeout and see if it works then?

Copy link
Contributor

@abhishekrb19 abhishekrb19 Jan 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I tried running this test in a loop 10 times and I'm not able to reproduce the failure with the existing timeout. Iirc I noticed the failure the first time I ran it and maybe it was a one-off. We can keep an eye on the CI builds.

}

@Test
public void test_supervisorLaunchesNewTask_ifEarlyHandoff()
{
final boolean useTransactions = true;
totalRecords = publish1kRecords(topic, useTransactions);

waitUntilPublishedRecordsAreIngested(totalRecords);

final Set<String> taskIdsBeforeHandoff = getRunningTaskIds(dataSource);
Assertions.assertFalse(taskIdsBeforeHandoff.isEmpty());

final String path = StringUtils.format(
"/druid/indexer/v1/supervisor/%s/taskGroups/handoff",
supervisorSpec.getId()
);
cluster.callApi().serviceClient().onLeaderOverlord(
mapper -> new RequestBuilder(HttpMethod.POST, path)
.jsonContent(mapper, Map.of("taskGroupIds", List.of(0, 1))),
new TypeReference<>() {}
);

// Wait for the handoff notice to be processed
overlord.latchableEmitter().waitForEvent(
event -> event.hasMetricName("ingest/notices/time")
.hasDimension(DruidMetrics.SUPERVISOR_ID, supervisorSpec.getId())
.hasDimension("noticeType", "handoff_task_group_notice")
);

totalRecords += publish1kRecords(topic, useTransactions);
Comment on lines +156 to +162
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this test actually verify that the old task(s) have completed successfully on early handoff and new ones have been spun?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think I meant to do that initially but then just lazied out 😛 . Let me see if I can add that verification too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated.

waitUntilPublishedRecordsAreIngested(totalRecords);

// Verify that the running task IDs have changed
final Set<String> taskIdsAfterHandoff = getRunningTaskIds(dataSource);
Assertions.assertFalse(taskIdsAfterHandoff.isEmpty());
Assertions.assertFalse(taskIdsBeforeHandoff.stream().anyMatch(taskIdsAfterHandoff::contains));
}

/**
 * Returns the IDs of all currently-running tasks for the given datasource,
 * as reported by the cluster API.
 */
private Set<String> getRunningTaskIds(String dataSource)
{
  final List<TaskStatusPlus> runningTasks = cluster.callApi().getTasks(dataSource, "running");
  return runningTasks.stream().map(TaskStatusPlus::getId).collect(Collectors.toSet());
}
}
Loading
Loading