Commit 1a8ab17

Merge branch 'master' into msq-compact-task-failmsg

2 parents 60e80d8 + 51a7494

259 files changed, +10765 -4248 lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 3 deletions

@@ -29,7 +29,7 @@ jobs:
     uses: ./.github/workflows/worker.yml
     with:
       script: .github/scripts/run_unit-tests -Dtest=!QTest,'${{ matrix.pattern }}' -Dmaven.test.failure.ignore=true
-      artifact_prefix: "unit-test-reports"
+      artifact_prefix: "unit-test-reports-jdk${{ matrix.jdk }}"
       jdk: ${{ matrix.jdk }}
       key: "test-jdk${{ matrix.jdk }}-[${{ matrix.pattern }}]"

@@ -43,17 +43,21 @@ jobs:
     name: "test-report"
     needs: run-unit-tests
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        jdk: [ "17", "21" ]
     steps:
       - name: Download reports for all unit test jobs
         uses: actions/download-artifact@v4
         with:
-          pattern: "unit-test-reports-*"
+          pattern: "unit-test-reports-jdk${{ matrix.jdk }}*"
           path: target/surefire-reports

       - name: Publish Test Report
         uses: mikepenz/action-junit-report@v5
         with:
-          check_name: "Unit Test Report"
+          check_name: "Unit Test Report (JDK ${{ matrix.jdk }})"
           report_paths: '**/target/surefire-reports/TEST-*.xml'
           detailed_summary: true
           flaky_summary: true

.github/workflows/cron-job-its.yml

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        testing_group: [ query, query-retry, query-error, security ]
+        testing_group: [ query, security ]
     uses: ./.github/workflows/reusable-standard-its.yml
     needs: build
     with:

.github/workflows/standard-its.yml

Lines changed: 2 additions & 60 deletions

@@ -78,7 +78,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        testing_group: [query, query-retry, query-error, security, centralized-datasource-schema]
+        testing_group: [query, security, centralized-datasource-schema]
     uses: ./.github/workflows/reusable-standard-its.yml
     if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
     with:

@@ -112,62 +112,4 @@
     testing_groups: -Dgroups=custom-coordinator-duties
     use_indexer: middleManager
     override_config_path: ./environment-configs/test-groups/custom-coordinator-duties
-    group: custom coordinator duties
-
-  integration-k8s-leadership-tests:
-    needs: changes
-    name: (Compile=openjdk17, Run=openjdk17, Cluster Build On K8s) ITNestedQueryPushDownTest integration test
-    if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
-    runs-on: ubuntu-22.04
-    env:
-      MVN: mvn --no-snapshot-updates
-      MAVEN_SKIP: -P skip-static-checks -Dweb.console.skip=true -Dmaven.javadoc.skip=true
-      CONFIG_FILE: k8s_run_config_file.json
-      IT_TEST: -Dit.test=ITNestedQueryPushDownTest
-      POD_NAME: int-test
-      POD_NAMESPACE: default
-      BUILD_DRUID_CLUSTER: true
-    steps:
-      - name: Checkout branch
-        uses: actions/checkout@v4
-
-      - name: setup java
-        uses: actions/setup-java@v4
-        with:
-          java-version: '17'
-          distribution: 'zulu'
-
-      # the build step produces SNAPSHOT artifacts into the local maven repository,
-      # we include github.sha in the cache key to make it specific to that build/jdk
-      - name: Restore Maven repository
-        id: maven-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: ~/.m2/repository
-          key: maven-${{ runner.os }}-17-${{ github.sha }}
-          restore-keys: setup-java-Linux-maven-${{ hashFiles('**/pom.xml') }}
-
-      - name: Maven build
-        if: steps.maven-restore.outputs.cache-hit != 'true'
-        run: |
-          ./it.sh ci
-
-      - name: Run IT
-        id: test
-        timeout-minutes: 90
-        run: |
-          set -x
-          mvn -B -ff install -pl '!web-console' -Pdist,bundle-contrib-exts -Pskip-static-checks,skip-tests -Dmaven.javadoc.skip=true -T1C
-          # Note: The above command relies on the correct version of the JARs being installed in the local m2 repository.
-          # For any changes, please rebuild it using the command from the previous step (./it.sh ci).
-
-          MAVEN_OPTS='-Xmx2048m' ${MVN} verify -pl integration-tests -P int-tests-config-file ${IT_TEST} ${MAVEN_SKIP} -Dpod.name=${POD_NAME} -Dpod.namespace=${POD_NAMESPACE} -Dbuild.druid.cluster=${BUILD_DRUID_CLUSTER}
-
-      - name: Debug on failure
-        if: ${{ failure() && steps.test.conclusion == 'failure' }}
-        run: |
-          for v in broker middlemanager router coordinator historical ; do
-            echo "------------------------druid-tiny-cluster-"$v"s-0-------------------------";
-            /usr/local/bin/kubectl logs --tail 1000 druid-tiny-cluster-"$v"s-0 ||:;
-            /usr/local/bin/kubectl get events | grep druid-tiny-cluster-"$v"s-0 ||:;
-          done
+    group: custom coordinator duties

docs/development/extensions-core/k8s-jobs.md

Lines changed: 40 additions & 9 deletions

@@ -33,6 +33,31 @@ Consider this an [EXPERIMENTAL](../experimental.md) feature mostly because it ha

 The K8s extension builds a pod spec for each task using the specified pod adapter. All jobs are natively restorable: they are decoupled from the Druid deployment, so restarting pods or doing upgrades has no effect on tasks in flight. They will continue to run, and when the Overlord comes back up, it will start tracking them again.

+## Kubernetes Client Mode
+
+### "Direct" K8s API Interaction per task *(Default)*
+
+Task lifecycle code in Druid talks directly to the Kubernetes API server for all operations that require interaction with the Kubernetes cluster.
+
+### `SharedInformer` "Caching" *(Experimental)*
+
+Enabled by setting `druid.indexer.runner.useK8sSharedInformers=true`, this mode uses Fabric8 `SharedInformer` objects to monitor state changes in the remote K8s cluster, reducing the number of direct calls to the Kubernetes API server. This can greatly reduce load on the API server, especially in environments with a high volume of tasks.
+
+This mode is experimental and should be used with caution in production until it has been vetted more thoroughly by the community.
+
+The core idea is to use two `SharedInformers`, one for jobs and one for pods, to watch for changes in the remote K8s cluster. These informers maintain a local cache of jobs and pods that tasks can query. The informers can also notify listeners when changes occur, allowing tasks to react to state changes without polling the API server or creating per-task watches on the K8s cluster.
+
+#### Architecture: Direct vs. Caching Mode
+
+**Key Differences:**
+
+- `DirectKubernetesPeonClient` (Default): Every read operation makes a direct HTTP call to the K8s API server. With 100 concurrent tasks, this results in 100+ active API connections with continuous polling.
+- `CachingKubernetesPeonClient` (Experimental): All read operations query an in-memory cache maintained by `SharedInformers`. With 100 concurrent tasks, only 2 persistent watch connections are used (one for Jobs, one for Pods), achieving a large reduction in API calls.
+
+**Shared Operations:**
+
+Both implementations share the same code for write operations (job creation and deletion) and log reads, which always use direct API calls.

 ## Configuration

@@ -48,9 +73,9 @@ Other configurations required are:
 Druid operators can dynamically tune certain features within this extension. You don't need to restart the Overlord
 service for these changes to take effect.

-Druid can dynamically tune [pod template selection](#pod-template-selection), which allows you to configure the pod
-template based on the task to be run. To enable dynamic pod template selection, first configure the
-[custom template pod adapter](#custom-template-pod-adapter).
+Druid can dynamically tune [pod template selection](#pod-template-selection) and [capacity](#properties), where capacity refers to `druid.indexer.runner.capacity`.
+
+Pod template selection allows you to configure the pod template based on the task to be run. To enable dynamic pod template selection, first configure the [custom template pod adapter](#custom-template-pod-adapter).

 Use the following APIs to view and update the dynamic configuration for the Kubernetes task runner.

@@ -126,7 +151,8 @@ Host: http://ROUTER_IP:ROUTER_PORT
         "type": ["index_kafka"]
       }
     ]
-  }
+  },
+  "capacity": 12
 }
 ```
 </details>

@@ -135,6 +161,8 @@ Host: http://ROUTER_IP:ROUTER_PORT

 Updates the dynamic configuration for the Kubernetes Task Runner

+Note: Both `podTemplateSelectStrategy` and `capacity` are optional fields. A POST request may include either, both, or neither.
+
 ##### URL

 `POST` `/druid/indexer/v1/k8s/taskrunner/executionconfig`

@@ -193,7 +221,8 @@ curl "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/k8s/taskrunner/executionconf
         "type": ["index_kafka"]
       }
     ]
-  }
+  },
+  "capacity": 6
 }'
 ```

@@ -225,7 +254,8 @@ Content-Type: application/json
         "type": ["index_kafka"]
       }
     ]
-  }
+  },
+  "capacity": 6
 }
 ```

@@ -309,7 +339,7 @@ Host: http://ROUTER_IP:ROUTER_PORT
     "comment": "",
     "ip": "127.0.0.1"
   },
-  "payload": "{\"type\": \"default\",\"podTemplateSelectStrategy\":{\"type\": \"taskType\"}",
+  "payload": "{\"type\": \"default\",\"podTemplateSelectStrategy\":{\"type\": \"taskType\"},\"capacity\":6",
   "auditTime": "2024-06-13T20:59:51.622Z"
 }
 ]

@@ -790,10 +820,11 @@ Should you require the needed permissions for interacting across Kubernetes name
 | `druid.indexer.runner.annotations` | `JsonObject` | Additional annotations you want to add to the peon pod. | `{}` | No |
 | `druid.indexer.runner.peonMonitors` | `JsonArray` | Overrides `druid.monitoring.monitors`. Use this property if you don't want to inherit monitors from the Overlord. | `[]` | No |
 | `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a SIGTERM for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No |
-| `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No |
+| `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. This value is overridden if a dynamic config value has been set. | `2147483647` | No |
 | `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU microcores for the task. | `1000` | No |
 | `druid.indexer.runner.logSaveTimeout` | `Duration` | The peon executing the ingestion task makes a best effort to persist the pod logs from `k8s` to persistent task log storage. The timeout ensures that `k8s` connection issues do not cause the pod to hang indefinitely, thereby blocking Overlord operations. If the timeout occurs before the logs are saved, those logs will not be available in Druid. | `PT300S` | No |
+| `druid.indexer.runner.useK8sSharedInformers` | `boolean` | Whether to use shared informers to watch for pod/job changes. This is more efficient on the Kubernetes API server, but may use more memory in the Overlord. | `false` | No |
+| `druid.indexer.runner.k8sSharedInformerResyncPeriod` | `Duration` | When using shared informers, controls how frequently the informers resync with the Kubernetes API server. This prevents change events from being missed, keeping the informer cache clean and accurate. | `PT300S` | No |

 ### Metrics added

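The `SharedInformer` caching mode described in the documentation diff above boils down to two long-lived informers whose local caches serve all reads. Below is a minimal, hypothetical sketch of that pattern using the Fabric8 Kubernetes client; it is not the extension's actual `CachingKubernetesPeonClient` code. It assumes the `io.fabric8:kubernetes-client` dependency, and the `druid` namespace, handler bodies, and class name are illustrative (the 300-second resync mirrors the `PT300S` default).

```java
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.batch.v1.Job;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;
import io.fabric8.kubernetes.client.informers.ResourceEventHandler;
import io.fabric8.kubernetes.client.informers.SharedIndexInformer;

public class InformerCacheSketch
{
  public static void main(String[] args)
  {
    final long resyncMillis = 300_000L; // mirrors the PT300S default resync period

    try (KubernetesClient client = new KubernetesClientBuilder().build()) {
      // One informer per resource type: these are the only two persistent
      // watch connections, regardless of how many tasks are running.
      SharedIndexInformer<Job> jobInformer = client.batch().v1().jobs()
          .inNamespace("druid")
          .inform(new ResourceEventHandler<Job>()
          {
            @Override
            public void onAdd(Job job)
            {
              System.out.println("job added: " + job.getMetadata().getName());
            }

            @Override
            public void onUpdate(Job oldJob, Job newJob)
            {
              // Listeners react to state changes here instead of polling the API server.
              System.out.println("job updated: " + newJob.getMetadata().getName());
            }

            @Override
            public void onDelete(Job job, boolean deletedFinalStateUnknown)
            {
              System.out.println("job deleted: " + job.getMetadata().getName());
            }
          }, resyncMillis);

      SharedIndexInformer<Pod> podInformer = client.pods()
          .inNamespace("druid")
          .inform(new ResourceEventHandler<Pod>()
          {
            @Override
            public void onAdd(Pod pod) {}

            @Override
            public void onUpdate(Pod oldPod, Pod newPod) {}

            @Override
            public void onDelete(Pod pod, boolean deletedFinalStateUnknown) {}
          }, resyncMillis);

      // Reads are served from the informers' local caches, not the API server.
      System.out.println("cached jobs: " + jobInformer.getStore().list().size());
      System.out.println("cached pods: " + podInformer.getStore().list().size());

      jobInformer.stop();
      podInformer.stop();
    }
  }
}
```

The contrast with direct mode is the read path: here, lookups like `jobInformer.getStore().list()` cost no API round-trips, while writes (job creation and deletion) and log reads still go straight to the API server in both modes.
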
docs/multi-stage-query/reference.md

Lines changed: 2 additions & 1 deletion

@@ -395,7 +395,8 @@ The following table lists the context parameters for the MSQ task engine:
 | `finalizeAggregations` | SELECT, INSERT, REPLACE<br /><br />Determines the type of aggregation to return. If true, Druid finalizes the results of complex aggregations that directly appear in query results. If false, Druid returns the aggregation's intermediate type rather than finalized type. This parameter is useful during ingestion, where it enables storing sketches directly in Druid tables. For more information about aggregations, see [SQL aggregation functions](../querying/sql-aggregations.md). | `true` |
 | `arrayIngestMode` | INSERT, REPLACE<br /><br />Controls how ARRAY type values are stored in Druid segments. When set to `array` (recommended for SQL compliance), Druid will store all ARRAY typed values in [ARRAY typed columns](../querying/arrays.md), and supports storing both VARCHAR and numeric typed arrays. When set to `mvd` (the default, for backwards compatibility), Druid only supports VARCHAR typed arrays, and will store them as [multi-value string columns](../querying/multi-value-dimensions.md). See [`arrayIngestMode`] in the [Arrays](../querying/arrays.md) page for more details. | `mvd` (for backwards compatibility, recommended to use `array` for SQL compliance) |
 | `sqlJoinAlgorithm` | SELECT, INSERT, REPLACE<br /><br />Algorithm to use for JOIN. Use `broadcast` (the default) for broadcast hash join or `sortMerge` for sort-merge join. Affects all JOIN operations in the query. This is a hint to the MSQ engine and the actual joins in the query may proceed in a different way than specified. See [Joins](#joins) for more details. | `broadcast` |
-| `rowsInMemory` | INSERT or REPLACE<br /><br />Maximum number of rows to store in memory at once before flushing to disk during the segment generation process. Ignored for non-INSERT queries. In most cases, use the default value. You may need to override the default if you run into one of the [known issues](./known-issues.md) around memory usage. | 100,000 |
+| `maxRowsInMemory` | INSERT or REPLACE<br /><br />Maximum number of rows to store in memory at once before flushing to disk during the segment generation process. Ignored for non-INSERT queries. In most cases, use the default value. You may need to override the default if you run into one of the [known issues](./known-issues.md) around memory usage. | 100,000 |
+| `rowsInMemory` | INSERT or REPLACE<br /><br />Alternate spelling of `maxRowsInMemory`. Ignored if `maxRowsInMemory` is set. | 100,000 |
 | `segmentSortOrder` | INSERT or REPLACE<br /><br />Normally, Druid sorts rows in individual segments using `__time` first, followed by the [CLUSTERED BY](#clustered-by) clause. When you set `segmentSortOrder`, Druid uses the order from this context parameter instead. Provide the column list as comma-separated values or as a JSON array in string form.<br /><br />For example, consider an INSERT query that uses `CLUSTERED BY country` and has `segmentSortOrder` set to `__time,city,country`. Within each time chunk, Druid assigns rows to segments based on `country`, and then within each of those segments, Druid sorts those rows by `__time` first, then `city`, then `country`. | empty list |
 | `forceSegmentSortByTime` | INSERT or REPLACE<br /><br />When set to `true` (the default), Druid prepends `__time` to [CLUSTERED BY](#clustered-by) when determining the sort order for individual segments. Druid also requires that `segmentSortOrder`, if provided, starts with `__time`.<br /><br />When set to `false`, Druid uses the [CLUSTERED BY](#clustered-by) alone to determine the sort order for individual segments, and does not require that `segmentSortOrder` begin with `__time`. Setting this parameter to `false` is an experimental feature; see [Sorting](../ingestion/partitioning.md#sorting) for details. | `true` |
 | `maxParseExceptions` | SELECT, INSERT, REPLACE<br /><br />Maximum number of parse exceptions that are ignored while executing the query before it stops with `TooManyWarningsFault`. To ignore all the parse exceptions, set the value to -1. | 0 |

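As a usage illustration for the context parameters in the diff above, here is a hedged sketch that submits an MSQ query with `maxRowsInMemory` set in the query context, via Druid's SQL task endpoint `/druid/v2/sql/task`. The `localhost:8888` router address, the table names, and the query itself are placeholders, and the wrapper class is not part of the Druid codebase.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class MsqContextSketch
{
  public static void main(String[] args) throws Exception
  {
    // Query context: prefer maxRowsInMemory; rowsInMemory is only an
    // alternate spelling and is ignored when maxRowsInMemory is set.
    String body = "{\n"
        + "  \"query\": \"REPLACE INTO example_table OVERWRITE ALL"
        + " SELECT __time, page FROM source_table PARTITIONED BY DAY\",\n"
        + "  \"context\": {\n"
        + "    \"maxRowsInMemory\": 150000,\n"
        + "    \"finalizeAggregations\": false\n"
        + "  }\n"
        + "}";

    HttpRequest request = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8888/druid/v2/sql/task"))
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(body))
        .build();

    // The response carries the MSQ task ID, which can then be polled for status.
    HttpResponse<String> response = HttpClient.newHttpClient()
        .send(request, HttpResponse.BodyHandlers.ofString());
    System.out.println(response.statusCode() + ": " + response.body());
  }
}
```
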
docs/querying/sql-metadata-tables.md

Lines changed: 2 additions & 0 deletions

@@ -238,6 +238,8 @@ Servers table lists all discovered servers in the cluster.
 |start_time|STRING|Timestamp in ISO8601 format when the server was announced in the cluster|
 |version|VARCHAR|Druid version running on the server|
 |labels|VARCHAR|Labels for the server configured using the property [`druid.labels`](../configuration/index.md)|
+|available_processors|BIGINT|Total number of CPU processors available to the server|
+|total_memory|BIGINT|Total memory in bytes available to the server|

 To retrieve information about all servers, use the query:

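To show the two new `sys.servers` columns in use, here is a small sketch (not taken from the docs) that queries them over Druid's Avatica JDBC endpoint. It assumes the Avatica remote driver (`org.apache.calcite.avatica:avatica-core`) is on the classpath and that a router is reachable at the quickstart address below.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class SysServersSketch
{
  public static void main(String[] args) throws Exception
  {
    String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/";

    try (Connection conn = DriverManager.getConnection(url);
         Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(
             "SELECT server, server_type, available_processors, total_memory FROM sys.servers"
         )) {
      while (rs.next()) {
        // available_processors and total_memory are the newly added columns.
        System.out.printf(
            "%s (%s): %d processors, %d bytes of memory%n",
            rs.getString("server"),
            rs.getString("server_type"),
            rs.getLong("available_processors"),
            rs.getLong("total_memory")
        );
      }
    }
  }
}
```
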
embedded-tests/src/test/java/org/apache/druid/testing/embedded/auth/AbstractAuthConfigurationTest.java

Lines changed: 4 additions & 3 deletions

@@ -1018,9 +1018,8 @@ protected String getBrokerUrl()

   /**
    * curr_size on historicals changes because cluster state is not isolated across
-   * different
-   * integration tests, zero it out for consistent test results
-   * version and start_time are not configurable therefore we zero them as well
+   * different integration tests, zero it out for consistent test results.
+   * version, start_time, available_processors, total_memory are not configurable therefore we zero them as well
    */
   protected static List<Map<String, Object>> getServersWithoutNonConfigurableFields(List<Map<String, Object>> servers)
   {

@@ -1031,6 +1030,8 @@ protected static List<Map<String, Object>> getServersWithoutNonConfigurableField
         newServer.put("curr_size", 0);
         newServer.put("start_time", "0");
         newServer.put("version", "0.0.0");
+        newServer.put("available_processors", 0);
+        newServer.put("total_memory", 0);
         return newServer;
       }
   );

embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/IngestionSmokeTest.java

Lines changed: 5 additions & 12 deletions

@@ -20,13 +20,10 @@
 package org.apache.druid.testing.embedded.indexing;

 import com.google.common.base.Optional;
-import com.google.common.collect.ImmutableList;
 import org.apache.commons.io.IOUtils;
 import org.apache.druid.common.utils.IdUtils;
 import org.apache.druid.data.input.impl.CsvInputFormat;
 import org.apache.druid.data.input.impl.TimestampSpec;
-import org.apache.druid.indexer.TaskState;
-import org.apache.druid.indexer.TaskStatusPlus;
 import org.apache.druid.indexing.common.task.CompactionTask;
 import org.apache.druid.indexing.common.task.IndexTask;
 import org.apache.druid.indexing.common.task.NoopTask;

@@ -40,7 +37,6 @@
 import org.apache.druid.java.util.common.DateTimes;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.StringUtils;
-import org.apache.druid.java.util.common.parsers.CloseableIterator;
 import org.apache.druid.metadata.storage.postgresql.PostgreSQLMetadataStorageModule;
 import org.apache.druid.query.DruidMetrics;
 import org.apache.druid.query.http.SqlTaskStatus;

@@ -243,7 +239,7 @@ public void test_runIndexParallelTask_andCompactData()
         .dynamicPartitionWithMaxRows(5000)
         .withId(compactTaskId);
     cluster.callApi().onLeaderOverlord(o -> o.runTask(compactTaskId, compactionTask));
-    cluster.callApi().waitForTaskToSucceed(taskId, eventCollector.latchableEmitter());
+    cluster.callApi().waitForTaskToSucceed(compactTaskId, eventCollector.latchableEmitter());

     // Verify the compacted data
     final int numCompactedSegments = 5;

@@ -308,13 +304,10 @@ public void test_runKafkaSupervisor()
     Assertions.assertEquals("RUNNING", supervisorStatus.getState());
     Assertions.assertEquals(topic, supervisorStatus.getSource());

-    // Get the task statuses
-    List<TaskStatusPlus> taskStatuses = ImmutableList.copyOf(
-        (CloseableIterator<TaskStatusPlus>)
-            cluster.callApi().onLeaderOverlord(o -> o.taskStatuses(null, dataSource, 1))
-    );
-    Assertions.assertFalse(taskStatuses.isEmpty());
-    Assertions.assertEquals(TaskState.RUNNING, taskStatuses.get(0).getStatusCode());
+    // Confirm tasks are being created and running
+    int runningTasks = cluster.callApi().getTaskCount("running", dataSource);
+    int completedTasks = cluster.callApi().getTaskCount("complete", dataSource);
+    Assertions.assertTrue(runningTasks + completedTasks > 0);

     // Suspend the supervisor and verify the state
     cluster.callApi().onLeaderOverlord(

embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaClusterMetricsTest.java

Lines changed: 2 additions & 2 deletions

@@ -51,7 +51,6 @@
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Timeout;

-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;

@@ -154,7 +153,8 @@ public void test_ingest10kRows_ofSelfClusterMetrics_andVerifyValues()
     // Wait for segments to be handed off
     indexer.latchableEmitter().waitForEventAggregate(
         event -> event.hasMetricName("ingest/handoff/count")
-                      .hasDimension(DruidMetrics.DATASOURCE, List.of(dataSource)),
+                      .hasDimension(DruidMetrics.DATASOURCE, dataSource)
+                      .hasDimension(DruidMetrics.SUPERVISOR_ID, supervisorId),
         agg -> agg.hasSumAtLeast(expectedSegmentsHandedOff)
     );
