Skip to content

Commit ba27c5c

Browse files
committed
Merge branch 'master' into fix/70147-autoscaler-persisted-cfg
2 parents aaf84c9 + 69505a3 commit ba27c5c

File tree

429 files changed

+14234
-5540
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

429 files changed

+14234
-5540
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
uses: ./.github/workflows/worker.yml
3030
with:
3131
script: .github/scripts/run_unit-tests -Dtest=!QTest,'${{ matrix.pattern }}' -Dmaven.test.failure.ignore=true
32-
artifact_prefix: "unit-test-reports"
32+
artifact_prefix: "unit-test-reports-jdk${{ matrix.jdk }}"
3333
jdk: ${{ matrix.jdk }}
3434
key: "test-jdk${{ matrix.jdk }}-[${{ matrix.pattern }}]"
3535

@@ -43,17 +43,21 @@ jobs:
4343
name: "test-report"
4444
needs: run-unit-tests
4545
runs-on: ubuntu-latest
46+
strategy:
47+
fail-fast: false
48+
matrix:
49+
jdk: [ "17", "21" ]
4650
steps:
4751
- name: Download reports for all unit test jobs
4852
uses: actions/download-artifact@v4
4953
with:
50-
pattern: "unit-test-reports-*"
54+
pattern: "unit-test-reports-jdk${{ matrix.jdk }}*"
5155
path: target/surefire-reports
5256

5357
- name: Publish Test Report
5458
uses: mikepenz/action-junit-report@v5
5559
with:
56-
check_name: "Unit Test Report"
60+
check_name: "Unit Test Report (JDK ${{ matrix.jdk }})"
5761
report_paths: '**/target/surefire-reports/TEST-*.xml'
5862
detailed_summary: true
5963
flaky_summary: true

.github/workflows/cron-job-its.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ jobs:
8888
strategy:
8989
fail-fast: false
9090
matrix:
91-
testing_group: [ query, query-retry, query-error, security ]
91+
testing_group: [ query, security ]
9292
uses: ./.github/workflows/reusable-standard-its.yml
9393
needs: build
9494
with:

.github/workflows/standard-its.yml

Lines changed: 2 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ jobs:
7878
strategy:
7979
fail-fast: false
8080
matrix:
81-
testing_group: [query, query-retry, query-error, security, centralized-datasource-schema]
81+
testing_group: [query, security, centralized-datasource-schema]
8282
uses: ./.github/workflows/reusable-standard-its.yml
8383
if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
8484
with:
@@ -112,62 +112,4 @@ jobs:
112112
testing_groups: -Dgroups=custom-coordinator-duties
113113
use_indexer: middleManager
114114
override_config_path: ./environment-configs/test-groups/custom-coordinator-duties
115-
group: custom coordinator duties
116-
117-
integration-k8s-leadership-tests:
118-
needs: changes
119-
name: (Compile=openjdk17, Run=openjdk17, Cluster Build On K8s) ITNestedQueryPushDownTest integration test
120-
if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
121-
runs-on: ubuntu-22.04
122-
env:
123-
MVN: mvn --no-snapshot-updates
124-
MAVEN_SKIP: -P skip-static-checks -Dweb.console.skip=true -Dmaven.javadoc.skip=true
125-
CONFIG_FILE: k8s_run_config_file.json
126-
IT_TEST: -Dit.test=ITNestedQueryPushDownTest
127-
POD_NAME: int-test
128-
POD_NAMESPACE: default
129-
BUILD_DRUID_CLUSTER: true
130-
steps:
131-
- name: Checkout branch
132-
uses: actions/checkout@v4
133-
134-
- name: setup java
135-
uses: actions/setup-java@v4
136-
with:
137-
java-version: '17'
138-
distribution: 'zulu'
139-
140-
# the build step produces SNAPSHOT artifacts into the local maven repository,
141-
# we include github.sha in the cache key to make it specific to that build/jdk
142-
- name: Restore Maven repository
143-
id: maven-restore
144-
uses: actions/cache/restore@v4
145-
with:
146-
path: ~/.m2/repository
147-
key: maven-${{ runner.os }}-17-${{ github.sha }}
148-
restore-keys: setup-java-Linux-maven-${{ hashFiles('**/pom.xml') }}
149-
150-
- name: Maven build
151-
if: steps.maven-restore.outputs.cache-hit != 'true'
152-
run: |
153-
./it.sh ci
154-
155-
- name: Run IT
156-
id: test
157-
timeout-minutes: 90
158-
run: |
159-
set -x
160-
mvn -B -ff install -pl '!web-console' -Pdist,bundle-contrib-exts -Pskip-static-checks,skip-tests -Dmaven.javadoc.skip=true -T1C
161-
# Note: The above command relies on the correct version of the JARs being installed in the local m2 repository.
162-
# For any changes, please rebuild it using the command from the previous step (./it.sh ci).
163-
164-
MAVEN_OPTS='-Xmx2048m' ${MVN} verify -pl integration-tests -P int-tests-config-file ${IT_TEST} ${MAVEN_SKIP} -Dpod.name=${POD_NAME} -Dpod.namespace=${POD_NAMESPACE} -Dbuild.druid.cluster=${BUILD_DRUID_CLUSTER}
165-
166-
- name: Debug on failure
167-
if: ${{ failure() && steps.test.conclusion == 'failure' }}
168-
run: |
169-
for v in broker middlemanager router coordinator historical ; do
170-
echo "------------------------druid-tiny-cluster-"$v"s-0-------------------------";
171-
/usr/local/bin/kubectl logs --tail 1000 druid-tiny-cluster-"$v"s-0 ||:;
172-
/usr/local/bin/kubectl get events | grep druid-tiny-cluster-"$v"s-0 ||:;
173-
done
115+
group: custom coordinator duties

.github/workflows/static-checks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ jobs:
197197
198198
- name: Tar druid logs
199199
if: ${{ failure() && steps.web-console-test.conclusion == 'failure' }}
200-
run: tar cvzf ./druid-logs.tgz -C ./distribution/target/apache-druid-*-SNAPSHOT/ log
200+
run: tar cvzf ./druid-logs.tgz -C ./distribution/target/apache-druid-*-SNAPSHOT/ log -C ./var druid/indexing-logs
201201

202202
- name: Upload druid logs to GitHub
203203
if: ${{ failure() && steps.web-console-test.conclusion == 'failure' }}

benchmarks/pom.xml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,6 @@
184184
<dependency>
185185
<groupId>com.github.seancfoley</groupId>
186186
<artifactId>ipaddress</artifactId>
187-
<version>5.3.4</version>
188187
</dependency>
189188
<dependency>
190189
<groupId>junit</groupId>

cloud/aws-common/pom.xml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
<dependency>
5050
<groupId>org.checkerframework</groupId>
5151
<artifactId>checker-qual</artifactId>
52-
<version>${checkerframework.version}</version>
5352
</dependency>
5453
<dependency>
5554
<groupId>com.google.code.findbugs</groupId>

distribution/bin/check-licenses.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ def build_compatible_license_names():
288288
compatible_licenses['Eclipse Public License, Version 2.0'] = 'Eclipse Public License 2.0'
289289
compatible_licenses['Eclipse Public License v2.0'] = 'Eclipse Public License 2.0'
290290
compatible_licenses['EPL 2.0'] = 'Eclipse Public License 2.0'
291+
compatible_licenses['EPL-2.0'] = 'Eclipse Public License 2.0'
291292

292293
compatible_licenses['Eclipse Distribution License 1.0'] = 'Eclipse Distribution License 1.0'
293294
compatible_licenses['Eclipse Distribution License - v 1.0'] = 'Eclipse Distribution License 1.0'

distribution/docker/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ volumes:
3131
services:
3232
postgres:
3333
container_name: postgres
34-
image: postgres:latest
34+
image: postgres:17.6
3535
ports:
3636
- "5432:5432"
3737
volumes:

docs/configuration/index.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,7 @@ These Overlord static configurations can be defined in the `overlord/runtime.pro
992992
|Property|Description|Default|
993993
|--------|-----------|-------|
994994
|`druid.indexer.runner.type`|Indicates whether tasks should be run locally using `local` or in a distributed environment using `remote`. The recommended option is `httpRemote`, which is similar to `remote` but uses HTTP to interact with Middle Managers instead of ZooKeeper.|`httpRemote`|
995+
|`druid.indexer.server.maxConcurrentActions`|Maximum number of concurrent action requests (such as getting locks, creating segments, fetching segments etc) that the Overlord will process simultaneously. This prevents thread exhaustion while preserving access to health check endpoints. Set to `0` to disable quality of service filtering entirely. If not specified, defaults to `max(1, max(serverHttpNumThreads - 4, serverHttpNumThreads * 0.8))`.|`max(1, max(serverHttpNumThreads - 4, serverHttpNumThreads * 0.8))`|
995996
|`druid.indexer.storage.type`|Indicates whether incoming tasks should be stored locally (in heap) or in metadata storage. One of `local` or `metadata`. `local` is mainly for internal testing while `metadata` is recommended in production because storing incoming tasks in metadata storage allows for tasks to be resumed if the Overlord should fail.|`local`|
996997
|`druid.indexer.storage.recentlyFinishedThreshold`|Duration of time to store task results. Default is 24 hours. If you have hundreds of tasks running in a day, consider increasing this threshold.|`PT24H`|
997998
|`druid.indexer.tasklock.forceTimeChunkLock`|**Setting this to false is still experimental**<br/> If set, all tasks are enforced to use time chunk lock. If not set, each task automatically chooses a lock type to use. This configuration can be overwritten by setting `forceTimeChunkLock` in the [task context](../ingestion/tasks.md#context-parameters). See [Task lock system](../ingestion/tasks.md#task-lock-system) for more details about locking in tasks.|true|
@@ -1011,7 +1012,7 @@ The following configs only apply if the Overlord is running in remote mode. For
10111012
|--------|-----------|-------|
10121013
|`druid.indexer.runner.taskAssignmentTimeout`|How long to wait after a task has been assigned to a Middle Manager before throwing an error.|`PT5M`|
10131014
|`druid.indexer.runner.minWorkerVersion`|The minimum Middle Manager version to send tasks to. The version number is a string. This affects the expected behavior during certain operations like comparison against `druid.worker.version`. Specifically, the version comparison follows dictionary order. Use ISO8601 date format for the version to accommodate date comparisons. |"0"|
1014-
| `druid.indexer.runner.parallelIndexTaskSlotRatio`| The ratio of task slots available for parallel indexing supervisor tasks per worker. The specified value must be in the range `[0, 1]`. |1|
1015+
|`druid.indexer.runner.parallelIndexTaskSlotRatio`| The ratio of task slots available for parallel indexing supervisor tasks per worker. The specified value must be in the range `[0, 1]`. |1|
10151016
|`druid.indexer.runner.compressZnodes`|Indicates whether or not the Overlord should expect Middle Managers to compress Znodes.|true|
10161017
|`druid.indexer.runner.maxZnodeBytes`|The maximum size Znode in bytes that can be created in ZooKeeper, should be in the range of `[10KiB, 2GiB)`. [Human-readable format](human-readable-byte.md) is supported.| 512 KiB |
10171018
|`druid.indexer.runner.taskCleanupTimeout`|How long to wait before failing a task after a Middle Manager is disconnected from ZooKeeper.|`PT15M`|
@@ -1956,13 +1957,14 @@ The following table lists available monitors and the respective services where t
19561957
|`org.apache.druid.java.util.metrics.JvmCpuMonitor`|Reports statistics of CPU consumption by the JVM.|Any|
19571958
|`org.apache.druid.java.util.metrics.CpuAcctDeltaMonitor`|Reports consumed CPU as per the cpuacct cgroup.|Any|
19581959
|`org.apache.druid.java.util.metrics.JvmThreadsMonitor`|Reports Thread statistics in the JVM, like numbers of total, daemon, started, died threads.|Any|
1959-
|`org.apache.druid.java.util.metrics.CgroupCpuMonitor`|Reports CPU shares and quotas as per the `cpu` cgroup.|Any|
1960-
|`org.apache.druid.java.util.metrics.CgroupCpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup.|Any|
1961-
|`org.apache.druid.java.util.metrics.CgroupDiskMonitor`|Reports disk statistic as per the blkio cgroup.|Any|
1962-
|`org.apache.druid.java.util.metrics.CgroupMemoryMonitor`|Reports memory statistic as per the memory cgroup.|Any|
1963-
|`org.apache.druid.java.util.metrics.CgroupV2CpuMonitor`| **EXPERIMENTAL** Reports CPU usage from `cpu.stat` file. Only applicable to `cgroupv2`.|Any|
1964-
|`org.apache.druid.java.util.metrics.CgroupV2DiskMonitor`| **EXPERIMENTAL** Reports disk usage from `io.stat` file. Only applicable to `cgroupv2`.|Any|
1965-
|`org.apache.druid.java.util.metrics.CgroupV2MemoryMonitor`| **EXPERIMENTAL** Reports memory usage from `memory.current` and `memory.max` files. Only applicable to `cgroupv2`.|Any|
1960+
|`org.apache.druid.java.util.metrics.CgroupCpuMonitor`|Reports CPU shares and quotas as per the `cpu` cgroup. Automatically switches to `CgroupV2CpuMonitor` in case `cgroupv2` type is detected.|Any|
1961+
|`org.apache.druid.java.util.metrics.CgroupCpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup. Automatically switches to `CgroupV2CpuSetMonitor` in case `cgroupv2` type is detected.|Any|
1962+
|`org.apache.druid.java.util.metrics.CgroupDiskMonitor`|Reports disk statistic as per the blkio cgroup. Automatically switches to `CgroupV2DiskMonitor` in case `cgroupv2` type is detected.|Any|
1963+
|`org.apache.druid.java.util.metrics.CgroupMemoryMonitor`|Reports memory statistic as per the memory cgroup. Automatically switches to `CgroupV2MemoryMonitor` in case `cgroupv2` type is detected.|Any|
1964+
|`org.apache.druid.java.util.metrics.CgroupV2CpuMonitor`| Reports CPU usage from `cpu.stat` file. Only applicable to `cgroupv2`.|Any|
1965+
|`org.apache.druid.java.util.metrics.CgroupV2CpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup. Only applicable to `cgroupv2`.|Any|
1966+
|`org.apache.druid.java.util.metrics.CgroupV2DiskMonitor`| Reports disk usage from `io.stat` file. Only applicable to `cgroupv2`.|Any|
1967+
|`org.apache.druid.java.util.metrics.CgroupV2MemoryMonitor`| Reports memory usage from `memory.current` and `memory.max` files. Only applicable to `cgroupv2`.|Any|
19661968
|`org.apache.druid.server.metrics.HistoricalMetricsMonitor`|Reports statistics on Historical services.|Historical|
19671969
|`org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical services. Not to be used when lazy loading is configured.|Historical|
19681970
|`org.apache.druid.server.metrics.QueryCountStatsMonitor`|Reports how many queries have been successful/failed/interrupted.|Broker, Historical, Router, Indexer, Peon|

docs/development/extensions-core/k8s-jobs.md

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,31 @@ Consider this an [EXPERIMENTAL](../experimental.md) feature mostly because it ha
3333

3434
The K8s extension builds a pod spec for each task using the specified pod adapter. All jobs are natively restorable, they are decoupled from the Druid deployment, thus restarting pods or doing upgrades has no effect on tasks in flight. They will continue to run and when the overlord comes back up it will start tracking them again.
3535

36+
## Kubernetes Client Mode
37+
38+
### "Direct" K8s API Interaction per task *(Default)*
39+
40+
Task lifecycle code in Druid talks directly to the Kubernetes API server for all operations that require interaction with the Kubernetes cluster.
41+
42+
### `SharedInformer` "Caching" *(Experimental)*
43+
44+
Enabled by setting `druid.indexer.runner.useK8sSharedInformers=true`, this mode uses `Fabric8` `SharedInformer` objects for monitoring state changes in the remote K8s cluster, reducing the number of direct API calls to the Kubernetes API server. This can greatly reduce load on the API server, especially in environments with a high volume of tasks.
45+
46+
This mode is experimental and should be used with caution in production until it has been vetted more thoroughly by the community.
47+
48+
The core idea is to use two `SharedInformers`, one for jobs and one for pods, to watch for changes in the remote K8s cluster. These informers maintain a local cache of jobs and pods that tasks can query. The informers can also notify listeners when changes occur, allowing tasks to react to state changes without polling the API server or creating per-task watches on the K8s cluster.
49+
50+
#### Architecture: Direct vs. Caching Mode
51+
52+
**Key Differences:**
53+
54+
- `DirectKubernetesPeonClient` (Default): Every read operation makes a direct HTTP call to the K8s API server. With 100 concurrent tasks, this results in 100+ active API connections with continuous polling.
55+
56+
- `CachingKubernetesPeonClient` (Experimental): All read operations query an in-memory cache maintained by `SharedInformers`. With 100 concurrent tasks, only 2 persistent watch connections are used (one for Jobs, one for Pods), achieving a large reduction in API calls.
57+
58+
**Shared Operations**:
59+
60+
Both implementations share the same write (job creation, deletion) and log read operations code, which always use direct API calls.
3661

3762
## Configuration
3863

@@ -48,9 +73,9 @@ Other configurations required are:
4873
Druid operators can dynamically tune certain features within this extension. You don't need to restart the Overlord
4974
service for these changes to take effect.
5075

51-
Druid can dynamically tune [pod template selection](#pod-template-selection), which allows you to configure the pod
52-
template based on the task to be run. To enable dynamic pod template selection, first configure the
53-
[custom template pod adapter](#custom-template-pod-adapter).
76+
Druid can dynamically tune [pod template selection](#pod-template-selection) and [capacity](#properties). Where capacity refers to `druid.indexer.runner.capacity`.
77+
78+
Pod template selection allows you to configure the pod template based on the task to be run. To enable dynamic pod template selection, first configure the [custom template pod adapter](#custom-template-pod-adapter).
5479

5580
Use the following APIs to view and update the dynamic configuration for the Kubernetes task runner.
5681

@@ -126,7 +151,8 @@ Host: http://ROUTER_IP:ROUTER_PORT
126151
"type": ["index_kafka"]
127152
}
128153
]
129-
}
154+
},
155+
"capacity": 12
130156
}
131157
```
132158
</details>
@@ -135,6 +161,8 @@ Host: http://ROUTER_IP:ROUTER_PORT
135161

136162
Updates the dynamic configuration for the Kubernetes Task Runner
137163

164+
Note: Both `podTemplateSelectStrategy` and `capacity` are optional fields. A POST request may include either, both, or neither.
165+
138166
##### URL
139167

140168
`POST` `/druid/indexer/v1/k8s/taskrunner/executionconfig`
@@ -193,7 +221,8 @@ curl "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/k8s/taskrunner/executionconf
193221
"type": ["index_kafka"]
194222
}
195223
]
196-
}
224+
},
225+
"capacity": 6
197226
}'
198227
```
199228

@@ -225,7 +254,8 @@ Content-Type: application/json
225254
"type": ["index_kafka"]
226255
}
227256
]
228-
}
257+
},
258+
"capacity": 6
229259
}
230260
```
231261

@@ -309,7 +339,7 @@ Host: http://ROUTER_IP:ROUTER_PORT
309339
"comment": "",
310340
"ip": "127.0.0.1"
311341
},
312-
"payload": "{\"type\": \"default\",\"podTemplateSelectStrategy\":{\"type\": \"taskType\"}",
342+
"payload": "{\"type\": \"default\",\"podTemplateSelectStrategy\":{\"type\": \"taskType\"},\"capacity\":6",
313343
"auditTime": "2024-06-13T20:59:51.622Z"
314344
}
315345
]
@@ -790,10 +820,11 @@ Should you require the needed permissions for interacting across Kubernetes name
790820
| `druid.indexer.runner.annotations` | `JsonObject` | Additional annotations you want to add to peon pod. | `{}` | No |
791821
| `druid.indexer.runner.peonMonitors` | `JsonArray` | Overrides `druid.monitoring.monitors`. Use this property if you don't want to inherit monitors from the Overlord. | `[]` | No |
792822
| `druid.indexer.runner.graceTerminationPeriodSeconds` | `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. | `PT30S` (K8s default) | No |
793-
| `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. | `2147483647` | No |
823+
| `druid.indexer.runner.capacity` | `Integer` | Number of concurrent jobs that can be sent to Kubernetes. Value will be overridden if a dynamic config value has been set. | `2147483647` | No |
794824
| `druid.indexer.runner.cpuCoreInMicro` | `Integer` | Number of CPU micro core for the task. | `1000` | No |
795825
| `druid.indexer.runner.logSaveTimeout` | `Duration` | The peon executing the ingestion task makes a best effort to persist the pod logs from `k8s` to persistent task log storage. The timeout ensures that `k8s` connection issues do not cause the pod to hang indefinitely thereby blocking Overlord operations. If the timeout occurs before the logs are saved, those logs will not be available in Druid. | `PT300S` | NO |
796-
826+
| `druid.indexer.runner.useK8sSharedInformers` | `boolean` | Whether to use shared informers to watch for pod/job changes. This is more efficient on the Kubernetes API server, but may use more memory in the Overlord. | `false` | No |
827+
| `druid.indexer.runner.k8sSharedInformerResyncPeriod` | `Duration` | When using shared informers, controls how frequently the informers resync with the Kubernetes API server. This prevents change events from being missed, keeping the informer cache clean and accurate. | `PT300S` | No |
797828

798829
### Metrics added
799830

0 commit comments

Comments (0)