Skip to content

Commit 706d493

Browse files
Add Karpenter metric coverage up to v1.8 (#21819)
* Add Karpenter metric coverage up to v1.8
* Fix: Validate metadata.csv
* Fix changelog to match PR number
1 parent 3301852 commit 706d493

File tree

6 files changed

+136
-2
lines changed

6 files changed

+136
-2
lines changed

karpenter/changelog.d/21819.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for Karpenter v1.8 metrics

karpenter/datadog_checks/karpenter/metrics.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
'aws_sdk_go_request_duration_seconds': 'aws.sdk_go.request.duration_seconds',
1515
'aws_sdk_go_request_attempt': 'aws.sdk_go.request_attempt',
1616
'aws_sdk_go_request_attempt_duration_seconds': 'aws.sdk_go.request_attempt.duration_seconds',
17+
'client_go_request': 'client_go_request',
18+
'client_go_request_duration_seconds': 'client_go_request_duration_seconds',
19+
'controller_runtime_conversion_webhook_panics': 'controller_runtime_conversion_webhook_panics',
1720
'certwatcher_read_certificate': 'certwatcher.read.certificate',
1821
'certwatcher_read_certificate_errors': 'certwatcher.read.certificate.errors',
1922
'controller_runtime_active_workers': 'controller.runtime.active_workers',
@@ -101,6 +104,10 @@
101104
'karpenter_nodepool_usage': 'nodepool_usage',
102105
'karpenter_nodes_allocatable': 'nodes.allocatable',
103106
'karpenter_nodes_created': 'nodes.created',
107+
'karpenter_nodes_current_lifetime_seconds': 'nodes_current_lifetime_seconds',
108+
'karpenter_nodes_drained': 'nodes_drained',
109+
'karpenter_nodes_eviction_requests': 'nodes_eviction_requests',
110+
'karpenter_nodes_lifetime_duration_seconds': 'nodes_lifetime_duration_seconds',
104111
'karpenter_nodes_terminated': 'nodes.terminated',
105112
'karpenter_nodes_leases_deleted': 'nodes.leases_deleted',
106113
'karpenter_nodes_system_overhead': 'nodes.system_overhead',
@@ -156,20 +163,28 @@
156163
'operator_ec2nodeclass_status_condition_transitions': 'operator.ec2nodeclass.status_condition.transitions',
157164
'operator_ec2nodeclass_status_condition_current_status_seconds': 'operator.ec2nodeclass.status_condition.current_status.seconds',
158165
'operator_ec2nodeclass_status_condition_count': 'operator.ec2nodeclass.status_condition_count',
166+
'operator_ec2nodeclass_status_condition_transition_seconds': 'operator_ec2nodeclass_status_condition_transition_seconds',
167+
'operator_ec2nodeclass_termination_current_time_seconds': 'operator_ec2nodeclass_termination_current_time_seconds',
168+
'operator_ec2nodeclass_termination_duration_seconds': 'operator_ec2nodeclass_termination_duration_seconds',
159169
'operator_node_event_count': 'operator.node.event_count',
160170
'operator_node_status_condition_transitions': 'operator.node.status_condition.transitions',
161171
'operator_node_status_condition_transition_seconds': 'operator.node.status_condition.transitions.seconds',
162172
'operator_node_status_condition_current_status_seconds': 'operator.node.status_condition.current_status.seconds',
163173
'operator_node_status_condition_count': 'operator.node.status_condition_count',
164174
'operator_node_termination_duration_seconds': 'operator.node.termination.duration_seconds',
175+
'operator_node_termination_current_time_seconds': 'operator_node_termination_current_time_seconds',
165176
'operator_nodeclaim_status_condition_transitions': 'operator.nodeclaim.status_condition.transitions',
166177
'operator_nodeclaim_status_condition_transition_seconds': 'operator.nodeclaim.status_condition.transitions.seconds',
167178
'operator_nodeclaim_status_condition_current_status_seconds': 'operator.nodeclaim.status_condition.current_status.seconds',
168179
'operator_nodeclaim_status_condition_count': 'operator.nodeclaim.status_condition_count',
169180
'operator_nodeclaim_termination_duration_seconds': 'operator.nodeclaim.termination.duration_seconds',
181+
'operator_nodeclaim_termination_current_time_seconds': 'operator_nodeclaim_termination_current_time_seconds',
170182
'operator_nodepool_status_condition_transitions': 'operator.nodepool.status_condition.transitions',
171183
'operator_nodepool_status_condition_current_status_seconds': 'operator.nodepool.status_condition.current_status.seconds',
172184
'operator_nodepool_status_condition_count': 'operator.nodepool.status_condition_count',
185+
'operator_nodepool_status_condition_transition_seconds': 'operator_nodepool_status_condition_transition_seconds',
186+
'operator_nodepool_termination_current_time_seconds': 'operator_nodepool_termination_current_time_seconds',
187+
'operator_nodepool_termination_duration_seconds': 'operator_nodepool_termination_duration_seconds',
173188
}
174189

175190
RENAME_LABELS_MAP = {

karpenter/metadata.csv

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ karpenter.aws.sdk_go.request_attempt.duration_seconds.sum,count,,second,,Sum of
1010
karpenter.build_info,gauge,,,,A metric with a constant '1' value labeled by version from which Karpenter was built.,0,karpenter,,,
1111
karpenter.certwatcher.read.certificate.count,count,,read,,The count of certificate reads,0,karpenter,,,
1212
karpenter.certwatcher.read.certificate.errors.count,count,,error,,The count of certificate read errors,0,karpenter,,,
13+
karpenter.client_go_request.count,count,,request,,The count of client-go requests,0,karpenter,,,
14+
karpenter.client_go_request_duration_seconds.bucket,count,,,,Histogram buckets for client-go request durations,0,karpenter,,,
15+
karpenter.client_go_request_duration_seconds.count,count,,,,Count of client-go request durations,0,karpenter,,,
16+
karpenter.client_go_request_duration_seconds.sum,count,,second,,Sum of client-go request durations,0,karpenter,,,
1317
karpenter.cloudprovider.batcher.batch.time_seconds.bucket,count,,,,The count of observation in the batching window histogram by `upper_bound` buckets,0,karpenter,,,
1418
karpenter.cloudprovider.batcher.batch.time_seconds.count,count,,,,The count of observation in the batching window histogram,0,karpenter,,,
1519
karpenter.cloudprovider.batcher.batch.time_seconds.sum,count,,second,,The sum of the duration of the batching window per batcher,0,karpenter,,,
@@ -38,6 +42,7 @@ karpenter.controller.runtime.reconcile.time_seconds.sum,count,,second,,The sum o
3842
karpenter.controller.runtime.reconcile_errors.count,count,,error,,The count of reconciliation errors per controller,0,karpenter,,,
3943
karpenter.controller.runtime.reconcile_panics.count,count,,,,Total number of reconciliation panics per controller,0,karpenter,,,
4044
karpenter.controller.runtime.terminal.reconcile.errors.count,count,,,,Total number of terminal reconciliation errors per controller,0,karpenter,,,
45+
karpenter.controller_runtime_conversion_webhook_panics.count,count,,,,Total number of conversion webhook panics,0,karpenter,,,
4146
karpenter.deprovisioning.actions_performed.count,count,,execution,,The count of deprovisioning actions performed. Labeled by deprovisioner (Deprecated in v1.0+),0,karpenter,,,
4247
karpenter.deprovisioning.consolidation_timeouts,gauge,,timeout,,Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type (Deprecated in v1.0+),0,karpenter,,,
4348
karpenter.deprovisioning.eligible_machines,gauge,,,,Number of machines eligible for deprovisioning by Karpenter. Labeled by deprovisioner (Deprecated in v1.0+),0,karpenter,,,
@@ -133,6 +138,12 @@ karpenter.nodes.total.daemon_limits,gauge,,,,Total resources specified by Daemon
133138
karpenter.nodes.total.daemon_requests,gauge,,,,Total resources requested by DaemonSet pods,0,karpenter,,,
134139
karpenter.nodes.total.pod_limits,gauge,,,,Total pod resources specified by non-DaemonSet pod limits,0,karpenter,,,
135140
karpenter.nodes.total.pod_requests,gauge,,,,Total pod resources requested by non-DaemonSet pods bound,0,karpenter,,,
141+
karpenter.nodes_current_lifetime_seconds,gauge,,second,,Current lifetime of nodes in seconds,0,karpenter,,,
142+
karpenter.nodes_drained.count,count,,node,,Count of nodes drained,0,karpenter,,,
143+
karpenter.nodes_eviction_requests.count,count,,request,,Count of node eviction requests,0,karpenter,,,
144+
karpenter.nodes_lifetime_duration_seconds.bucket,count,,,,Histogram buckets for node lifetime durations,0,karpenter,,,
145+
karpenter.nodes_lifetime_duration_seconds.count,count,,,,Count of node lifetime durations,0,karpenter,,,
146+
karpenter.nodes_lifetime_duration_seconds.sum,count,,second,,Sum of node lifetime durations,0,karpenter,,,
136147
karpenter.operator.ec2nodeclass.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for ec2nodeclass,0,karpenter,,,
137148
karpenter.operator.ec2nodeclass.status_condition.transitions.count,count,,,,Count of status condition transitions for ec2nodeclass,0,karpenter,,,
138149
karpenter.operator.ec2nodeclass.status_condition_count,gauge,,,,Number of conditions for ec2nodeclass,0,karpenter,,,
@@ -149,6 +160,22 @@ karpenter.operator.nodeclaim.termination.duration_seconds.bucket,count,,,,Histog
149160
karpenter.operator.nodepool.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for nodepool,0,karpenter,,,
150161
karpenter.operator.nodepool.status_condition.transitions.count,count,,,,Count of status condition transitions for nodepool,0,karpenter,,,
151162
karpenter.operator.nodepool.status_condition_count,gauge,,,,Number of conditions for nodepool,0,karpenter,,,
163+
karpenter.operator_ec2nodeclass_status_condition_transition_seconds.bucket,count,,,,Histogram buckets for ec2nodeclass status condition transitions,0,karpenter,,,
164+
karpenter.operator_ec2nodeclass_status_condition_transition_seconds.count,count,,,,Count of ec2nodeclass status condition transitions,0,karpenter,,,
165+
karpenter.operator_ec2nodeclass_status_condition_transition_seconds.sum,count,,,,Sum of ec2nodeclass status condition transition durations,0,karpenter,,,
166+
karpenter.operator_ec2nodeclass_termination_current_time_seconds,gauge,,second,,Current amount of time an ec2nodeclass has been in the terminating state,0,karpenter,,,
167+
karpenter.operator_ec2nodeclass_termination_duration_seconds.bucket,count,,,,Histogram buckets for ec2nodeclass termination durations,0,karpenter,,,
168+
karpenter.operator_ec2nodeclass_termination_duration_seconds.count,count,,,,Count of ec2nodeclass termination durations,0,karpenter,,,
169+
karpenter.operator_ec2nodeclass_termination_duration_seconds.sum,count,,,,Sum of ec2nodeclass termination durations,0,karpenter,,,
170+
karpenter.operator_node_termination_current_time_seconds,gauge,,second,,Current amount of time a node has been in the terminating state,0,karpenter,,,
171+
karpenter.operator_nodeclaim_termination_current_time_seconds,gauge,,second,,Current amount of time a nodeclaim has been in the terminating state,0,karpenter,,,
172+
karpenter.operator_nodepool_status_condition_transition_seconds.bucket,count,,,,Histogram buckets for nodepool status condition transitions,0,karpenter,,,
173+
karpenter.operator_nodepool_status_condition_transition_seconds.count,count,,,,Count of nodepool status condition transitions,0,karpenter,,,
174+
karpenter.operator_nodepool_status_condition_transition_seconds.sum,count,,,,Sum of nodepool status condition transition durations,0,karpenter,,,
175+
karpenter.operator_nodepool_termination_current_time_seconds,gauge,,second,,Current amount of time a nodepool has been in the terminating state,0,karpenter,,,
176+
karpenter.operator_nodepool_termination_duration_seconds.bucket,count,,,,Histogram buckets for nodepool termination durations,0,karpenter,,,
177+
karpenter.operator_nodepool_termination_duration_seconds.count,count,,,,Count of nodepool termination durations,0,karpenter,,,
178+
karpenter.operator_nodepool_termination_duration_seconds.sum,count,,,,Sum of nodepool termination durations,0,karpenter,,,
152179
karpenter.pods.startup.time_seconds.count,count,,,,The count of the observations in the pod startup summary,0,karpenter,,,
153180
karpenter.pods.startup.time_seconds.quantile,gauge,,,,The time taken between pod creation and the pod being in a running state by `quantile`,0,karpenter,,,
154181
karpenter.pods.startup.time_seconds.sum,count,,second,,The sum of the time from pod creation and the pod being in a running state,0,karpenter,,,
@@ -181,4 +208,4 @@ karpenter.workqueue.work.duration_seconds.count,count,,,,The count of observatio
181208
karpenter.workqueue.work.duration_seconds.sum,count,,second,,The sum of the amount of seconds spent processing an item from workqueue takes,0,karpenter,,,
182209
karpenter.workqueue_adds.count,count,,,,The count of adds handled by workqueue,0,karpenter,,,
183210
karpenter.workqueue_depth,gauge,,,,Current depth of workqueue,0,karpenter,,,
184-
karpenter.workqueue_retries.count,count,,attempt,,The count of retries handled by workqueue,0,karpenter,,,
211+
karpenter.workqueue_retries.count,count,,attempt,,The count of retries handled by workqueue,0,karpenter,,,

karpenter/tests/common.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ def get_fixture_path(filename):
2525
'karpenter.build_info',
2626
'karpenter.certwatcher.read.certificate.count',
2727
'karpenter.certwatcher.read.certificate.errors.count',
28+
'karpenter.client_go_request.count',
29+
'karpenter.client_go_request_duration_seconds.bucket',
30+
'karpenter.client_go_request_duration_seconds.count',
31+
'karpenter.client_go_request_duration_seconds.sum',
2832
'karpenter.cloudprovider.batcher.batch.time_seconds.bucket',
2933
'karpenter.cloudprovider.batcher.batch.time_seconds.count',
3034
'karpenter.cloudprovider.batcher.batch.time_seconds.sum',
@@ -46,6 +50,7 @@ def get_fixture_path(filename):
4650
'karpenter.controller.runtime.reconcile.time_seconds.count',
4751
'karpenter.controller.runtime.reconcile.time_seconds.sum',
4852
'karpenter.controller.runtime.reconcile_errors.count',
53+
'karpenter.controller_runtime_conversion_webhook_panics.count',
4954
'karpenter.deprovisioning.actions_performed.count',
5055
'karpenter.deprovisioning.eligible_machines',
5156
'karpenter.deprovisioning.evaluation.duration_seconds.bucket',
@@ -108,8 +113,14 @@ def get_fixture_path(filename):
108113
'karpenter.machines_registered.count',
109114
'karpenter.machines_terminated.count',
110115
'karpenter.nodes.allocatable',
116+
'karpenter.nodes_current_lifetime_seconds',
117+
'karpenter.nodes_drained.count',
111118
'karpenter.nodes.eviction.queue_depth',
119+
'karpenter.nodes_eviction_requests.count',
112120
'karpenter.nodes.leases_deleted.count',
121+
'karpenter.nodes_lifetime_duration_seconds.bucket',
122+
'karpenter.nodes_lifetime_duration_seconds.count',
123+
'karpenter.nodes_lifetime_duration_seconds.sum',
113124
'karpenter.nodes.system_overhead',
114125
'karpenter.nodes.terminated.count',
115126
'karpenter.nodes.termination.time_seconds.count',
@@ -200,19 +211,35 @@ def get_fixture_path(filename):
200211
'karpenter.operator.ec2nodeclass.status_condition.current_status.seconds',
201212
'karpenter.operator.ec2nodeclass.status_condition.transitions.count',
202213
'karpenter.operator.ec2nodeclass.status_condition_count',
214+
'karpenter.operator_ec2nodeclass_status_condition_transition_seconds.bucket',
215+
'karpenter.operator_ec2nodeclass_status_condition_transition_seconds.count',
216+
'karpenter.operator_ec2nodeclass_status_condition_transition_seconds.sum',
217+
'karpenter.operator_ec2nodeclass_termination_current_time_seconds',
218+
'karpenter.operator_ec2nodeclass_termination_duration_seconds.bucket',
219+
'karpenter.operator_ec2nodeclass_termination_duration_seconds.count',
220+
'karpenter.operator_ec2nodeclass_termination_duration_seconds.sum',
203221
'karpenter.operator.node.status_condition.current_status.seconds',
204222
'karpenter.operator.node.status_condition.transitions.count',
205223
'karpenter.operator.node.status_condition.transitions.seconds.bucket',
206224
'karpenter.operator.node.status_condition_count',
207225
'karpenter.operator.node.termination.duration_seconds.bucket',
226+
'karpenter.operator_node_termination_current_time_seconds',
208227
'karpenter.operator.nodeclaim.status_condition.current_status.seconds',
209228
'karpenter.operator.nodeclaim.status_condition.transitions.count',
210229
'karpenter.operator.nodeclaim.status_condition.transitions.seconds.bucket',
211230
'karpenter.operator.nodeclaim.status_condition_count',
212231
'karpenter.operator.nodeclaim.termination.duration_seconds.bucket',
232+
'karpenter.operator_nodeclaim_termination_current_time_seconds',
213233
'karpenter.operator.nodepool.status_condition.current_status.seconds',
214234
'karpenter.operator.nodepool.status_condition.transitions.count',
215235
'karpenter.operator.nodepool.status_condition_count',
236+
'karpenter.operator_nodepool_status_condition_transition_seconds.bucket',
237+
'karpenter.operator_nodepool_status_condition_transition_seconds.count',
238+
'karpenter.operator_nodepool_status_condition_transition_seconds.sum',
239+
'karpenter.operator_nodepool_termination_current_time_seconds',
240+
'karpenter.operator_nodepool_termination_duration_seconds.bucket',
241+
'karpenter.operator_nodepool_termination_duration_seconds.count',
242+
'karpenter.operator_nodepool_termination_duration_seconds.sum',
216243
]
217244
RENAMED_LABELS = [
218245
'go_version:go1.20.6',

karpenter/tests/fixtures/karpenter_metrics.txt

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2576,4 +2576,32 @@ karpenter_cluster_state_synced 1
25762576
karpenter_cluster_state_node_count 265
25772577
# HELP karpenter_disruption_consolidation_timeouts_total Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.
25782578
# TYPE karpenter_disruption_consolidation_timeouts_total counter
2579-
karpenter_disruption_consolidation_timeouts_total{consolidation_type="single"} 5
2579+
karpenter_disruption_consolidation_timeouts_total{consolidation_type="single"} 5
2580+
# HELP client_go_request_total Total number of HTTP requests to the Kubernetes API by HTTP method and response code.
2581+
# TYPE client_go_request_total counter
2582+
client_go_request_total{code="200",host="10.100.0.1:443",method="GET"} 100
2583+
client_go_request_total{code="200",host="10.100.0.1:443",method="PUT"} 50
2584+
# HELP client_go_request_duration_seconds Latency of HTTP requests to the Kubernetes API by HTTP method and response code.
2585+
# TYPE client_go_request_duration_seconds histogram
2586+
client_go_request_duration_seconds_bucket{host="10.100.0.1:443",verb="GET",le="0.005"} 10
2587+
client_go_request_duration_seconds_bucket{host="10.100.0.1:443",verb="GET",le="+Inf"} 100
2588+
client_go_request_duration_seconds_sum{host="10.100.0.1:443",verb="GET"} 5.5
2589+
client_go_request_duration_seconds_count{host="10.100.0.1:443",verb="GET"} 100
2590+
# HELP controller_runtime_conversion_webhook_panics_total Total number of conversion webhook panics
2591+
# TYPE controller_runtime_conversion_webhook_panics_total counter
2592+
controller_runtime_conversion_webhook_panics_total 0
2593+
# HELP karpenter_nodes_current_lifetime_seconds The current age of each node in seconds
2594+
# TYPE karpenter_nodes_current_lifetime_seconds gauge
2595+
karpenter_nodes_current_lifetime_seconds{nodepool="default"} 3600
2596+
# HELP karpenter_nodes_drained_total Total number of nodes drained
2597+
# TYPE karpenter_nodes_drained_total counter
2598+
karpenter_nodes_drained_total{nodepool="default"} 5
2599+
# HELP karpenter_nodes_eviction_requests_total Total number of eviction requests made
2600+
# TYPE karpenter_nodes_eviction_requests_total counter
2601+
karpenter_nodes_eviction_requests_total 10
2602+
# HELP karpenter_nodes_lifetime_duration_seconds The lifetime duration of nodes that have been deleted or replaced
2603+
# TYPE karpenter_nodes_lifetime_duration_seconds histogram
2604+
karpenter_nodes_lifetime_duration_seconds_bucket{nodepool="default",le="3600"} 5
2605+
karpenter_nodes_lifetime_duration_seconds_bucket{nodepool="default",le="+Inf"} 10
2606+
karpenter_nodes_lifetime_duration_seconds_sum{nodepool="default"} 36000
2607+
karpenter_nodes_lifetime_duration_seconds_count{nodepool="default"} 10

0 commit comments

Comments
 (0)