Description
Using the kube-prometheus-stack Helm chart, version 72.9.0.
What happened:
csi_node_labels stopped producing data when I upgraded Prometheus.
The type: Info metrics are no longer valid; I tried a Gauge instead, to no avail.
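For reference, the definition that worked before the upgrade looked roughly like this (a reconstruction, not the exact original; the field paths are assumed to match the Gauge version in the reproduction below):

# Sketch of the earlier Info-based metric for CSINode (reconstructed, assumptions noted above)
- name: labels
  help: "CSINode basic information"
  each:
    type: Info
    info:
      labelsFromPath:
        node: [metadata, name]
        uid: [metadata, uid]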
What you expected to happen:
I'd expect the scrapes to produce the csi_node_* metrics again.
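Concretely, something like the following should show up on the kube-state-metrics /metrics endpoint again (label values are placeholders, and the value 1 assumes the old Info-style behaviour):

csi_node_labels{node="<node-name>",uid="<csinode-uid>"} 1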
How to reproduce it (as minimally and precisely as possible):
kube-prometheus-stack:
  kube-state-metrics:
    enabled: true
    fullnameOverride: prometheus-kube-state-metrics
    replicas: 1
    metricLabelsAllowlist: '*=[team,label_team]'
    # Ensure all label metrics are exposed (Prometheus v3 compatibility)
    metricAllowlist: []
    metricDenylist: []
    collectors:
      - certificatesigningrequests
      - configmaps
      - cronjobs
      - daemonsets
      - deployments
      - endpoints
      - horizontalpodautoscalers
      - ingresses
      - jobs
      - leases
      - limitranges
      - mutatingwebhookconfigurations
      - namespaces
      - networkpolicies
      - nodes
      - persistentvolumeclaims
      - persistentvolumes
      - poddisruptionbudgets
      - pods
      - replicasets
      - replicationcontrollers
      - resourcequotas
      - secrets
      - services
      - statefulsets
      - storageclasses
      - validatingwebhookconfigurations
      - volumeattachments
    podAnnotations:
      cluster-autoscaler.kubernetes.io/safe-to-evict: 'true'
    # Use Recreate strategy to ensure only one instance runs at a time
    # This prevents duplicate metrics during pod restarts - critical for Prometheus v3
    strategy:
      type: Recreate
    # Enable ServiceMonitor for Prometheus scraping
    prometheus:
      monitor:
        enabled: true
        additionalLabels: {}
        namespace: ""
        namespaceSelector: {}
        # Scrape interval, if not set use Prometheus default
        interval: "30s"
        # Scrape timeout, if not set use Prometheus default
        scrapeTimeout: "10s"
        # Metric relabelings to apply to samples before ingestion
        metricRelabelings: []
        # Relabelings to apply to samples before scraping
        relabelings: []
        # HonorLabels chooses the metric's labels on collisions with target labels
        honorLabels: true
        # HonorTimestamps controls whether Prometheus respects the timestamps present in scraped data
        honorTimestamps: true
    # Enable custom resource state metrics
    customResourceState:
      enabled: true
      config:
        kind: CustomResourceStateMetrics
        spec:
          resources:
            - groupVersionKind:
                group: storage.k8s.io
                version: v1
                kind: CSINode
              metricNamePrefix: csi_node
              metrics:
                - name: labels
                  help: "CSINode basic information"
                  each:
                    type: Gauge
                    gauge:
                      labelsFromPath:
                        node: [metadata, name]
                        uid: [metadata, uid]
                      valueFrom: [metadata, name]
                      nilIsZero: false
                - name: driver_info
                  help: "CSI driver information per node"
                  each:
                    type: Gauge
                    gauge:
                      path: [spec, drivers]
                      labelsFromPath:
                        node: [metadata, name]
                        driver_name: [name]
                        node_id: [nodeID]
                      valueFrom: [name]
                      nilIsZero: false
                - name: driver_allocatable
                  help: "CSI driver allocatable storage count"
                  each:
                    type: Gauge
                    gauge:
                      path: [spec, drivers]
                      labelsFromPath:
                        node: [metadata, name]
                        driver_name: [name]
                      valueFrom: [allocatable, count]
                      nilIsZero: true
            - groupVersionKind:
                group: external-secrets.io
                version: v1beta1
                kind: ExternalSecret
              metricNamePrefix: externalsecret
              metrics:
                - name: labels
                  help: "External Secret labels"
                  each:
                    type: Gauge
                    gauge:
                      labelsFromPath:
                        name: [metadata, name]
                        exported_namespace: [metadata, namespace]
                        team: [metadata, labels, team]
                      valueFrom: [metadata, name]
                      nilIsZero: true
            - groupVersionKind:
                group: cert-manager.io
                version: v1
                kind: ClusterIssuer
              labelsFromPath:
                name: [metadata, name]
              metricNamePrefix: clusterissuer
              metrics:
                - name: status
                  help: "ClusterIssuer Status"
                  each:
                    type: Gauge
                    gauge:
                      path: [status, conditions]
                      labelsFromPath:
                        type: ["type"]
                      valueFrom: ["status"]
              commonLabels:
                team: "platform"
            - groupVersionKind:
                group: sparkoperator.k8s.io
                version: v1beta2
                kind: SparkApplication
              metricNamePrefix: spark_application
              labelsFromPath:
                application_name: [metadata, name]
                namespace: [metadata, namespace]
                state: [status, applicationState, state]
                submission_id: [status, submissionID]
                app_id: [status, sparkApplicationId]
                last_submission_attempt_time: [status, applicationState, lastSubmissionAttemptTime]
                termination_time: [status, applicationState, terminationTime]
              metrics:
                - name: labels
                  help: "SparkApplication info including applicationState"
                  each:
                    type: Gauge
                    gauge:
                      labelsFromPath:
                        application_name: [metadata, name]
                        namespace: [metadata, namespace]
                        state: [status, applicationState, state]
                        submission_id: [status, submissionID]
                        app_id: [status, sparkApplicationId]
                        submission_attempts: [status, submissionAttempts]
                        last_submission_attempt_time: [status, applicationState, lastSubmissionAttemptTime]
                        termination_time: [status, applicationState, terminationTime]
                      valueFrom: [metadata, name]
                      nilIsZero: true
                - name: submit_count
                  help: "Number of submission attempts for the SparkApplication"
                  each:
                    type: Gauge
                    gauge:
                      valueFrom: [status, submissionAttempts]
                - name: executor_state
                  help: "Count of executors in each state"
                  each:
                    type: Gauge
                    gauge:
                      valueFrom: [status, executorState]
                      labelKeysFromPath: [status, executorState]
            - groupVersionKind:
                group: sparkoperator.k8s.io
                version: v1beta2
                kind: ScheduledSparkApplication
              metricNamePrefix: scheduled_spark_application
              labelsFromPath:
                application_name: [metadata, name]
                namespace: [metadata, namespace]
                schedule: [spec, schedule]
                concurrency_policy: [spec, concurrencyPolicy]
                schedule_state: [status, scheduleState]
                last_run: [status, lastRun]
                last_run_name: [status, lastRunName]
                next_run: [status, nextRun]
              metrics:
                - name: labels
                  help: "ScheduledSparkApplication info including schedule and status"
                  each:
                    type: Gauge
                    gauge:
                      labelsFromPath:
                        application_name: [metadata, name]
                        namespace: [metadata, namespace]
                        schedule: [spec, schedule]
                        concurrency_policy: [spec, concurrencyPolicy]
                        schedule_state: [status, scheduleState]
                        last_run: [status, lastRun]
                        last_run_name: [status, lastRunName]
                        next_run: [status, nextRun]
                      valueFrom: [metadata, name]
                      nilIsZero: true
                - name: schedule_state
                  help: "ScheduledSparkApplication schedule state (1 for Scheduled, 0 for others)"
                  each:
                    type: Gauge
                    gauge:
                      path: [status, scheduleState]
                      valueFrom: [status, scheduleState]
                      nilIsZero: true
                - name: last_run_timestamp
                  help: "Last run timestamp as Unix timestamp"
                  each:
                    type: Gauge
                    gauge:
                      path: [status, lastRun]
                      valueFrom: [status, lastRun]
                      nilIsZero: true
                - name: next_run_timestamp
                  help: "Next run timestamp as Unix timestamp"
                  each:
                    type: Gauge
                    gauge:
                      path: [status, nextRun]
                      valueFrom: [status, nextRun]
                      nilIsZero: true
    rbac:
      extraRules:
        - apiGroups: ["apiextensions.k8s.io"]
          resources: ["customresourcedefinitions"]
          verbs: ["list", "watch"]
        - apiGroups: ['external-secrets.io']
          resources: ['externalsecrets']
          verbs: ['list', 'watch']
        - apiGroups: ["storage.k8s.io"]
          resources: ["csinodes"]
          verbs: ["list", "watch"]
        - apiGroups: ["cert-manager.io"]
          resources: ["clusterissuers"]
          verbs: ["list", "watch"]
        - apiGroups: ["sparkoperator.k8s.io"]
          resources: ["sparkapplications"]
          verbs: ["list", "watch"]
        - apiGroups: ["sparkoperator.k8s.io"]
          resources: ["scheduledsparkapplications"]
          verbs: ["list", "watch"]
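To take the chart templating out of the equation, the CSINode block can also be fed to kube-state-metrics directly via --custom-resource-state-config-file. This is only a sketch for isolating the problem; the file name is arbitrary and the metric definition simply mirrors the Gauge version above:

# csinode-only.yaml (arbitrary file name): minimal custom resource state config with only the CSINode resource
kind: CustomResourceStateMetrics
spec:
  resources:
    - groupVersionKind:
        group: storage.k8s.io
        version: v1
        kind: CSINode
      metricNamePrefix: csi_node
      metrics:
        - name: labels
          help: "CSINode basic information"
          each:
            type: Gauge
            gauge:
              labelsFromPath:
                node: [metadata, name]
                uid: [metadata, uid]
              valueFrom: [metadata, name]
              nilIsZero: false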
Anything else we need to know?:
When kube-state-metrics starts, I see the log below. Note that the CSINode resource never shows up: there is no "Using custom resource plural" line for storage.k8s.io_v1_CSINode and no csi_node_* family among the "Custom resource state added metrics" lines:
I0605 13:34:16.710479 1 wrapper.go:120] "Starting kube-state-metrics"
W0605 13:34:16.711857 1 client_config.go:667] Neither --kubeconfig nor --master was specified. Using the inClusterConfig. This might not work.
I0605 13:34:16.712067 1 server.go:200] "Used resources" resources=["poddisruptionbudgets","validatingwebhookconfigurations","volumeattachments","cronjobs","leases","limitranges","mutatingwebhookconfigurations","persistentvolumes","replicationcontrollers","storageclasses","configmaps","ingresses","nodes","horizontalpodautoscalers","jobs","namespaces","networkpolicies","persistentvolumeclaims","endpoints","resourcequotas","services","daemonsets","deployments","pods","replicasets","secrets","certificatesigningrequests","statefulsets"]
I0605 13:34:16.712109 1 types.go:195] "Using all namespaces"
I0605 13:34:16.712130 1 server.go:233] "Metric allow-denylisting" allowDenyStatus="Excluding the following lists that were on denylist: "
W0605 13:34:16.712146 1 client_config.go:667] Neither --kubeconfig nor --master was specified. Using the inClusterConfig. This might not work.
I0605 13:34:16.712565 1 utils.go:70] "Tested communication with server"
I0605 13:34:16.718716 1 utils.go:75] "Run with Kubernetes cluster version" major="1" minor="31" gitVersion="v1.31.7-gke.1265000" gitTreeState="clean" gitCommit="d83cb9ce90bb38abf245cdfd7145e5a7a10e6006" platform="linux/amd64"
I0605 13:34:16.718738 1 utils.go:76] "Communication with server successful"
I0605 13:34:16.721262 1 server.go:361] "Started metrics server" metricsServerAddress="[::]:8080"
I0605 13:34:16.721266 1 metrics_handler.go:110] "Autosharding disabled"
I0605 13:34:16.721291 1 server.go:350] "Started kube-state-metrics self metrics server" telemetryAddress="[::]:8081"
I0605 13:34:16.721490 1 tls_config.go:347] "Listening on" address="[::]:8080"
I0605 13:34:16.721519 1 tls_config.go:350] "TLS is disabled." http2=false address="[::]:8080"
I0605 13:34:16.721594 1 tls_config.go:347] "Listening on" address="[::]:8081"
I0605 13:34:16.721636 1 tls_config.go:350] "TLS is disabled." http2=false address="[::]:8081"
I0605 13:34:16.729139 1 builder.go:283] "Active resources" activeStoreNames="certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments"
I0605 13:34:19.721854 1 config.go:85] "Using custom resource plural" resource="external-secrets.io_v1beta1_ExternalSecret" plural="externalsecrets"
I0605 13:34:19.721902 1 config.go:85] "Using custom resource plural" resource="cert-manager.io_v1_ClusterIssuer" plural="clusterissuers"
I0605 13:34:19.721947 1 config.go:85] "Using custom resource plural" resource="sparkoperator.k8s.io_v1beta2_SparkApplication" plural="sparkapplications"
I0605 13:34:19.721997 1 config.go:85] "Using custom resource plural" resource="sparkoperator.k8s.io_v1beta2_ScheduledSparkApplication" plural="scheduledsparkapplications"
I0605 13:34:19.722298 1 custom_resource_metrics.go:79] "Custom resource state added metrics" familyNames=["clusterissuer_status"]
I0605 13:34:19.722504 1 custom_resource_metrics.go:79] "Custom resource state added metrics" familyNames=["externalsecret_labels"]
I0605 13:34:19.722983 1 custom_resource_metrics.go:79] "Custom resource state added metrics" familyNames=["scheduled_spark_application_labels","scheduled_spark_application_schedule_state","scheduled_spark_application_last_run_timestamp","scheduled_spark_application_next_run_timestamp"]
I0605 13:34:19.723082 1 custom_resource_metrics.go:79] "Custom resource state added metrics" familyNames=["spark_application_labels","spark_application_submit_count","spark_application_executor_state"]
I0605 13:34:19.723210 1 builder.go:283] "Active resources" activeStoreNames="cert-manager.io/v1, Resource=clusterissuers,certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,external-secrets.io/v1beta1, Resource=externalsecrets,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,sparkoperator.k8s.io/v1beta2, Resource=scheduledsparkapplications,sparkoperator.k8s.io/v1beta2, Resource=sparkapplications,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments"
I0605 13:34:19.723247 1 discovery.go:262] "discovery finished, cache updated"
Environment:
- kube-state-metrics version: v2.15.0
  containerID: containerd://98d67e9631516c47ab8eda282c667146ef9101e19f450ee7a35c09fbe3e328a7
  image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0
  imageID: registry.k8s.io/kube-state-metrics/kube-state-metrics@sha256:db384bf43222b066c378e77027a675d4cd9911107adba46c2922b3a55e10d6fb
- Kubernetes version (use kubectl version): 1.31.7-gke.1265000
- Cloud provider or hardware configuration: GKE