[tempo-distributed] added more autoscaling configurations to Tempo components

jordan-simonovski · jordan-simonovski · commit 8b5bcfc6dd88 · 2025-09-18T22:21:51.000+10:00
diff --git a/charts/tempo-distributed/Chart.yaml b/charts/tempo-distributed/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: tempo-distributed
 description: Grafana Tempo in MicroService mode
 type: application
-version: 1.47.4
+version: 1.48.0
 appVersion: 2.8.2
 engine: gotpl
 home: https://grafana.com/docs/tempo/latest/
diff --git a/charts/tempo-distributed/README.md b/charts/tempo-distributed/README.md
@@ -361,6 +361,8 @@ The memcached default args are removed and should be provided manually. The sett
 | distributor.appProtocol.grpc | string | `nil` | Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" |
 | distributor.autoscaling.behavior | object | `{}` | Autoscaling behavior configuration for the distributor |
 | distributor.autoscaling.enabled | bool | `false` | Enable autoscaling for the distributor |
+| distributor.autoscaling.keda | object | `{"enabled":false,"triggers":[]}` | Autoscaling via keda/ScaledObject |
+| distributor.autoscaling.keda.triggers | list | `[]` | List of autoscaling triggers for the distributor |
 | distributor.autoscaling.maxReplicas | int | `3` | Maximum autoscaling replicas for the distributor |
 | distributor.autoscaling.minReplicas | int | `1` | Minimum autoscaling replicas for the distributor |
 | distributor.autoscaling.targetCPUUtilizationPercentage | int | `60` | Target CPU utilisation percentage for the distributor |
@@ -573,6 +575,8 @@ The memcached default args are removed and should be provided manually. The sett
 | ingester.appProtocol.grpc | string | `nil` | Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" |
 | ingester.autoscaling.behavior | object | `{}` | Autoscaling behavior configuration for the ingester |
 | ingester.autoscaling.enabled | bool | `false` | Enable autoscaling for the ingester. WARNING: Autoscaling ingesters can result in lost data. Only do this if you know what you're doing. |
+| ingester.autoscaling.keda | object | `{"enabled":false,"triggers":[]}` | Autoscaling via keda/ScaledObject |
+| ingester.autoscaling.keda.triggers | list | `[]` | List of autoscaling triggers for the ingester |
 | ingester.autoscaling.maxReplicas | int | `3` | Maximum autoscaling replicas for the ingester |
 | ingester.autoscaling.minReplicas | int | `2` | Minimum autoscaling replicas for the ingester |
 | ingester.autoscaling.targetCPUUtilizationPercentage | int | `60` | Target CPU utilisation percentage for the ingester |
@@ -723,6 +727,14 @@ The memcached default args are removed and should be provided manually. The sett
 | metricsGenerator.annotations | object | `{}` | Annotations for the metrics-generator StatefulSet |
 | metricsGenerator.appProtocol | object | `{"grpc":null}` | Adds the appProtocol field to the metricsGenerator service. This allows metricsGenerator to work with istio protocol selection. |
 | metricsGenerator.appProtocol.grpc | string | `nil` | Set the optional grpc service protocol. Ex: "grpc", "http2" or "https" |
+| metricsGenerator.autoscaling.behavior | object | `{}` | Autoscaling behavior configuration for the metrics-generator |
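+| metricsGenerator.autoscaling.enabled | bool | `false` | Enable autoscaling for the metrics-generator. WARNING: Autoscaling metrics-generators can result in lost data. Only do this if you know what you're doing. Scaling down metrics-generators can cause backpressure on the distributor. |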
+| metricsGenerator.autoscaling.keda | object | `{"enabled":false,"triggers":[]}` | Autoscaling via keda/ScaledObject |
+| metricsGenerator.autoscaling.keda.triggers | list | `[]` | List of autoscaling triggers for the metrics-generator |
+| metricsGenerator.autoscaling.maxReplicas | int | `3` | Maximum autoscaling replicas for the metrics-generator |
+| metricsGenerator.autoscaling.minReplicas | int | `2` | Minimum autoscaling replicas for the metrics-generator |
+| metricsGenerator.autoscaling.targetCPUUtilizationPercentage | int | `60` | Target CPU utilisation percentage for the metrics-generator |
+| metricsGenerator.autoscaling.targetMemoryUtilizationPercentage | string | `nil` | Target memory utilisation percentage for the metrics-generator |
 | metricsGenerator.config | object | `{"metrics_ingestion_time_range_slack":"30s","processor":{"service_graphs":{"dimensions":[],"histogram_buckets":[0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8],"max_items":10000,"wait":"10s","workers":10},"span_metrics":{"dimensions":[],"histogram_buckets":[0.002,0.004,0.008,0.016,0.032,0.064,0.128,0.256,0.512,1.02,2.05,4.1]}},"registry":{"collection_interval":"15s","external_labels":{},"stale_duration":"15m"},"storage":{"path":"/var/tempo/wal","remote_write":[],"remote_write_add_org_id_header":true,"remote_write_flush_deadline":"1m","wal":null},"traces_storage":{"path":"/var/tempo/traces"}}` | More information on configuration: https://grafana.com/docs/tempo/latest/configuration/#metrics-generator |
 | metricsGenerator.config.processor.service_graphs | object | `{"dimensions":[],"histogram_buckets":[0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8],"max_items":10000,"wait":"10s","workers":10}` | For processors to be enabled and generate metrics, pass the names of the processors to `overrides.defaults.metrics_generator.processors` value like `[service-graphs, span-metrics]`. |
 | metricsGenerator.config.processor.service_graphs.dimensions | list | `[]` | The resource and span attributes to be added to the service graph metrics, if present. |
diff --git a/charts/tempo-distributed/templates/distributor/keda-scaledObject.yaml b/charts/tempo-distributed/templates/distributor/keda-scaledObject.yaml
@@ -0,0 +1,34 @@
+{{- /*
+KEDA ScaledObject that scales the distributor Deployment.
+Rendered only when distributor.autoscaling.enabled and
+distributor.autoscaling.keda.enabled are both true.
+*/ -}}
+{{- if and .Values.distributor.autoscaling.enabled .Values.distributor.autoscaling.keda.enabled }}
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: {{ include "tempo.resourceName" (dict "ctx" . "component" "distributor") }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tempo.labels" (dict "ctx" . "component" "distributor") | nindent 4 }}
+spec:
+  minReplicaCount: {{ .Values.distributor.autoscaling.minReplicas }}
+  maxReplicaCount: {{ .Values.distributor.autoscaling.maxReplicas }}
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tempo.resourceName" (dict "ctx" . "component" "distributor") }}
+  triggers:
+  {{- range .Values.distributor.autoscaling.keda.triggers }}
+  - type: {{ .type | quote }}
+    metadata:
+      {{- /* Scalar metadata values are quoted so they always render as YAML strings. */}}
+      serverAddress: {{ .metadata.serverAddress | quote }}
+      threshold: {{ .metadata.threshold | quote }}
+      query: |
+        {{- .metadata.query | nindent 8 }}
+      {{- with .metadata.customHeaders }}
+      customHeaders: {{ . | quote }}
+      {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/charts/tempo-distributed/templates/ingester/keda-scaledObject.yaml b/charts/tempo-distributed/templates/ingester/keda-scaledObject.yaml
@@ -0,0 +1,35 @@
+{{- /*
+KEDA ScaledObject that scales the ingester workload.
+Rendered only when ingester.autoscaling.enabled and
+ingester.autoscaling.keda.enabled are both true.
+*/ -}}
+{{- if and .Values.ingester.autoscaling.enabled .Values.ingester.autoscaling.keda.enabled }}
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: {{ include "tempo.resourceName" (dict "ctx" . "component" "ingester") }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tempo.labels" (dict "ctx" . "component" "ingester") | nindent 4 }}
+spec:
+  minReplicaCount: {{ .Values.ingester.autoscaling.minReplicas }}
+  maxReplicaCount: {{ .Values.ingester.autoscaling.maxReplicas }}
+  scaleTargetRef:
+    apiVersion: apps/v1
+    {{- /* NOTE(review): the ingester is expected to be a StatefulSet in this chart, not a Deployment - confirm against templates/ingester. */}}
+    kind: StatefulSet
+    name: {{ include "tempo.resourceName" (dict "ctx" . "component" "ingester") }}
+  triggers:
+  {{- range .Values.ingester.autoscaling.keda.triggers }}
+  - type: {{ .type | quote }}
+    metadata:
+      {{- /* Scalar metadata values are quoted so they always render as YAML strings. */}}
+      serverAddress: {{ .metadata.serverAddress | quote }}
+      threshold: {{ .metadata.threshold | quote }}
+      query: |
+        {{- .metadata.query | nindent 8 }}
+      {{- with .metadata.customHeaders }}
+      customHeaders: {{ . | quote }}
+      {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/charts/tempo-distributed/templates/metrics-generator/hpa.yaml b/charts/tempo-distributed/templates/metrics-generator/hpa.yaml
@@ -0,0 +1,55 @@
+{{- /*
+HorizontalPodAutoscaler for the metrics-generator.
+Rendered when metricsGenerator.autoscaling.enabled is true and
+metricsGenerator.autoscaling.keda.enabled is false. KEDA creates and owns an
+HPA for every ScaledObject, so rendering this one as well would leave two
+autoscalers competing for the same workload.
+*/ -}}
+{{- if and .Values.metricsGenerator.autoscaling.enabled (not .Values.metricsGenerator.autoscaling.keda.enabled) }}
+{{- $apiVersion := include "tempo.hpa.apiVersion" . -}}
+apiVersion: {{ $apiVersion }}
+kind: HorizontalPodAutoscaler
+metadata:
+  {{- /* The component is spelled "metrics-generator": Kubernetes object names must be lowercase. */}}
+  name: {{ include "tempo.resourceName" (dict "ctx" . "component" "metrics-generator") }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tempo.labels" (dict "ctx" . "component" "metrics-generator") | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    {{- /* NOTE(review): assumes the chart exposes metricsGenerator.kind (Deployment or StatefulSet); falls back to the previously hard-coded kind when unset - confirm. */}}
+    kind: {{ .Values.metricsGenerator.kind | default "StatefulSet" }}
+    name: {{ include "tempo.resourceName" (dict "ctx" . "component" "metrics-generator") }}
+  minReplicas: {{ .Values.metricsGenerator.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.metricsGenerator.autoscaling.maxReplicas }}
+  {{- with .Values.metricsGenerator.autoscaling.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  metrics:
+  {{- with .Values.metricsGenerator.autoscaling.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        {{- if (eq $apiVersion "autoscaling/v2") }}
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+        {{- else }}
+        targetAverageUtilization: {{ . }}
+        {{- end }}
+  {{- end }}
+  {{- with .Values.metricsGenerator.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        {{- if (eq $apiVersion "autoscaling/v2") }}
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+        {{- else }}
+        targetAverageUtilization: {{ . }}
+        {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/charts/tempo-distributed/templates/metrics-generator/keda-scaledObject.yaml b/charts/tempo-distributed/templates/metrics-generator/keda-scaledObject.yaml
@@ -0,0 +1,36 @@
+{{- /*
+KEDA ScaledObject that scales the metrics-generator workload.
+Rendered only when metricsGenerator.autoscaling.enabled and
+metricsGenerator.autoscaling.keda.enabled are both true.
+*/ -}}
+{{- if and .Values.metricsGenerator.autoscaling.enabled .Values.metricsGenerator.autoscaling.keda.enabled }}
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  {{- /* The component is spelled "metrics-generator": Kubernetes object names must be lowercase. */}}
+  name: {{ include "tempo.resourceName" (dict "ctx" . "component" "metrics-generator") }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "tempo.labels" (dict "ctx" . "component" "metrics-generator") | nindent 4 }}
+spec:
+  minReplicaCount: {{ .Values.metricsGenerator.autoscaling.minReplicas }}
+  maxReplicaCount: {{ .Values.metricsGenerator.autoscaling.maxReplicas }}
+  scaleTargetRef:
+    apiVersion: apps/v1
+    {{- /* NOTE(review): assumes the chart exposes metricsGenerator.kind (Deployment or StatefulSet); falls back to the previously hard-coded kind when unset - confirm. */}}
+    kind: {{ .Values.metricsGenerator.kind | default "Deployment" }}
+    name: {{ include "tempo.resourceName" (dict "ctx" . "component" "metrics-generator") }}
+  triggers:
+  {{- range .Values.metricsGenerator.autoscaling.keda.triggers }}
+  - type: {{ .type | quote }}
+    metadata:
+      {{- /* Scalar metadata values are quoted so they always render as YAML strings. */}}
+      serverAddress: {{ .metadata.serverAddress | quote }}
+      threshold: {{ .metadata.threshold | quote }}
+      query: |
+        {{- .metadata.query | nindent 8 }}
+      {{- with .metadata.customHeaders }}
+      customHeaders: {{ . | quote }}
+      {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/charts/tempo-distributed/values.yaml b/charts/tempo-distributed/values.yaml
@@ -150,6 +150,22 @@ ingester:
     targetCPUUtilizationPercentage: 60
     # -- Target memory utilisation percentage for the ingester
     targetMemoryUtilizationPercentage:
+    # -- Autoscaling via keda/ScaledObject
+    keda:
+      # requires https://keda.sh/
+      enabled: false
+      # -- List of autoscaling triggers for the ingester
+      triggers: []
+      # - type: prometheus
+      #   metadata:
+      #     serverAddress: "http://<prometheus-host>:9090"
+      #     threshold: "<set to a value below your rate limit>"
+      # NOTE: KEDA averages the query result over the current replica count, so the query should return a total rather than a per-replica value.
+      #     query: |-
+      #       sum(
+      #         rate(tempo_ingester_traces_created_total{namespace=~".*"}[5m])
+      #       ) by (name)
+      #     customHeaders: X-Scope-OrgID=<tenant-id>
   image:
     # -- The Docker registry for the ingester image. Overrides `tempo.image.registry`
     registry: null
@@ -337,6 +353,37 @@ metricsGenerator:
   #      - domain.tld
   # -- Init containers for the metrics generator pod
   initContainers: []
+  autoscaling:
+    # -- Enable autoscaling for the metrics-generator. WARNING: Autoscaling metrics-generators can result in lost data. Only do this if you know what you're doing.
+    # Scaling down metrics-generators can cause backpressure on the distributor.
+    enabled: false
+    # -- Minimum autoscaling replicas for the metrics-generator
+    minReplicas: 2
+    # -- Maximum autoscaling replicas for the metrics-generator
+    maxReplicas: 3
+    # -- Autoscaling behavior configuration for the metrics-generator
+    behavior: {}
+    # -- Target CPU utilisation percentage for the metrics-generator
+    targetCPUUtilizationPercentage: 60
+    # -- Target memory utilisation percentage for the metrics-generator
+    targetMemoryUtilizationPercentage:
+    # -- Autoscaling via keda/ScaledObject
+    keda:
+      # requires https://keda.sh/
+      enabled: false
+      # -- List of autoscaling triggers for the metrics-generator
+      triggers: []
+      # - type: prometheus
+      #   metadata:
+      #     serverAddress: "http://<prometheus-host>:9090"
+      #     threshold: "<set to a value below your rate limit>"
+      # NOTE: KEDA averages the query result over the current replica count, so the query should return a total rather than a per-replica value.
+      # This example scales on distributor queue length to alleviate backpressure.
+      #     query: |-
+      #       sum(
+      #         tempo_distributor_queue_length{namespace=~".*"}
+      #       ) by (name)
+      #     customHeaders: X-Scope-OrgID=<tenant-id>
   image:
     # -- The Docker registry for the metrics-generator image. Overrides `tempo.image.registry`
     registry: null
@@ -506,6 +553,22 @@ distributor:
     targetCPUUtilizationPercentage: 60
     # -- Target memory utilisation percentage for the distributor
     targetMemoryUtilizationPercentage:
+    # -- Autoscaling via keda/ScaledObject
+    keda:
+      # requires https://keda.sh/
+      enabled: false
+      # -- List of autoscaling triggers for the distributor
+      triggers: []
+      # - type: prometheus
+      #   metadata:
+      #     serverAddress: "http://<prometheus-host>:9090"
+      #     threshold: "<set to a value below your rate limit>"
+      # NOTE: KEDA averages the query result over the current replica count, so the query should return a total rather than a per-replica value.
+      #     query: |-
+      #       sum by(cluster) (
+      #         rate(tempo_distributor_spans_received_total{namespace=~".*"}[5m])
+      #       )
+      #     customHeaders: X-Scope-OrgID=<tenant-id>
   image:
     # -- The Docker registry for the distributor image. Overrides `tempo.image.registry`
     registry: null
@@ -665,12 +728,10 @@ compactor:
       #   metadata:
       #     serverAddress: "http://<prometheus-host>:9090"
       #     threshold: "250"
+      # NOTE: KEDA averages the query result over the current replica count, so the query should return a total rather than a per-replica value.
       #     query: |-
       #       sum by (cluster, namespace, tenant) (
       #         tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}
-      #       ) /
-      #       ignoring(tenant) group_left count by (cluster, namespace)(
-      #         tempo_build_info{container="compactor", namespace=~".*"}
       #       )
       #     customHeaders: X-Scope-OrgID=<tenant-id>