feat(k8s-observability-monitoring): add sending_queue size limits to prevent OOM

loafoe · loafoe · commit bc233fee07fd · 2026-03-20T07:56:01.000+01:00
Add configurable queueSize and numConsumers parameters to the OTLP
exporter sending_queue. Without queue_size limits, memory can grow
unbounded during destination outages, causing OOM kills.

Default queueSize set to 500 batches to balance resilience with memory usage.
diff --git a/charts/k8s-observability-monitoring/Chart.yaml b/charts/k8s-observability-monitoring/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: k8s-observability-monitoring
-version: 0.37.0
+version: 0.38.0
 description: Helm chart for k8s-observability-monitoring
 
 # renovate: datasource=helm depName=k8s-monitoring registryUrl=https://grafana.github.io/helm-charts
diff --git a/charts/k8s-observability-monitoring/templates/custom-alloy-configmap.yaml b/charts/k8s-observability-monitoring/templates/custom-alloy-configmap.yaml
@@ -410,6 +410,12 @@ data:
       {{- if $.Values.customAlloy.sendingQueue.enabled }}
       sending_queue {
         enabled = true
+        {{- if $.Values.customAlloy.sendingQueue.queueSize }}
+        queue_size = {{ $.Values.customAlloy.sendingQueue.queueSize }}
+        {{- end }}
+        {{- if $.Values.customAlloy.sendingQueue.numConsumers }}
+        num_consumers = {{ $.Values.customAlloy.sendingQueue.numConsumers }}
+        {{- end }}
       }
       {{- end }}
     }
diff --git a/charts/k8s-observability-monitoring/values.yaml b/charts/k8s-observability-monitoring/values.yaml
@@ -155,10 +155,17 @@ customAlloy:
   liveDebugging:
     # -- Enable live debugging
     enabled: true
-  # -- Sending queue configuration for resilience during destination outages
+  # -- Sending queue configuration for resilience during destination outages.
+  # The queue buffers batches when the destination is unavailable.
+  # Without a queue_size limit, memory can grow unbounded during outages, causing OOM.
   sendingQueue:
     # -- Enable sending queue
     enabled: true
+    # -- Maximum number of batches kept in memory (Alloy default: 1000 if unset).
+    # Lower values prevent OOM during extended outages. Recommended: 100-500.
+    queueSize: 500
+    # -- Number of parallel consumers sending batches (Alloy default: 10 if unset)
+    numConsumers: 10
   # -- Remove high-cardinality attributes to reduce storage costs
   # Matches k8s-monitoring attribute cleanup
   attributeCleanup:

Original file line number	Diff line number	Diff line change
`@@ -410,6 +410,12 @@ data:`
`410`	`410`	`{{- if $.Values.customAlloy.sendingQueue.enabled }}`
`411`	`411`	`sending_queue {`
`412`	`412`	`enabled = true`
	`413`	`+ {{- if $.Values.customAlloy.sendingQueue.queueSize }}`
	`414`	`+ queue_size = {{ $.Values.customAlloy.sendingQueue.queueSize }}`
	`415`	`+ {{- end }}`
	`416`	`+ {{- if $.Values.customAlloy.sendingQueue.numConsumers }}`
	`417`	`+ num_consumers = {{ $.Values.customAlloy.sendingQueue.numConsumers }}`
	`418`	`+ {{- end }}`
`413`	`419`	`}`
`414`	`420`	`{{- end }}`
`415`	`421`	`}`