diff --git a/charts/amazon-cloudwatch-observability/templates/_helpers.tpl b/charts/amazon-cloudwatch-observability/templates/_helpers.tpl
index 99bd128b..65e87aa1 100644
--- a/charts/amazon-cloudwatch-observability/templates/_helpers.tpl
+++ b/charts/amazon-cloudwatch-observability/templates/_helpers.tpl
@@ -248,6 +248,33 @@ Get the current recommended neuron-monitor image for a region
 {{- printf "%s/%s:%s" $imageDomain .Values.neuronMonitor.image.repository .Values.neuronMonitor.image.tag -}}
 {{- end -}}
 
+{{/*
+Render the DCGM_EXPORTER_INTERVAL env entry for dcgmExporter.
+The entry is emitted only when an agent config sets
+logs.metrics_collected.kubernetes.accelerated_compute_gpu_metrics_collection_interval
+to a value below 60; if several agents set it, the last one in .Values.agents wins.
+Renders nothing otherwise.
+*/}}
+{{- define "dcgm-exporter.env" -}}
+{{- $intervalFound := false -}}
+{{- $intervalValue := 0 -}}
+{{- range .Values.agents -}}
+  {{- /* Combine this agent entry with a deep copy of the shared .Values.agent settings. */ -}}
+  {{- $agent := merge . (deepCopy $.Values.agent) -}}
+  {{- $agentConfig := $agent.config | default $agent.defaultConfig -}}
+  {{- /* dig yields the empty dict when any key on the path is absent, so a config without logs, metrics_collected or kubernetes is skipped instead of failing the render on templating engines whose "and" evaluates every argument. */ -}}
+  {{- $kubernetesMetrics := dig "logs" "metrics_collected" "kubernetes" (dict) $agentConfig -}}
+  {{- if hasKey $kubernetesMetrics "accelerated_compute_gpu_metrics_collection_interval" -}}
+    {{- $intervalFound = true -}}
+    {{- $intervalValue = $kubernetesMetrics.accelerated_compute_gpu_metrics_collection_interval -}}
+  {{- end -}}
+{{- end -}}
+{{- if and $intervalFound (lt ($intervalValue | int) 60) -}}
+- name: DCGM_EXPORTER_INTERVAL
+  value: "1000"
+{{- end -}}
+{{- end -}}
+
 {{/*
 Get the current recommended auto instrumentation java image
 */}}
@@ -407,5 +434,3 @@ Get namespaceSelector value for admission webhooks
 {{- end -}}
 {{- end -}}
 {{- end -}}
-
-
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
index 0d485514..f1937992 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
@@ -31,6 +31,7 @@ spec:
     valueFrom:
       fieldRef:
         fieldPath: spec.nodeName
+  {{- include "dcgm-exporter.env" . | nindent 2 }}
   ports:
   - name: "metrics"
     port: {{ .Values.dcgmExporter.service.port }}
@@ -69,4 +70,4 @@ spec:
       cert_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.crt
       key_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.key
   {{- dict "component" .Values.dcgmExporter "context" . | include "amazon-cloudwatch-observability.common.tolerations" | nindent 2 }}
-{{- end }}
\ No newline at end of file
+{{- end }}