Skip to content

Commit f37c1d5

Browse files
authored
Add Neuron Core and EFA capacity metrics (#1820)
1 parent 562f735 commit f37c1d5

File tree

3 files changed, with 28 additions and 2 deletions.

3 files changed

+28
-2
lines changed

translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,14 @@ exporters:
157157
- pod_gpu_limit
158158
- pod_gpu_usage_total
159159
- pod_gpu_reserved_capacity
160+
- pod_neuroncore_request
161+
- pod_neuroncore_limit
162+
- pod_neuroncore_usage_total
163+
- pod_neuroncore_reserved_capacity
164+
- pod_efa_request
165+
- pod_efa_limit
166+
- pod_efa_usage_total
167+
- pod_efa_reserved_capacity
160168
- dimensions:
161169
- - ClusterName
162170
- InstanceId
@@ -187,6 +195,16 @@ exporters:
187195
- node_gpu_reserved_capacity
188196
- node_gpu_unreserved_capacity
189197
- node_gpu_available_capacity
198+
- node_neuroncore_limit
199+
- node_neuroncore_usage_total
200+
- node_neuroncore_reserved_capacity
201+
- node_neuroncore_unreserved_capacity
202+
- node_neuroncore_available_capacity
203+
- node_efa_limit
204+
- node_efa_usage_total
205+
- node_efa_reserved_capacity
206+
- node_efa_unreserved_capacity
207+
- node_efa_available_capacity
190208
- dimensions:
191209
- - ClusterName
192210
- InstanceId

translator/translate/otel/exporter/awsemf/kubernetes.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,8 @@ func getPodMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
108108
}...)
109109
if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) {
110110
selectors = append(selectors, "pod_gpu_request", "pod_gpu_limit", "pod_gpu_usage_total", "pod_gpu_reserved_capacity")
111+
selectors = append(selectors, "pod_neuroncore_request", "pod_neuroncore_limit", "pod_neuroncore_usage_total", "pod_neuroncore_reserved_capacity")
112+
selectors = append(selectors, "pod_efa_request", "pod_efa_limit", "pod_efa_usage_total", "pod_efa_reserved_capacity")
111113
}
112114
}
113115

@@ -155,6 +157,8 @@ func getNodeMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDecla
155157
}
156158
if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) {
157159
nodeMetrics = append(nodeMetrics, "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity")
160+
nodeMetrics = append(nodeMetrics, "node_neuroncore_limit", "node_neuroncore_usage_total", "node_neuroncore_reserved_capacity", "node_neuroncore_unreserved_capacity", "node_neuroncore_available_capacity")
161+
nodeMetrics = append(nodeMetrics, "node_efa_limit", "node_efa_usage_total", "node_efa_reserved_capacity", "node_efa_unreserved_capacity", "node_efa_available_capacity")
158162
}
159163
if enhancedContainerInsightsEnabled {
160164
return []*awsemfexporter.MetricDeclaration{

translator/translate/otel/exporter/awsemf/translator_test.go

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -311,6 +311,8 @@ func TestTranslator(t *testing.T) {
311311
"pod_container_status_waiting_reason_image_pull_error", "pod_container_status_waiting_reason_start_error", "pod_container_status_waiting_reason_create_container_error",
312312
"pod_container_status_waiting_reason_create_container_config_error", "pod_container_status_terminated_reason_oom_killed",
313313
"pod_gpu_request", "pod_gpu_limit", "pod_gpu_usage_total", "pod_gpu_reserved_capacity",
314+
"pod_neuroncore_request", "pod_neuroncore_limit", "pod_neuroncore_usage_total", "pod_neuroncore_reserved_capacity",
315+
"pod_efa_request", "pod_efa_limit", "pod_efa_usage_total", "pod_efa_reserved_capacity",
314316
},
315317
},
316318
{
@@ -319,8 +321,10 @@ func TestTranslator(t *testing.T) {
319321
"node_memory_reserved_capacity", "node_number_of_running_pods", "node_number_of_running_containers",
320322
"node_cpu_usage_total", "node_cpu_limit", "node_memory_working_set", "node_memory_limit",
321323
"node_status_condition_ready", "node_status_condition_disk_pressure", "node_status_condition_memory_pressure",
322-
"node_status_condition_pid_pressure", "node_status_condition_network_unavailable", "node_status_condition_unknown",
323-
"node_status_capacity_pods", "node_status_allocatable_pods", "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity"},
324+
"node_status_condition_pid_pressure", "node_status_condition_network_unavailable", "node_status_condition_unknown", "node_status_capacity_pods", "node_status_allocatable_pods",
325+
"node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity",
326+
"node_neuroncore_limit", "node_neuroncore_usage_total", "node_neuroncore_reserved_capacity", "node_neuroncore_unreserved_capacity", "node_neuroncore_available_capacity",
327+
"node_efa_limit", "node_efa_usage_total", "node_efa_reserved_capacity", "node_efa_unreserved_capacity", "node_efa_available_capacity"},
324328
},
325329
{
326330
Dimensions: [][]string{

0 commit comments

Comments
 (0)