Skip to content

Commit af455c2

Browse files
authored
Add new EFA resource allocation metrics (#342)
1 parent 726f1bd commit af455c2

File tree

7 files changed

+210
-1
lines changed

7 files changed

+210
-1
lines changed

internal/aws/containerinsight/const.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,13 @@ const (
155155
NeuroncoreUnreservedCapacity = "neuroncore_unreserved_capacity"
156156
NeuroncoreAvailableCapacity = "neuroncore_available_capacity"
157157

158+
EfaLimit = "efa_limit"
159+
EfaUsageTotal = "efa_usage_total"
160+
EfaRequest = "efa_request"
161+
EfaReservedCapacity = "efa_reserved_capacity"
162+
EfaUnreservedCapacity = "efa_unreserved_capacity"
163+
EfaAvailableCapacity = "efa_available_capacity"
164+
158165
HyperPodUnschedulablePendingReplacement = "unschedulable_pending_replacement"
159166
HyperPodUnschedulablePendingReboot = "unschedulable_pending_reboot"
160167
HyperPodSchedulable = "schedulable"
@@ -363,14 +370,20 @@ func init() {
363370
ContainerCount: UnitCount,
364371
ContainerRestartCount: UnitCount,
365372
RunningTaskCount: UnitCount,
366-
367373
EfaRdmaReadBytes: UnitBytesPerSec,
368374
EfaRdmaWriteBytes: UnitBytesPerSec,
369375
EfaRdmaWriteRecvBytes: UnitBytesPerSec,
370376
EfaRxBytes: UnitBytesPerSec,
371377
EfaRxDropped: UnitCountPerSec,
372378
EfaTxBytes: UnitBytesPerSec,
373379

380+
EfaLimit: UnitCount,
381+
EfaUsageTotal: UnitCount,
382+
EfaRequest: UnitCount,
383+
EfaReservedCapacity: UnitPercent,
384+
EfaUnreservedCapacity: UnitPercent,
385+
EfaAvailableCapacity: UnitCount,
386+
374387
GpuLimit: UnitCount,
375388
GpuUsageTotal: UnitCount,
376389
GpuRequest: UnitCount,

internal/aws/containerinsight/utils_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,12 @@ func TestConvertToOTLPMetricsForNodeMetrics(t *testing.T) {
443443
"node_gpu_reserved_capacity": 3.0093851356081194,
444444
"node_gpu_unreserved_capacity": 2.9303736689169724,
445445
"node_gpu_available_capacity": uint64(35),
446+
"node_efa_request": int32(2),
447+
"node_efa_limit": int32(4),
448+
"node_efa_usage_total": int32(3),
449+
"node_efa_reserved_capacity": 50.0,
450+
"node_efa_unreserved_capacity": 50.0,
451+
"node_efa_available_capacity": uint64(2),
446452
}
447453
expectedUnits = map[string]string{
448454
"node_cpu_limit": "",
@@ -485,6 +491,12 @@ func TestConvertToOTLPMetricsForNodeMetrics(t *testing.T) {
485491
"node_gpu_reserved_capacity": UnitPercent,
486492
"node_gpu_unreserved_capacity": UnitPercent,
487493
"node_gpu_available_capacity": UnitCount,
494+
"node_efa_request": UnitCount,
495+
"node_efa_limit": UnitCount,
496+
"node_efa_usage_total": UnitCount,
497+
"node_efa_reserved_capacity": UnitPercent,
498+
"node_efa_unreserved_capacity": UnitPercent,
499+
"node_efa_available_capacity": UnitCount,
488500
}
489501
tags = map[string]string{
490502
"AutoScalingGroupName": "eks-a6bb9db9-267c-401c-db55-df8ef645b06f",
@@ -738,6 +750,11 @@ func TestConvertToOTLPMetricsForPodMetrics(t *testing.T) {
738750
"pod_gpu_limit": 1,
739751
"pod_gpu_usage_total": 1,
740752
"pod_gpu_reserved_capacity": 2.3677681271483983,
753+
"pod_efa_request": 1,
754+
"pod_efa_limit": 2,
755+
"pod_efa_usage_total": 2,
756+
"pod_efa_reserved_capacity": 50.0,
757+
"pod_efa_available_capacity": uint64(3),
741758
}
742759
expectedUnits = map[string]string{
743760
"pod_cpu_limit": "",
@@ -788,6 +805,11 @@ func TestConvertToOTLPMetricsForPodMetrics(t *testing.T) {
788805
"pod_gpu_limit": UnitCount,
789806
"pod_gpu_usage_total": UnitCount,
790807
"pod_gpu_reserved_capacity": UnitPercent,
808+
"pod_efa_request": UnitCount,
809+
"pod_efa_limit": UnitCount,
810+
"pod_efa_usage_total": UnitCount,
811+
"pod_efa_reserved_capacity": UnitPercent,
812+
"pod_efa_available_capacity": UnitCount,
791813
}
792814
tags = map[string]string{
793815
"ClusterName": "eks-aoc",

receiver/awscontainerinsightreceiver/internal/stores/nodeinfo.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ type nodeStats struct {
2323
gpuUsageTotal uint64
2424
neuroncoreReq uint64
2525
neuroncoreUsageTotal uint64
26+
efaReq uint64
27+
efaUsageTotal uint64
2628
}
2729

2830
type nodeInfo struct {
@@ -120,6 +122,15 @@ func (n *nodeInfo) getNodeStatusCapacityGPUs() (uint64, bool) {
120122
return forceConvertToInt64(gpus, n.logger), true
121123
}
122124

125+
func (n *nodeInfo) getNodeStatusCapacityEfas() (uint64, bool) {
126+
capacityResources, ok := n.provider.NodeToCapacityMap()[n.nodeName]
127+
if !ok {
128+
return 0, false
129+
}
130+
efas := capacityResources.Name(resourceSpecEfaKey, resource.DecimalExponent).Value()
131+
return forceConvertToInt64(efas, n.logger), true
132+
}
133+
123134
func (n *nodeInfo) getNeuronResourceCapacity(resourceKey v1.ResourceName) (uint64, bool) {
124135
capacityResources, ok := n.provider.NodeToCapacityMap()[n.nodeName]
125136
if !ok {

receiver/awscontainerinsightreceiver/internal/stores/podstore.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ const (
3232
neuronKey = "aws.amazon.com/neuron"
3333
neuroncoreKey = "aws.amazon.com/neuroncore"
3434
neuronDeviceKey = "aws.amazon.com/neurondevice"
35+
efaKey = "vpc.amazonaws.com/efa"
3536
splitRegexStr = "\\.|-"
3637
kubeProxy = "kube-proxy"
3738
)
@@ -251,6 +252,7 @@ func (p *PodStore) Decorate(ctx context.Context, metric CIMetric, kubernetesBlob
251252
p.decorateMem(metric, &entry.pod)
252253
p.decorateGPU(metric, &entry.pod)
253254
p.decorateNeuron(metric, &entry.pod)
255+
p.decorateEfa(metric, &entry.pod)
254256
p.addStatus(metric, &entry.pod)
255257
addContainerCount(metric, &entry.pod)
256258
addContainerID(&entry.pod, metric, kubernetesBlob, p.logger)
@@ -304,6 +306,8 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
304306
var gpuUsageTotal uint64
305307
var neuroncoreRequest uint64
306308
var neuroncoreUsageTotal uint64
309+
var efaRequest uint64
310+
var efaUsageTotal uint64
307311

308312
for i := range podList {
309313
pod := podList[i]
@@ -331,6 +335,13 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
331335
gpuUsageTotal += tmpGpuLimit
332336
}
333337
}
338+
if tmpEfaLimit, ok := getResourceSettingForPod(&pod, 0, efaKey, getLimitForContainer); ok {
339+
tmpEfaReq, _ := getResourceSettingForPod(&pod, 0, efaKey, getRequestForContainer)
340+
efaRequest += tmpEfaReq
341+
if pod.Status.Phase == corev1.PodRunning {
342+
efaUsageTotal += tmpEfaLimit
343+
}
344+
}
334345
}
335346
if pod.Status.Phase == corev1.PodRunning {
336347
podCount++
@@ -357,6 +368,8 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
357368
gpuUsageTotal: gpuUsageTotal,
358369
neuroncoreReq: neuroncoreRequest,
359370
neuroncoreUsageTotal: neuroncoreUsageTotal,
371+
efaReq: efaRequest,
372+
efaUsageTotal: efaUsageTotal,
360373
})
361374
}
362375

@@ -437,6 +450,17 @@ func (p *PodStore) decorateNode(metric CIMetric) {
437450
metric.AddField(ci.MetricName(ci.TypeNode, ci.NeuroncoreUnreservedCapacity), 100.0-reservedCapacity)
438451
metric.AddField(ci.MetricName(ci.TypeNode, ci.NeuroncoreAvailableCapacity), nodeStatusCapacityNeuroncore-nodeStats.neuroncoreReq)
439452
}
453+
454+
if nodeStatusCapacityEfas, ok := p.nodeInfo.getNodeStatusCapacityEfas(); ok && nodeStatusCapacityEfas != 0 {
455+
metric.AddField(ci.MetricName(ci.TypeNode, ci.EfaRequest), nodeStats.efaReq)
456+
metric.AddField(ci.MetricName(ci.TypeNode, ci.EfaLimit), nodeStatusCapacityEfas)
457+
metric.AddField(ci.MetricName(ci.TypeNode, ci.EfaUsageTotal), nodeStats.efaUsageTotal)
458+
459+
reservedCapacity := float64(nodeStats.efaReq) / float64(nodeStatusCapacityEfas) * 100
460+
metric.AddField(ci.MetricName(ci.TypeNode, ci.EfaReservedCapacity), reservedCapacity)
461+
metric.AddField(ci.MetricName(ci.TypeNode, ci.EfaUnreservedCapacity), 100.0-reservedCapacity)
462+
metric.AddField(ci.MetricName(ci.TypeNode, ci.EfaAvailableCapacity), nodeStatusCapacityEfas-nodeStats.efaReq)
463+
}
440464
}
441465
}
442466
}
@@ -511,6 +535,26 @@ func (p *PodStore) decorateNeuron(metric CIMetric, pod *corev1.Pod) {
511535
}
512536
}
513537

538+
func (p *PodStore) decorateEfa(metric CIMetric, pod *corev1.Pod) {
539+
if p.includeEnhancedMetrics && p.enableAcceleratedComputeMetrics && metric.GetTag(ci.MetricType) == ci.TypePod &&
540+
pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed {
541+
if podEfaLimit, ok := getResourceSettingForPod(pod, 0, efaKey, getLimitForContainer); ok {
542+
podEfaRequest, _ := getResourceSettingForPod(pod, 0, efaKey, getRequestForContainer)
543+
metric.AddField(ci.MetricName(ci.TypePod, ci.EfaRequest), podEfaRequest)
544+
metric.AddField(ci.MetricName(ci.TypePod, ci.EfaLimit), podEfaLimit)
545+
var podEfaUsageTotal uint64
546+
if pod.Status.Phase == corev1.PodRunning { // Set the efa limit as the usage_total for running pods only
547+
podEfaUsageTotal = podEfaLimit
548+
}
549+
metric.AddField(ci.MetricName(ci.TypePod, ci.EfaUsageTotal), podEfaUsageTotal)
550+
if nodeStatusCapacityEfas, ok := p.nodeInfo.getNodeStatusCapacityEfas(); ok && nodeStatusCapacityEfas != 0 {
551+
reservedCapacity := float64(podEfaLimit) / float64(nodeStatusCapacityEfas) * 100
552+
metric.AddField(ci.MetricName(ci.TypePod, ci.EfaReservedCapacity), reservedCapacity)
553+
}
554+
}
555+
}
556+
}
557+
514558
func (p *PodStore) decorateCPU(metric CIMetric, pod *corev1.Pod) {
515559
if metric.GetTag(ci.MetricType) == ci.TypePod {
516560
// add cpu limit and request for pod cpu

receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,13 @@ func TestPodStore_decorateNode(t *testing.T) {
11091109
assert.Equal(t, float64(100), metric.GetField("node_neuroncore_unreserved_capacity").(float64))
11101110
assert.Equal(t, uint64(32), metric.GetField("node_neuroncore_available_capacity").(uint64))
11111111

1112+
assert.Equal(t, uint64(0), metric.GetField("node_efa_request").(uint64))
1113+
assert.Equal(t, uint64(4), metric.GetField("node_efa_limit").(uint64))
1114+
assert.Equal(t, uint64(0), metric.GetField("node_efa_usage_total").(uint64))
1115+
assert.Equal(t, float64(0), metric.GetField("node_efa_reserved_capacity").(float64))
1116+
assert.Equal(t, float64(100), metric.GetField("node_efa_unreserved_capacity").(float64))
1117+
assert.Equal(t, uint64(4), metric.GetField("node_efa_available_capacity").(uint64))
1118+
11121119
assert.Equal(t, uint64(1), metric.GetField("node_status_condition_ready").(uint64))
11131120
assert.Equal(t, uint64(0), metric.GetField("node_status_condition_disk_pressure").(uint64))
11141121
assert.Equal(t, uint64(0), metric.GetField("node_status_condition_memory_pressure").(uint64))
@@ -1346,3 +1353,112 @@ func generateRawCIMetric() CIMetric {
13461353
fields := map[string]any{ci.MetricName(ci.TypePod, ci.CPUTotal): float64(1)}
13471354
return generateMetric(fields, tags)
13481355
}
1356+
1357+
func getPodStoreWithEfaCapacity() *PodStore {
1358+
nodeInfo := newNodeInfo("testNode1", &mockNodeInfoProvider{}, zap.NewNop())
1359+
nodeInfo.setCPUCapacity(4000)
1360+
nodeInfo.setMemCapacity(400 * 1024 * 1024)
1361+
return &PodStore{
1362+
cache: newMapWithExpiry(time.Minute),
1363+
nodeInfo: nodeInfo,
1364+
prevMeasurements: sync.Map{},
1365+
logger: zap.NewNop(),
1366+
}
1367+
}
1368+
1369+
func TestPodStore_decorateEfa(t *testing.T) {
1370+
podStore := getPodStoreWithEfaCapacity()
1371+
defer require.NoError(t, podStore.Shutdown())
1372+
1373+
pod := getBaseTestPodInfo()
1374+
pod.Spec.Containers[0].Resources.Requests["vpc.amazonaws.com/efa"] = resource.MustParse("1")
1375+
pod.Spec.Containers[0].Resources.Limits["vpc.amazonaws.com/efa"] = resource.MustParse("2")
1376+
1377+
podList := []corev1.Pod{*pod}
1378+
podStore.refreshInternal(time.Now(), podList)
1379+
1380+
tags := map[string]string{ci.MetricType: ci.TypePod}
1381+
fields := map[string]any{}
1382+
1383+
metric := generateMetric(fields, tags)
1384+
podStore.includeEnhancedMetrics = true
1385+
podStore.enableAcceleratedComputeMetrics = true
1386+
podStore.decorateEfa(metric, pod)
1387+
1388+
assert.Equal(t, uint64(1), metric.GetField("pod_efa_request").(uint64))
1389+
assert.Equal(t, uint64(2), metric.GetField("pod_efa_limit").(uint64))
1390+
assert.Equal(t, uint64(2), metric.GetField("pod_efa_usage_total").(uint64))
1391+
assert.Equal(t, float64(50.0), metric.GetField("pod_efa_reserved_capacity").(float64))
1392+
}
1393+
1394+
func TestPodStore_decorateNode_withMultipleEfaPods(t *testing.T) {
1395+
t.Setenv(ci.HostName, "testNode1")
1396+
podStore := getPodStoreWithEfaCapacity()
1397+
defer require.NoError(t, podStore.Shutdown())
1398+
1399+
pod1 := getBaseTestPodInfo()
1400+
pod1.Name = "efa-pod-1"
1401+
pod1.Spec.Containers[0].Resources.Requests["vpc.amazonaws.com/efa"] = resource.MustParse("1")
1402+
pod1.Spec.Containers[0].Resources.Limits["vpc.amazonaws.com/efa"] = resource.MustParse("2")
1403+
1404+
pod2 := getBaseTestPodInfo()
1405+
pod2.Name = "efa-pod-2"
1406+
pod2.Spec.Containers[0].Resources.Requests["vpc.amazonaws.com/efa"] = resource.MustParse("2")
1407+
pod2.Spec.Containers[0].Resources.Limits["vpc.amazonaws.com/efa"] = resource.MustParse("1")
1408+
pod2.Spec.Containers[0].Resources.Requests["vpc.amazonaws.com/efa"] = resource.MustParse("2")
1409+
pod2.Spec.Containers[0].Resources.Limits["vpc.amazonaws.com/efa"] = resource.MustParse("1")
1410+
1411+
podList := []corev1.Pod{*pod1, *pod2}
1412+
podStore.refreshInternal(time.Now(), podList)
1413+
1414+
tags := map[string]string{ci.MetricType: ci.TypeNode}
1415+
fields := map[string]any{
1416+
ci.MetricName(ci.TypeNode, ci.CPUTotal): float64(1000),
1417+
ci.MetricName(ci.TypeNode, ci.MemWorkingset): float64(2048),
1418+
}
1419+
1420+
metric := generateMetric(fields, tags)
1421+
podStore.includeEnhancedMetrics = true
1422+
podStore.enableAcceleratedComputeMetrics = true
1423+
1424+
podStore.decorateNode(metric)
1425+
1426+
assert.Equal(t, uint64(3), metric.GetField("node_efa_request").(uint64))
1427+
assert.Equal(t, uint64(4), metric.GetField("node_efa_limit").(uint64))
1428+
assert.Equal(t, uint64(3), metric.GetField("node_efa_usage_total").(uint64))
1429+
assert.Equal(t, float64(75.0), metric.GetField("node_efa_reserved_capacity").(float64))
1430+
assert.Equal(t, float64(25.0), metric.GetField("node_efa_unreserved_capacity").(float64))
1431+
assert.Equal(t, uint64(1), metric.GetField("node_efa_available_capacity").(uint64))
1432+
}
1433+
1434+
func TestPodStore_decorateNode_withEfa(t *testing.T) {
1435+
t.Setenv(ci.HostName, "testNode1")
1436+
pod := getBaseTestPodInfo()
1437+
pod.Spec.Containers[0].Resources.Requests["vpc.amazonaws.com/efa"] = resource.MustParse("1")
1438+
pod.Spec.Containers[0].Resources.Limits["vpc.amazonaws.com/efa"] = resource.MustParse("2")
1439+
1440+
podList := []corev1.Pod{*pod}
1441+
podStore := getPodStoreWithEfaCapacity()
1442+
defer require.NoError(t, podStore.Shutdown())
1443+
podStore.refreshInternal(time.Now(), podList)
1444+
1445+
tags := map[string]string{ci.MetricType: ci.TypeNode}
1446+
fields := map[string]any{
1447+
ci.MetricName(ci.TypeNode, ci.CPUTotal): float64(100),
1448+
ci.MetricName(ci.TypeNode, ci.CPULimit): uint64(4000),
1449+
ci.MetricName(ci.TypeNode, ci.MemWorkingset): float64(100 * 1024 * 1024),
1450+
ci.MetricName(ci.TypeNode, ci.MemLimit): uint64(400 * 1024 * 1024),
1451+
}
1452+
1453+
metric := generateMetric(fields, tags)
1454+
podStore.includeEnhancedMetrics = true
1455+
podStore.enableAcceleratedComputeMetrics = true
1456+
podStore.decorateNode(metric)
1457+
1458+
assert.Equal(t, uint64(1), metric.GetField("node_efa_request").(uint64))
1459+
assert.Equal(t, uint64(4), metric.GetField("node_efa_limit").(uint64))
1460+
assert.Equal(t, uint64(2), metric.GetField("node_efa_usage_total").(uint64))
1461+
assert.Equal(t, float64(25.0), metric.GetField("node_efa_reserved_capacity").(float64))
1462+
assert.Equal(t, float64(75.0), metric.GetField("node_efa_unreserved_capacity").(float64))
1463+
assert.Equal(t, uint64(3), metric.GetField("node_efa_available_capacity").(uint64))
1464+
}

receiver/awscontainerinsightreceiver/internal/stores/utils.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const (
2626
resourceSpecNeuronKey = "aws.amazon.com/neuron"
2727
resourceSpecNeuroncoreKey = "aws.amazon.com/neuroncore"
2828
resourceSpecNeuronDeviceKey = "aws.amazon.com/neurondevice"
29+
resourceSpecEfaKey = "vpc.amazonaws.com/efa"
2930
)
3031

3132
func createPodKeyFromMetaData(pod *corev1.Pod) string {

receiver/awscontainerinsightreceiver/internal/stores/utils_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ func (m *mockNodeInfoProvider) NodeToCapacityMap() map[string]v1.ResourceList {
7272
"aws.amazon.com/neuron": *resource.NewQuantity(16, resource.DecimalSI),
7373
"aws.amazon.com/neuroncore": *resource.NewQuantity(32, resource.DecimalSI),
7474
"aws.amazon.com/neurondevice": *resource.NewQuantity(16, resource.DecimalSI),
75+
"vpc.amazonaws.com/efa": *resource.NewQuantity(4, resource.DecimalSI),
7576
},
7677
"testNode2": {
7778
"pods": *resource.NewQuantity(10, resource.DecimalSI),
@@ -87,6 +88,7 @@ func (m *mockNodeInfoProvider) NodeToAllocatableMap() map[string]v1.ResourceList
8788
"aws.amazon.com/neuron": *resource.NewQuantity(16, resource.DecimalSI),
8889
"aws.amazon.com/neuroncore": *resource.NewQuantity(32, resource.DecimalSI),
8990
"aws.amazon.com/neurondevice": *resource.NewQuantity(16, resource.DecimalSI),
91+
"vpc.amazonaws.com/efa": *resource.NewQuantity(4, resource.DecimalSI),
9092
},
9193
"testNode2": {
9294
"pods": *resource.NewQuantity(20, resource.DecimalSI),

0 commit comments

Comments
 (0)