Skip to content

Commit 4b4b3ff

Browse files
committed
Scrape more EFA metrics:
- unresponsive_remote_events
- impaired_remote_conn_events
- retrans_timeout_events
- retrans_pkts
- retrans_bytes
1 parent f6af981 commit 4b4b3ff

File tree

3 files changed

+169
-89
lines changed

3 files changed

+169
-89
lines changed

internal/aws/containerinsight/const.go

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,17 @@ const (
140140
DiskIOWrite = "Write"
141141
DiskIOTotal = "Total"
142142

143-
EfaRdmaReadBytes = "rdma_read_bytes"
144-
EfaRdmaWriteBytes = "rdma_write_bytes"
145-
EfaRdmaWriteRecvBytes = "rdma_write_recv_bytes"
146-
EfaRxBytes = "rx_bytes"
147-
EfaRxDropped = "rx_dropped"
148-
EfaTxBytes = "tx_bytes"
143+
EfaRdmaReadBytes = "rdma_read_bytes"
144+
EfaRdmaWriteBytes = "rdma_write_bytes"
145+
EfaRdmaWriteRecvBytes = "rdma_write_recv_bytes"
146+
EfaRxBytes = "rx_bytes"
147+
EfaRxDropped = "rx_dropped"
148+
EfaTxBytes = "tx_bytes"
149+
EfaRetransBytes = "retrans_bytes"
150+
EfaRetransPkts = "retrans_pkts"
151+
EfaRetransTimeoutEvents = "retrans_timeout_events"
152+
EfaUnresponsiveRemoveEvents = "unresponsive_remote_events"
153+
EfaImpairedRemoteConnEvents = "impaired_remote_conn_events"
149154

150155
GpuLimit = "gpu_limit"
151156
GpuUsageTotal = "gpu_usage_total"
@@ -377,17 +382,22 @@ func init() {
377382
// for CPU and CPU cores, but UnitBytes would be more appropriate for resource type memory.
378383

379384
// others
380-
RunningPodCount: UnitCount,
381-
RunningContainerCount: UnitCount,
382-
ContainerCount: UnitCount,
383-
ContainerRestartCount: UnitCount,
384-
RunningTaskCount: UnitCount,
385-
EfaRdmaReadBytes: UnitBytesPerSec,
386-
EfaRdmaWriteBytes: UnitBytesPerSec,
387-
EfaRdmaWriteRecvBytes: UnitBytesPerSec,
388-
EfaRxBytes: UnitBytesPerSec,
389-
EfaRxDropped: UnitCountPerSec,
390-
EfaTxBytes: UnitBytesPerSec,
385+
RunningPodCount: UnitCount,
386+
RunningContainerCount: UnitCount,
387+
ContainerCount: UnitCount,
388+
ContainerRestartCount: UnitCount,
389+
RunningTaskCount: UnitCount,
390+
EfaRdmaReadBytes: UnitBytesPerSec,
391+
EfaRdmaWriteBytes: UnitBytesPerSec,
392+
EfaRdmaWriteRecvBytes: UnitBytesPerSec,
393+
EfaRxBytes: UnitBytesPerSec,
394+
EfaRxDropped: UnitCountPerSec,
395+
EfaTxBytes: UnitBytesPerSec,
396+
EfaRetransBytes: UnitBytes,
397+
EfaRetransPkts: UnitCount,
398+
EfaRetransTimeoutEvents: UnitCount,
399+
EfaUnresponsiveRemoveEvents: UnitCount,
400+
EfaImpairedRemoteConnEvents: UnitCount,
391401

392402
EfaLimit: UnitCount,
393403
EfaUsageTotal: UnitCount,

receiver/awscontainerinsightreceiver/internal/efa/efaSysfs.go

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,31 @@ const (
3030
efaK8sResourceName = "vpc.amazonaws.com/efa"
3131

3232
// hardware counter names
33-
counterRdmaReadBytes = "rdma_read_bytes"
34-
counterRdmaWriteBytes = "rdma_write_bytes"
35-
counterRdmaWriteRecvBytes = "rdma_write_recv_bytes"
36-
counterRxBytes = "rx_bytes"
37-
counterRxDrops = "rx_drops"
38-
counterTxBytes = "tx_bytes"
33+
counterRdmaReadBytes = "rdma_read_bytes"
34+
counterRdmaWriteBytes = "rdma_write_bytes"
35+
counterRdmaWriteRecvBytes = "rdma_write_recv_bytes"
36+
counterRxBytes = "rx_bytes"
37+
counterRxDrops = "rx_drops"
38+
counterTxBytes = "tx_bytes"
39+
counterRetransBytes = "retrans_bytes"
40+
counterRetransPkts = "retrans_pkts"
41+
counterRetransTimeoutEvents = "retrans_timeout_events"
42+
counterUnresponsiveRemoteEvents = "unresponsive_remote_events"
43+
counterImpairedRemoteConnEvents = "impaired_remote_conn_events"
3944
)
4045

4146
var counterNames = map[string]any{
42-
counterRdmaReadBytes: nil,
43-
counterRdmaWriteBytes: nil,
44-
counterRdmaWriteRecvBytes: nil,
45-
counterRxBytes: nil,
46-
counterRxDrops: nil,
47-
counterTxBytes: nil,
47+
counterRdmaReadBytes: nil,
48+
counterRdmaWriteBytes: nil,
49+
counterRdmaWriteRecvBytes: nil,
50+
counterRxBytes: nil,
51+
counterRxDrops: nil,
52+
counterTxBytes: nil,
53+
counterRetransBytes: nil,
54+
counterRetransPkts: nil,
55+
counterRetransTimeoutEvents: nil,
56+
counterUnresponsiveRemoteEvents: nil,
57+
counterImpairedRemoteConnEvents: nil,
4858
}
4959

5060
type Scraper struct {
@@ -98,12 +108,17 @@ type efaDeviceName string
98108
// /sys/class/infiniband/<Name>/ports/<Port>/hw_counters
99109
// for a single port of one Amazon Elastic Fabric Adapter device.
100110
type efaCounters struct {
101-
rdmaReadBytes uint64 // hw_counters/rdma_read_bytes
102-
rdmaWriteBytes uint64 // hw_counters/rdma_write_bytes
103-
rdmaWriteRecvBytes uint64 // hw_counters/rdma_write_recv_bytes
104-
rxBytes uint64 // hw_counters/rx_bytes
105-
rxDrops uint64 // hw_counters/rx_drops
106-
txBytes uint64 // hw_counters/tx_bytes
111+
rdmaReadBytes uint64 // hw_counters/rdma_read_bytes
112+
rdmaWriteBytes uint64 // hw_counters/rdma_write_bytes
113+
rdmaWriteRecvBytes uint64 // hw_counters/rdma_write_recv_bytes
114+
rxBytes uint64 // hw_counters/rx_bytes
115+
rxDrops uint64 // hw_counters/rx_drops
116+
txBytes uint64 // hw_counters/tx_bytes
117+
retransBytes uint64 // hw_counters/retrans_bytes
118+
retransPkts uint64 // hw_counters/retrans_pkts
119+
retransTimeoutEvents uint64 // hw_counters/retrans_timeout_events
120+
unresponsiveRemoteEvents uint64 // hw_counters/unresponsive_remote_events
121+
impairedRemoteConnEvents uint64 // hw_counters/impaired_remote_conn_events
107122
}
108123

109124
func NewEfaSyfsScraper(logger *zap.Logger, decorator stores.Decorator, podResourcesStore podResourcesStore, hostInfo hostInfoProvider) *Scraper {
@@ -163,12 +178,17 @@ func (s *Scraper) GetMetrics() []pmetric.Metrics {
163178
}
164179

165180
measurementValue := map[string]uint64{
166-
ci.EfaRdmaReadBytes: counters.rdmaReadBytes,
167-
ci.EfaRdmaWriteBytes: counters.rdmaWriteBytes,
168-
ci.EfaRdmaWriteRecvBytes: counters.rdmaWriteRecvBytes,
169-
ci.EfaRxBytes: counters.rxBytes,
170-
ci.EfaRxDropped: counters.rxDrops,
171-
ci.EfaTxBytes: counters.txBytes,
181+
ci.EfaRdmaReadBytes: counters.rdmaReadBytes,
182+
ci.EfaRdmaWriteBytes: counters.rdmaWriteBytes,
183+
ci.EfaRdmaWriteRecvBytes: counters.rdmaWriteRecvBytes,
184+
ci.EfaRxBytes: counters.rxBytes,
185+
ci.EfaRxDropped: counters.rxDrops,
186+
ci.EfaTxBytes: counters.txBytes,
187+
ci.EfaRetransBytes: counters.retransBytes,
188+
ci.EfaRetransPkts: counters.retransPkts,
189+
ci.EfaRetransTimeoutEvents: counters.retransTimeoutEvents,
190+
ci.EfaUnresponsiveRemoveEvents: counters.unresponsiveRemoteEvents,
191+
ci.EfaImpairedRemoteConnEvents: counters.impairedRemoteConnEvents,
172192
}
173193

174194
for measurement, value := range measurementValue {
@@ -349,6 +369,16 @@ func (s *Scraper) readCounters(deviceName efaDeviceName, port string, counters *
349369
counters.rxDrops += reader(counter)
350370
case counterTxBytes:
351371
counters.txBytes += reader(counter)
372+
case counterRetransBytes:
373+
counters.retransBytes += reader(counter)
374+
case counterRetransPkts:
375+
counters.retransPkts += reader(counter)
376+
case counterRetransTimeoutEvents:
377+
counters.retransTimeoutEvents += reader(counter)
378+
case counterUnresponsiveRemoteEvents:
379+
counters.unresponsiveRemoteEvents += reader(counter)
380+
case counterImpairedRemoteConnEvents:
381+
counters.impairedRemoteConnEvents += reader(counter)
352382
}
353383
}
354384

receiver/awscontainerinsightreceiver/internal/efa/efaSysfs_test.go

Lines changed: 88 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,17 @@ func (r mockSysfsReader) GetMACAddressFromDeviceName(deviceName efaDeviceName) (
6868
}
6969

7070
var mockCounterValues = map[string]uint64{
71-
counterRdmaReadBytes: 1,
72-
counterRdmaWriteBytes: 2,
73-
counterRdmaWriteRecvBytes: 3,
74-
counterRxBytes: 4,
75-
counterRxDrops: 5,
76-
counterTxBytes: 6,
71+
counterRdmaReadBytes: 1,
72+
counterRdmaWriteBytes: 2,
73+
counterRdmaWriteRecvBytes: 3,
74+
counterRxBytes: 4,
75+
counterRxDrops: 5,
76+
counterTxBytes: 6,
77+
counterRetransBytes: 7,
78+
counterRetransPkts: 8,
79+
counterRetransTimeoutEvents: 9,
80+
counterUnresponsiveRemoteEvents: 10,
81+
counterImpairedRemoteConnEvents: 11,
7782
}
7883

7984
func (r mockSysfsReader) ReadCounter(deviceName efaDeviceName, port string, counter string) (uint64, error) {
@@ -135,12 +140,17 @@ var mockHost = &mockHostInfo{
135140
var efa0Metrics = []expectation{
136141
{
137142
map[string]uint64{
138-
"node_efa_rdma_read_bytes": 2,
139-
"node_efa_rdma_write_bytes": 4,
140-
"node_efa_rdma_write_recv_bytes": 6,
141-
"node_efa_rx_bytes": 8,
142-
"node_efa_rx_dropped": 10,
143-
"node_efa_tx_bytes": 12,
143+
"node_efa_rdma_read_bytes": 2,
144+
"node_efa_rdma_write_bytes": 4,
145+
"node_efa_rdma_write_recv_bytes": 6,
146+
"node_efa_rx_bytes": 8,
147+
"node_efa_rx_dropped": 10,
148+
"node_efa_tx_bytes": 12,
149+
"node_efa_retrans_bytes": 14,
150+
"node_efa_retrans_pkts": 16,
151+
"node_efa_retrans_timeout_events": 18,
152+
"node_efa_unresponsive_remote_events": 20,
153+
"node_efa_impaired_remote_conn_events": 22,
144154
},
145155
map[string]string{
146156
ci.MetricType: ci.TypeNodeEFA,
@@ -151,12 +161,17 @@ var efa0Metrics = []expectation{
151161
},
152162
{
153163
map[string]uint64{
154-
"pod_efa_rdma_read_bytes": 2,
155-
"pod_efa_rdma_write_bytes": 4,
156-
"pod_efa_rdma_write_recv_bytes": 6,
157-
"pod_efa_rx_bytes": 8,
158-
"pod_efa_rx_dropped": 10,
159-
"pod_efa_tx_bytes": 12,
164+
"pod_efa_rdma_read_bytes": 2,
165+
"pod_efa_rdma_write_bytes": 4,
166+
"pod_efa_rdma_write_recv_bytes": 6,
167+
"pod_efa_rx_bytes": 8,
168+
"pod_efa_rx_dropped": 10,
169+
"pod_efa_tx_bytes": 12,
170+
"pod_efa_retrans_bytes": 14,
171+
"pod_efa_retrans_pkts": 16,
172+
"pod_efa_retrans_timeout_events": 18,
173+
"pod_efa_unresponsive_remote_events": 20,
174+
"pod_efa_impaired_remote_conn_events": 22,
160175
},
161176
map[string]string{
162177
ci.MetricType: ci.TypePodEFA,
@@ -170,12 +185,17 @@ var efa0Metrics = []expectation{
170185
},
171186
{
172187
map[string]uint64{
173-
"container_efa_rdma_read_bytes": 2,
174-
"container_efa_rdma_write_bytes": 4,
175-
"container_efa_rdma_write_recv_bytes": 6,
176-
"container_efa_rx_bytes": 8,
177-
"container_efa_rx_dropped": 10,
178-
"container_efa_tx_bytes": 12,
188+
"container_efa_rdma_read_bytes": 2,
189+
"container_efa_rdma_write_bytes": 4,
190+
"container_efa_rdma_write_recv_bytes": 6,
191+
"container_efa_rx_bytes": 8,
192+
"container_efa_rx_dropped": 10,
193+
"container_efa_tx_bytes": 12,
194+
"container_efa_retrans_bytes": 14,
195+
"container_efa_retrans_pkts": 16,
196+
"container_efa_retrans_timeout_events": 18,
197+
"container_efa_unresponsive_remote_events": 20,
198+
"container_efa_impaired_remote_conn_events": 22,
179199
},
180200
map[string]string{
181201
ci.MetricType: ci.TypeContainerEFA,
@@ -191,12 +211,17 @@ var efa0Metrics = []expectation{
191211

192212
var efa1NodeMetric = expectation{
193213
map[string]uint64{
194-
"node_efa_rdma_read_bytes": 2,
195-
"node_efa_rdma_write_bytes": 4,
196-
"node_efa_rdma_write_recv_bytes": 6,
197-
"node_efa_rx_bytes": 8,
198-
"node_efa_rx_dropped": 10,
199-
"node_efa_tx_bytes": 12,
214+
"node_efa_rdma_read_bytes": 2,
215+
"node_efa_rdma_write_bytes": 4,
216+
"node_efa_rdma_write_recv_bytes": 6,
217+
"node_efa_rx_bytes": 8,
218+
"node_efa_rx_dropped": 10,
219+
"node_efa_tx_bytes": 12,
220+
"node_efa_retrans_bytes": 14,
221+
"node_efa_retrans_pkts": 16,
222+
"node_efa_retrans_timeout_events": 18,
223+
"node_efa_unresponsive_remote_events": 20,
224+
"node_efa_impaired_remote_conn_events": 22,
200225
},
201226
map[string]string{
202227
ci.MetricType: ci.TypeNodeEFA,
@@ -209,12 +234,17 @@ var efa1NodeMetric = expectation{
209234
var efa1PodContainerMetrics = []expectation{
210235
{
211236
map[string]uint64{
212-
"pod_efa_rdma_read_bytes": 2,
213-
"pod_efa_rdma_write_bytes": 4,
214-
"pod_efa_rdma_write_recv_bytes": 6,
215-
"pod_efa_rx_bytes": 8,
216-
"pod_efa_rx_dropped": 10,
217-
"pod_efa_tx_bytes": 12,
237+
"pod_efa_rdma_read_bytes": 2,
238+
"pod_efa_rdma_write_bytes": 4,
239+
"pod_efa_rdma_write_recv_bytes": 6,
240+
"pod_efa_rx_bytes": 8,
241+
"pod_efa_rx_dropped": 10,
242+
"pod_efa_tx_bytes": 12,
243+
"pod_efa_retrans_bytes": 14,
244+
"pod_efa_retrans_pkts": 16,
245+
"pod_efa_retrans_timeout_events": 18,
246+
"pod_efa_unresponsive_remote_events": 20,
247+
"pod_efa_impaired_remote_conn_events": 22,
218248
},
219249
map[string]string{
220250
ci.MetricType: ci.TypePodEFA,
@@ -228,12 +258,17 @@ var efa1PodContainerMetrics = []expectation{
228258
},
229259
{
230260
map[string]uint64{
231-
"container_efa_rdma_read_bytes": 2,
232-
"container_efa_rdma_write_bytes": 4,
233-
"container_efa_rdma_write_recv_bytes": 6,
234-
"container_efa_rx_bytes": 8,
235-
"container_efa_rx_dropped": 10,
236-
"container_efa_tx_bytes": 12,
261+
"container_efa_rdma_read_bytes": 2,
262+
"container_efa_rdma_write_bytes": 4,
263+
"container_efa_rdma_write_recv_bytes": 6,
264+
"container_efa_rx_bytes": 8,
265+
"container_efa_rx_dropped": 10,
266+
"container_efa_tx_bytes": 12,
267+
"container_efa_retrans_bytes": 14,
268+
"container_efa_retrans_pkts": 16,
269+
"container_efa_retrans_timeout_events": 18,
270+
"container_efa_unresponsive_remote_events": 20,
271+
"container_efa_impaired_remote_conn_events": 22,
237272
},
238273
map[string]string{
239274
ci.MetricType: ci.TypeContainerEFA,
@@ -401,12 +436,17 @@ func TestScrape(t *testing.T) {
401436

402437
expectedCounters := efaCounters{
403438
// All values multiplied by 2 because we mock 2 ports
404-
rdmaReadBytes: 2,
405-
rdmaWriteBytes: 4,
406-
rdmaWriteRecvBytes: 6,
407-
rxBytes: 8,
408-
rxDrops: 10,
409-
txBytes: 12,
439+
rdmaReadBytes: 2,
440+
rdmaWriteBytes: 4,
441+
rdmaWriteRecvBytes: 6,
442+
rxBytes: 8,
443+
rxDrops: 10,
444+
txBytes: 12,
445+
retransBytes: 14,
446+
retransPkts: 16,
447+
retransTimeoutEvents: 18,
448+
unresponsiveRemoteEvents: 20,
449+
impairedRemoteConnEvents: 22,
410450
}
411451
expected := efaDevices{
412452
efaDevice{

0 commit comments

Comments
 (0)