Skip to content

Commit 1392172

Browse files
Merge pull request #131 from oci-hpc/2.10.3_more_counters
Publish NIC metrics to influxDB
2 parents 0c8a4c7 + 9b96bdc commit 1392172

19 files changed

+840
-3
lines changed

playbooks/roles/telegraf/tasks/common.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,24 @@
4646
- infiniband.conf
4747
- influxdb.conf
4848
- net.conf
49-
- infiniband_hw_counters.conf
49+
- ethtool_counters.conf
50+
- infiniband_mlx5_0_hw_counters.conf
51+
- infiniband_mlx5_1_hw_counters.conf
52+
- infiniband_mlx5_2_hw_counters.conf
53+
- infiniband_mlx5_3_hw_counters.conf
54+
- infiniband_mlx5_4_hw_counters.conf
55+
- infiniband_mlx5_5_hw_counters.conf
56+
- infiniband_mlx5_6_hw_counters.conf
57+
- infiniband_mlx5_7_hw_counters.conf
58+
- infiniband_mlx5_8_hw_counters.conf
59+
- infiniband_mlx5_9_hw_counters.conf
60+
- infiniband_mlx5_10_hw_counters.conf
61+
- infiniband_mlx5_11_hw_counters.conf
62+
- infiniband_mlx5_12_hw_counters.conf
63+
- infiniband_mlx5_13_hw_counters.conf
64+
- infiniband_mlx5_14_hw_counters.conf
65+
- infiniband_mlx5_15_hw_counters.conf
66+
- infiniband_mlx5_16_hw_counters.conf
5067
- name: restart telegraf
5168
become: true
5269
service:
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Returns ethtool statistics for given interfaces
2+
[[inputs.ethtool]]
3+
4+
interface_include = ["rdma*","enp*"]
5+
interval = "300s"
6+
fieldpass = ["tx_pci_signal_integrity","rx_steer_missed_packets","rx_vport_multicast_bytes","rx_vport_rdma_unicast_packets",
7+
"rx_vport_rdma_unicast_bytes","tx_vport_rdma_unicast_packets","tx_vport_rdma_unicast_bytes","tx_packets_phy","rx_packets_phy",
8+
"tx_bytes_phy","rx_bytes_phy", "rx_multicast_phy","rx_65_to_127_bytes_phy","rx_2048_to_4095_bytes_phy", "rx_4096_to_8191_bytes_phy",
9+
"rx_crc_errors_phy","rx_symbol_err_phy", "rx_discards_phy","tx_discards_phy","tx_errors_phy","rx_64_bytes_phy","link_down_events_phy",
10+
"rx_out_of_buffer","module_bus_stuck","module_high_temp","rx_buffer_passed_thres_phy","tx_pause_storm_warning_events","tx_pause_storm_error_events",
11+
"rx_pcs_symbol_err_phy","rx_pci_signal_integrity","rx_prio0_bytes","rx_prio0_packets","tx_prio0_bytes",
12+
"tx_prio0_packets","rx_prio0_buf_discard","rx_prio0_cong_discard","rx_prio0_marked","outbound_pci_buffer_overflow"]
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
[[inputs.multifile]]
2+
name_override = "infiniband_mlx5_0_hw_counters"
3+
base_dir = "/sys/class/infiniband"
4+
interval = "300s"
5+
6+
[inputs.multifile.tags]
7+
device="mlx5_0"
8+
port="1"
9+
type="hw_counters"
10+
11+
[[inputs.multifile.file]]
12+
file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
13+
conversion = "int"
14+
15+
[[inputs.multifile.file]]
16+
file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
17+
conversion = "int"
18+
19+
[[inputs.multifile.file]]
20+
file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
21+
conversion = "int"
22+
23+
[[inputs.multifile.file]]
24+
file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
25+
conversion = "int"
26+
27+
[[inputs.multifile.file]]
28+
file = "mlx5_0/ports/1/hw_counters/roce_adp_retrans"
29+
conversion = "int"
30+
31+
[[inputs.multifile.file]]
32+
file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
33+
conversion = "int"
34+
35+
[[inputs.multifile.file]]
36+
file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
37+
conversion = "int"
38+
39+
[[inputs.multifile.file]]
40+
file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
41+
conversion = "int"
42+
43+
[[inputs.multifile.file]]
44+
file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
45+
conversion = "int"
46+
47+
[[inputs.multifile.file]]
48+
file = "mlx5_0/ports/1/hw_counters/roce_slow_restart"
49+
conversion = "int"
50+
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
2+
3+
[[inputs.multifile]]
4+
name_override = "infiniband_mlx5_10_hw_counters"
5+
base_dir = "/sys/class/infiniband"
6+
interval = "300s"
7+
8+
[inputs.multifile.tags]
9+
device="mlx5_10"
10+
port="1"
11+
type="hw_counters"
12+
13+
[[inputs.multifile.file]]
14+
file = "mlx5_10/ports/1/hw_counters/np_ecn_marked_roce_packets"
15+
conversion = "int"
16+
17+
[[inputs.multifile.file]]
18+
file = "mlx5_10/ports/1/hw_counters/out_of_sequence"
19+
conversion = "int"
20+
21+
[[inputs.multifile.file]]
22+
file = "mlx5_10/ports/1/hw_counters/packet_seq_err"
23+
conversion = "int"
24+
25+
[[inputs.multifile.file]]
26+
file = "mlx5_10/ports/1/hw_counters/local_ack_timeout_err"
27+
conversion = "int"
28+
29+
[[inputs.multifile.file]]
30+
file = "mlx5_10/ports/1/hw_counters/roce_adp_retrans"
31+
conversion = "int"
32+
33+
[[inputs.multifile.file]]
34+
file = "mlx5_10/ports/1/hw_counters/np_cnp_sent"
35+
conversion = "int"
36+
37+
[[inputs.multifile.file]]
38+
file = "mlx5_10/ports/1/hw_counters/rp_cnp_handled"
39+
conversion = "int"
40+
41+
[[inputs.multifile.file]]
42+
file = "mlx5_10/ports/1/hw_counters/rp_cnp_ignored"
43+
conversion = "int"
44+
45+
[[inputs.multifile.file]]
46+
file = "mlx5_10/ports/1/hw_counters/rx_icrc_encapsulated"
47+
conversion = "int"
48+
49+
[[inputs.multifile.file]]
50+
file = "mlx5_10/ports/1/hw_counters/roce_slow_restart"
51+
conversion = "int"
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
2+
3+
[[inputs.multifile]]
4+
name_override = "infiniband_mlx5_11_hw_counters"
5+
base_dir = "/sys/class/infiniband"
6+
interval = "300s"
7+
8+
[inputs.multifile.tags]
9+
device="mlx5_11"
10+
port="1"
11+
type="hw_counters"
12+
13+
[[inputs.multifile.file]]
14+
file = "mlx5_11/ports/1/hw_counters/np_ecn_marked_roce_packets"
15+
conversion = "int"
16+
17+
[[inputs.multifile.file]]
18+
file = "mlx5_11/ports/1/hw_counters/out_of_sequence"
19+
conversion = "int"
20+
21+
[[inputs.multifile.file]]
22+
file = "mlx5_11/ports/1/hw_counters/packet_seq_err"
23+
conversion = "int"
24+
25+
[[inputs.multifile.file]]
26+
file = "mlx5_11/ports/1/hw_counters/local_ack_timeout_err"
27+
conversion = "int"
28+
29+
[[inputs.multifile.file]]
30+
file = "mlx5_11/ports/1/hw_counters/roce_adp_retrans"
31+
conversion = "int"
32+
33+
[[inputs.multifile.file]]
34+
file = "mlx5_11/ports/1/hw_counters/np_cnp_sent"
35+
conversion = "int"
36+
37+
[[inputs.multifile.file]]
38+
file = "mlx5_11/ports/1/hw_counters/rp_cnp_handled"
39+
conversion = "int"
40+
41+
[[inputs.multifile.file]]
42+
file = "mlx5_11/ports/1/hw_counters/rp_cnp_ignored"
43+
conversion = "int"
44+
45+
[[inputs.multifile.file]]
46+
file = "mlx5_11/ports/1/hw_counters/rx_icrc_encapsulated"
47+
conversion = "int"
48+
49+
[[inputs.multifile.file]]
50+
file = "mlx5_11/ports/1/hw_counters/roce_slow_restart"
51+
conversion = "int"
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
2+
3+
[[inputs.multifile]]
4+
name_override = "infiniband_mlx5_12_hw_counters"
5+
base_dir = "/sys/class/infiniband"
6+
interval = "300s"
7+
8+
[inputs.multifile.tags]
9+
device="mlx5_12"
10+
port="1"
11+
type="hw_counters"
12+
13+
[[inputs.multifile.file]]
14+
file = "mlx5_12/ports/1/hw_counters/np_ecn_marked_roce_packets"
15+
conversion = "int"
16+
17+
[[inputs.multifile.file]]
18+
file = "mlx5_12/ports/1/hw_counters/out_of_sequence"
19+
conversion = "int"
20+
21+
[[inputs.multifile.file]]
22+
file = "mlx5_12/ports/1/hw_counters/packet_seq_err"
23+
conversion = "int"
24+
25+
[[inputs.multifile.file]]
26+
file = "mlx5_12/ports/1/hw_counters/local_ack_timeout_err"
27+
conversion = "int"
28+
29+
[[inputs.multifile.file]]
30+
file = "mlx5_12/ports/1/hw_counters/roce_adp_retrans"
31+
conversion = "int"
32+
33+
[[inputs.multifile.file]]
34+
file = "mlx5_12/ports/1/hw_counters/np_cnp_sent"
35+
conversion = "int"
36+
37+
[[inputs.multifile.file]]
38+
file = "mlx5_12/ports/1/hw_counters/rp_cnp_handled"
39+
conversion = "int"
40+
41+
[[inputs.multifile.file]]
42+
file = "mlx5_12/ports/1/hw_counters/rp_cnp_ignored"
43+
conversion = "int"
44+
45+
[[inputs.multifile.file]]
46+
file = "mlx5_12/ports/1/hw_counters/rx_icrc_encapsulated"
47+
conversion = "int"
48+
49+
[[inputs.multifile.file]]
50+
file = "mlx5_12/ports/1/hw_counters/roce_slow_restart"
51+
conversion = "int"
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
[[inputs.multifile]]
3+
name_override = "infiniband_mlx5_13_hw_counters"
4+
base_dir = "/sys/class/infiniband"
5+
interval = "300s"
6+
7+
[inputs.multifile.tags]
8+
device="mlx5_13"
9+
port="1"
10+
type="hw_counters"
11+
12+
[[inputs.multifile.file]]
13+
file = "mlx5_13/ports/1/hw_counters/np_ecn_marked_roce_packets"
14+
conversion = "int"
15+
16+
[[inputs.multifile.file]]
17+
file = "mlx5_13/ports/1/hw_counters/out_of_sequence"
18+
conversion = "int"
19+
20+
[[inputs.multifile.file]]
21+
file = "mlx5_13/ports/1/hw_counters/packet_seq_err"
22+
conversion = "int"
23+
24+
[[inputs.multifile.file]]
25+
file = "mlx5_13/ports/1/hw_counters/local_ack_timeout_err"
26+
conversion = "int"
27+
28+
[[inputs.multifile.file]]
29+
file = "mlx5_13/ports/1/hw_counters/roce_adp_retrans"
30+
conversion = "int"
31+
32+
[[inputs.multifile.file]]
33+
file = "mlx5_13/ports/1/hw_counters/np_cnp_sent"
34+
conversion = "int"
35+
36+
[[inputs.multifile.file]]
37+
file = "mlx5_13/ports/1/hw_counters/rp_cnp_handled"
38+
conversion = "int"
39+
40+
[[inputs.multifile.file]]
41+
file = "mlx5_13/ports/1/hw_counters/rp_cnp_ignored"
42+
conversion = "int"
43+
44+
[[inputs.multifile.file]]
45+
file = "mlx5_13/ports/1/hw_counters/rx_icrc_encapsulated"
46+
conversion = "int"
47+
48+
[[inputs.multifile.file]]
49+
file = "mlx5_13/ports/1/hw_counters/roce_slow_restart"
50+
conversion = "int"
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
[[inputs.multifile]]
3+
name_override = "infiniband_mlx5_14_hw_counters"
4+
base_dir = "/sys/class/infiniband"
5+
interval = "300s"
6+
7+
[inputs.multifile.tags]
8+
device="mlx5_14"
9+
port="1"
10+
type="hw_counters"
11+
12+
[[inputs.multifile.file]]
13+
file = "mlx5_14/ports/1/hw_counters/np_ecn_marked_roce_packets"
14+
conversion = "int"
15+
16+
[[inputs.multifile.file]]
17+
file = "mlx5_14/ports/1/hw_counters/out_of_sequence"
18+
conversion = "int"
19+
20+
[[inputs.multifile.file]]
21+
file = "mlx5_14/ports/1/hw_counters/packet_seq_err"
22+
conversion = "int"
23+
24+
[[inputs.multifile.file]]
25+
file = "mlx5_14/ports/1/hw_counters/local_ack_timeout_err"
26+
conversion = "int"
27+
28+
[[inputs.multifile.file]]
29+
file = "mlx5_14/ports/1/hw_counters/roce_adp_retrans"
30+
conversion = "int"
31+
32+
[[inputs.multifile.file]]
33+
file = "mlx5_14/ports/1/hw_counters/np_cnp_sent"
34+
conversion = "int"
35+
36+
[[inputs.multifile.file]]
37+
file = "mlx5_14/ports/1/hw_counters/rp_cnp_handled"
38+
conversion = "int"
39+
40+
[[inputs.multifile.file]]
41+
file = "mlx5_14/ports/1/hw_counters/rp_cnp_ignored"
42+
conversion = "int"
43+
44+
[[inputs.multifile.file]]
45+
file = "mlx5_14/ports/1/hw_counters/rx_icrc_encapsulated"
46+
conversion = "int"
47+
48+
[[inputs.multifile.file]]
49+
file = "mlx5_14/ports/1/hw_counters/roce_slow_restart"
50+
conversion = "int"

0 commit comments

Comments
 (0)