Skip to content

Commit 6bc1a97

Browse files
committed
Merge branch '2.10.3' into 2.10.3_an_ol8_fixes
2 parents 5768dce + c784965 commit 6bc1a97

19 files changed

+707
-703
lines changed

bastion.tf

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -486,18 +486,34 @@ data "oci_objectstorage_namespace" "compartment_namespace" {
486486
compartment_id = var.targetCompartment
487487
}
488488

489+
locals {
490+
rdma_nic_metric_bucket_name = "RDMA_NIC_metrics"
491+
par_path = ".."
492+
}
493+
/*
494+
Saves the pre-authenticated request (PAR) URL to the file ../PAR_file_for_metrics.
495+
This PAR is used by the scripts that upload NIC metrics to Object Storage (e.g. upload_rdma_nic_metrics.sh).
496+
*/
497+
498+
data "oci_objectstorage_bucket" "RDMA_NIC_Metrics_bucket_check" {
499+
name = local.rdma_nic_metric_bucket_name
500+
namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace
501+
}
502+
503+
489504
resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" {
490-
count = var.bastion_object_storage_par ? 1 : 0
505+
count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0
491506
compartment_id = var.targetCompartment
492-
name = "RDMA_NIC_metrics"
507+
name = local.rdma_nic_metric_bucket_name
493508
namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace
509+
versioning = "Enabled"
494510
}
495511

496512
resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" {
497-
count = var.bastion_object_storage_par ? 1 : 0
513+
count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0
498514
depends_on = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket]
499515
access_type = "AnyObjectWrite"
500-
bucket = oci_objectstorage_bucket.RDMA_NIC_metrics_bucket[0].name
516+
bucket = local.rdma_nic_metric_bucket_name
501517
name = format("%s-%s", "RDMA_NIC_metrics_bucket", var.tenancy_ocid)
502518
namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace
503519
time_expires = "2030-08-01T00:00:00+00:00"
@@ -506,18 +522,12 @@ resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" {
506522

507523
output "RDMA_NIC_metrics_url" {
508524
depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par]
509-
value = var.bastion_object_storage_par ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : ""
525+
value = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : ""
510526
}
511527

512-
locals {
513-
par_path = "/opt/oci-hpc"
514-
}
515-
/*
516-
saving the PAR into file: /opt/oci-hpc/PAR_file_for_metrics.
517-
this PAR is used by the scripts to upload NIC metrics to object storage (i.e. script: upload_rdma_nic_metrics.sh)
518-
*/
528+
519529
resource "local_file" "PAR" {
520-
count = var.bastion_object_storage_par ? 1 : 0
530+
count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0
521531
depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par]
522532
content = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}"
523533
filename = "${local.par_path}/PAR_file_for_metrics"

playbooks/roles/telegraf/tasks/common.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,23 @@
4747
- influxdb.conf
4848
- net.conf
4949
- ethtool_counters.conf
50+
- infiniband_mlx5_0_hw_counters.conf
51+
- infiniband_mlx5_1_hw_counters.conf
52+
- infiniband_mlx5_2_hw_counters.conf
53+
- infiniband_mlx5_3_hw_counters.conf
54+
- infiniband_mlx5_4_hw_counters.conf
55+
- infiniband_mlx5_5_hw_counters.conf
56+
- infiniband_mlx5_6_hw_counters.conf
57+
- infiniband_mlx5_7_hw_counters.conf
58+
- infiniband_mlx5_8_hw_counters.conf
59+
- infiniband_mlx5_9_hw_counters.conf
60+
- infiniband_mlx5_10_hw_counters.conf
61+
- infiniband_mlx5_11_hw_counters.conf
62+
- infiniband_mlx5_12_hw_counters.conf
63+
- infiniband_mlx5_13_hw_counters.conf
64+
- infiniband_mlx5_14_hw_counters.conf
65+
- infiniband_mlx5_15_hw_counters.conf
66+
- infiniband_mlx5_16_hw_counters.conf
5067
- name: restart telegraf
5168
become: true
5269
service:
Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
[[inputs.multifile]]
1+
[[inputs.multifile]]
22
name_override = "infiniband_mlx5_0_hw_counters"
33
base_dir = "/sys/class/infiniband"
44
interval = "300s"
@@ -10,41 +10,40 @@
1010

1111
[[inputs.multifile.file]]
1212
file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets"
13-
conversion = int"
13+
conversion = "int"
1414

1515
[[inputs.multifile.file]]
1616
file = "mlx5_0/ports/1/hw_counters/out_of_sequence"
17-
conversion = int"
17+
conversion = "int"
1818

1919
[[inputs.multifile.file]]
2020
file = "mlx5_0/ports/1/hw_counters/packet_seq_err"
21-
conversion = int"
21+
conversion = "int"
2222

2323
[[inputs.multifile.file]]
2424
file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err"
25-
conversion = int"
25+
conversion = "int"
2626

2727
[[inputs.multifile.file]]
2828
file = "mlx5_0/ports/1/hw_counters/roce_adp_retrans"
29-
conversion = int"
29+
conversion = "int"
3030

3131
[[inputs.multifile.file]]
3232
file = "mlx5_0/ports/1/hw_counters/np_cnp_sent"
33-
conversion = int"
33+
conversion = "int"
3434

3535
[[inputs.multifile.file]]
3636
file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled"
37-
conversion = int"
37+
conversion = "int"
3838

3939
[[inputs.multifile.file]]
4040
file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored"
41-
conversion = int"
41+
conversion = "int"
4242

4343
[[inputs.multifile.file]]
4444
file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated"
45-
conversion = int"
45+
conversion = "int"
4646

4747
[[inputs.multifile.file]]
4848
file = "mlx5_0/ports/1/hw_counters/roce_slow_restart"
4949
conversion = "int"
50-
Lines changed: 49 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,49 @@
1-
2-
3-
[[inputs.multifile]]
4-
name_override = "infiniband_mlx5_10_hw_counters"
5-
base_dir = "/sys/class/infiniband"
6-
interval = "300s"
7-
8-
[[inputs.multifile.tags]]
9-
device="mlx5_10"
10-
port="1"
11-
type="hw_counters"
12-
13-
[[inputs.multifile.file]]
14-
file = "mlx5_10/ports/1/hw_counters/np_ecn_marked_roce_packets"
15-
conversion = “int"
16-
17-
[[inputs.multifile.file]]
18-
file = "mlx5_10/ports/1/hw_counters/out_of_sequence"
19-
conversion = “int"
20-
21-
[[inputs.multifile.file]]
22-
file = "mlx5_10/ports/1/hw_counters/packet_seq_err"
23-
conversion = “int"
24-
25-
[[inputs.multifile.file]]
26-
file = "mlx5_10/ports/1/hw_counters/local_ack_timeout_err"
27-
conversion = “int"
28-
29-
[[inputs.multifile.file]]
30-
file = "mlx5_10/ports/1/hw_counters/roce_adp_retrans"
31-
conversion = “int"
32-
33-
[[inputs.multifile.file]]
34-
file = "mlx5_10/ports/1/hw_counters/np_cnp_sent"
35-
conversion = “int"
36-
37-
[[inputs.multifile.file]]
38-
file = "mlx5_10/ports/1/hw_counters/rp_cnp_handled"
39-
conversion = “int"
40-
41-
[[inputs.multifile.file]]
42-
file = "mlx5_10/ports/1/hw_counters/rp_cnp_ignored"
43-
conversion = “int"
44-
45-
[[inputs.multifile.file]]
46-
file = "mlx5_10/ports/1/hw_counters/rx_icrc_encapsulated"
47-
conversion = “int"
48-
49-
[[inputs.multifile.file]]
50-
file = "mlx5_10/ports/1/hw_counters/roce_slow_restart"
51-
conversion = "int"
1+
[[inputs.multifile]]
2+
name_override = "infiniband_mlx5_10_hw_counters"
3+
base_dir = "/sys/class/infiniband"
4+
interval = "300s"
5+
6+
[[inputs.multifile.tags]]
7+
device="mlx5_10"
8+
port="1"
9+
type="hw_counters"
10+
11+
[[inputs.multifile.file]]
12+
file = "mlx5_10/ports/1/hw_counters/np_ecn_marked_roce_packets"
13+
conversion = "int"
14+
15+
[[inputs.multifile.file]]
16+
file = "mlx5_10/ports/1/hw_counters/out_of_sequence"
17+
conversion = "int"
18+
19+
[[inputs.multifile.file]]
20+
file = "mlx5_10/ports/1/hw_counters/packet_seq_err"
21+
conversion = "int"
22+
23+
[[inputs.multifile.file]]
24+
file = "mlx5_10/ports/1/hw_counters/local_ack_timeout_err"
25+
conversion = "int"
26+
27+
[[inputs.multifile.file]]
28+
file = "mlx5_10/ports/1/hw_counters/roce_adp_retrans"
29+
conversion = "int"
30+
31+
[[inputs.multifile.file]]
32+
file = "mlx5_10/ports/1/hw_counters/np_cnp_sent"
33+
conversion = "int"
34+
35+
[[inputs.multifile.file]]
36+
file = "mlx5_10/ports/1/hw_counters/rp_cnp_handled"
37+
conversion = "int"
38+
39+
[[inputs.multifile.file]]
40+
file = "mlx5_10/ports/1/hw_counters/rp_cnp_ignored"
41+
conversion = "int"
42+
43+
[[inputs.multifile.file]]
44+
file = "mlx5_10/ports/1/hw_counters/rx_icrc_encapsulated"
45+
conversion = "int"
46+
47+
[[inputs.multifile.file]]
48+
file = "mlx5_10/ports/1/hw_counters/roce_slow_restart"
49+
conversion = "int"
Lines changed: 49 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,49 @@
1-
2-
3-
[[inputs.multifile]]
4-
name_override = "infiniband_mlx5_11_hw_counters"
5-
base_dir = "/sys/class/infiniband"
6-
interval = "300s"
7-
8-
[[inputs.multifile.tags]]
9-
device="mlx5_11"
10-
port="1"
11-
type="hw_counters"
12-
13-
[[inputs.multifile.file]]
14-
file = "mlx5_11/ports/1/hw_counters/np_ecn_marked_roce_packets"
15-
conversion = “int"
16-
17-
[[inputs.multifile.file]]
18-
file = "mlx5_11/ports/1/hw_counters/out_of_sequence"
19-
conversion = “int"
20-
21-
[[inputs.multifile.file]]
22-
file = "mlx5_11/ports/1/hw_counters/packet_seq_err"
23-
conversion = “int"
24-
25-
[[inputs.multifile.file]]
26-
file = "mlx5_11/ports/1/hw_counters/local_ack_timeout_err"
27-
conversion = “int"
28-
29-
[[inputs.multifile.file]]
30-
file = "mlx5_11/ports/1/hw_counters/roce_adp_retrans"
31-
conversion = “int"
32-
33-
[[inputs.multifile.file]]
34-
file = "mlx5_11/ports/1/hw_counters/np_cnp_sent"
35-
conversion = “int"
36-
37-
[[inputs.multifile.file]]
38-
file = "mlx5_11/ports/1/hw_counters/rp_cnp_handled"
39-
conversion = “int"
40-
41-
[[inputs.multifile.file]]
42-
file = "mlx5_11/ports/1/hw_counters/rp_cnp_ignored"
43-
conversion = “int"
44-
45-
[[inputs.multifile.file]]
46-
file = "mlx5_11/ports/1/hw_counters/rx_icrc_encapsulated"
47-
conversion = “int"
48-
49-
[[inputs.multifile.file]]
50-
file = "mlx5_11/ports/1/hw_counters/roce_slow_restart"
51-
conversion = "int"
1+
[[inputs.multifile]]
2+
name_override = "infiniband_mlx5_11_hw_counters"
3+
base_dir = "/sys/class/infiniband"
4+
interval = "60s"
5+
6+
[[inputs.multifile.tags]]
7+
device="mlx5_11"
8+
port="1"
9+
type="hw_counters"
10+
11+
[[inputs.multifile.file]]
12+
file = "mlx5_11/ports/1/hw_counters/np_ecn_marked_roce_packets"
13+
conversion = "int"
14+
15+
[[inputs.multifile.file]]
16+
file = "mlx5_11/ports/1/hw_counters/out_of_sequence"
17+
conversion = "int"
18+
19+
[[inputs.multifile.file]]
20+
file = "mlx5_11/ports/1/hw_counters/packet_seq_err"
21+
conversion = "int"
22+
23+
[[inputs.multifile.file]]
24+
file = "mlx5_11/ports/1/hw_counters/local_ack_timeout_err"
25+
conversion = "int"
26+
27+
[[inputs.multifile.file]]
28+
file = "mlx5_11/ports/1/hw_counters/roce_adp_retrans"
29+
conversion = "int"
30+
31+
[[inputs.multifile.file]]
32+
file = "mlx5_11/ports/1/hw_counters/np_cnp_sent"
33+
conversion = "int"
34+
35+
[[inputs.multifile.file]]
36+
file = "mlx5_11/ports/1/hw_counters/rp_cnp_handled"
37+
conversion = "int"
38+
39+
[[inputs.multifile.file]]
40+
file = "mlx5_11/ports/1/hw_counters/rp_cnp_ignored"
41+
conversion = "int"
42+
43+
[[inputs.multifile.file]]
44+
file = "mlx5_11/ports/1/hw_counters/rx_icrc_encapsulated"
45+
conversion = "int"
46+
47+
[[inputs.multifile.file]]
48+
file = "mlx5_11/ports/1/hw_counters/roce_slow_restart"
49+
conversion = "int"

0 commit comments

Comments
 (0)