Skip to content

Commit a771031

Browse files
PR Title: GPU-16 Export Metrics from oci-hpc stack to OCI
Summary: https://jira.oci.oraclecorp.com/browse/GPU-16 Significant Changes: -added terraform code to create a PAR and save the PAR into a file. (filename: bastion.tf) -added shell script to generate CSV file from influxDB and upload into object store (filename: bin/upload_rdma_nic_metrics.sh) Test Plan: * Using code changes, triggered hpc-stack creation. Observed that terraform code created the PAR and saved it into a file. As I didn't add the needed policies before the stack, stack apply was not successful, which is expected. * In another GPU test setup, from the bastion, I was able to run the shell script to upload the CSV file into the object store using the above PAR.
1 parent 2ea5c41 commit a771031

File tree

6 files changed

+228
-3
lines changed

6 files changed

+228
-3
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,3 +393,20 @@ The nvidia bug report, sosreport, and console history logs for compute-permanent
393393
Where hostlist had the below contents
394394
compute-permanent-node-467
395395
compute-permanent-node-787
396+
397+
398+
## Collect RDMA NIC Metrics and Upload to Object Storage
399+
400+
OCI-HPC is deployed in the customer tenancy, so OCI service teams cannot access metrics from these OCI-HPC stack clusters. To overcome this issue, in this release,
401+
we introduce a feature to collect RDMA NIC Metrics and upload those metrics to Object Storage. Later on, that Object Storage URL could be shared with OCI service
402+
teams. Using that URL, OCI service teams can access the metrics and use them for debugging purposes.
403+
404+
To collect RDMA NIC Metrics and upload those to Object Storage, the user needs to follow these steps:
405+
406+
Step 1: Create a PAR (PreAuthenticated Request)
407+
For creating a PAR, user needs to select check-box "Create Object Storage PAR" during Resource Manager's stack creation.
408+
By default, this check-box is enabled. When it is selected, a PAR will be created.
409+
410+
Step 2: Use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage.
411+
User needs to use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage. User could configure metrics
412+
collection limit and interval through config file: rdma_metrics_collection_config.conf.

bastion.tf

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,7 @@ resource "null_resource" "cluster" {
292292
}
293293
}
294294

295+
295296
provisioner "file" {
296297
content = var.node_count > 0 ? join("\n",local.cluster_instances_ips) : "\n"
297298
destination = "/tmp/hosts"
@@ -479,4 +480,46 @@ provisioner "file" {
479480
private_key = tls_private_key.ssh.private_key_pem
480481
}
481482
}
482-
}
483+
}
484+
485+
# Object Storage namespace of the target compartment; referenced by the
# metrics bucket and PAR resources below.
data "oci_objectstorage_namespace" "compartment_namespace" {
  compartment_id = var.targetCompartment
}

# Bucket that receives the RDMA NIC metrics archives uploaded by
# bin/upload_rdma_nic_metrics.sh. Created only when the user opted in via
# the "Create Object Storage PAR" check-box (bastion_object_storage_par).
resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" {
  count          = var.bastion_object_storage_par ? 1 : 0
  compartment_id = var.targetCompartment
  name           = "RDMA_NIC_metrics"
  namespace      = data.oci_objectstorage_namespace.compartment_namespace.namespace
}

# Write-only (AnyObjectWrite) PAR scoped to the metrics bucket. The tenancy
# OCID in the PAR name keeps uploads identifiable on the service side.
# NOTE(review): time_expires is hard-coded — uploads will silently start
# failing after 2030-08-01; consider making the expiry configurable.
resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" {
  count        = var.bastion_object_storage_par ? 1 : 0
  depends_on   = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket]
  access_type  = "AnyObjectWrite"
  bucket       = oci_objectstorage_bucket.RDMA_NIC_metrics_bucket[0].name
  name         = format("%s-%s", "RDMA_NIC_metrics_bucket", var.tenancy_ocid)
  namespace    = data.oci_objectstorage_namespace.compartment_namespace.namespace
  time_expires = "2030-08-01T00:00:00+00:00"
}

# Full upload URL built from the regional endpoint plus the PAR access URI;
# empty string when the PAR was not requested.
output "RDMA_NIC_metrics_url" {
  depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par]
  value      = var.bastion_object_storage_par ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : ""
}

locals {
  # Directory on the bastion where the PAR URL file is written.
  par_path = "/opt/oci-hpc"
}

/*
saving the PAR into file: /opt/oci-hpc/PAR_file_for_metrics.
this PAR is used by the scripts to upload NIC metrics to object storage (i.e. script: upload_rdma_nic_metrics.sh)
*/
resource "local_file" "PAR" {
  count      = var.bastion_object_storage_par ? 1 : 0
  depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par]
  content    = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}"
  filename   = "${local.par_path}/PAR_file_for_metrics"
}
525+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
hoursAgoFromNow=24
2+
metricsCollectionIntervalInMinute=5
3+
parFileName=/opt/oci-hpc/PAR_file_for_metrics

bin/upload_rdma_nic_metrics.sh

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/bin/bash

# Collect RDMA NIC metrics from the local InfluxDB ('telegraf' database) and
# upload them to OCI Object Storage using a pre-created PAR (Pre-Authenticated
# Request). Collection window, interval, and the PAR file path come from
# rdma_metrics_collection_config.conf located next to this script.

scripts=$(realpath "$0")
folder=$(dirname "$scripts")

source "${folder}/rdma_metrics_collection_config.conf"
hours="$hoursAgoFromNow"
interval="$metricsCollectionIntervalInMinute"
par_filename="$parFileName"

# Without a PAR there is nowhere to upload; fail fast with a non-zero status
# so callers (cron, automation) can detect the misconfiguration.
if [ -z "$par_filename" ]
then
  echo "Please create a PAR and save into a file. Then, in config file, set the path of PAR-file to parFileName" >&2
  exit 1
fi

if [ ! -f "${par_filename}" ]
then
  echo "PAR file:${par_filename} does not exist. Please create PAR file and update the config file" >&2
  exit 1
fi
23+
# Print usage information for this script to stdout.
# Note: the interval option is -i (matches the getopts spec "l:i:h");
# the original help text wrongly listed it as "n".
dis_help()
{
  echo
  echo "Usage:"
  echo
  echo "./upload_rdma_nic_metrics.sh -l <limit:hours ago from now> -i <Metrics Interval>"
  echo
  echo "Options:"
  echo "l Hours Ago From Now (optional)"
  echo "i Metrics Collection Interval In Minute (optional)"
  echo "h Print this help."
  echo
  echo "RDMA metrics are uploaded to Object Storage using PAR"
  echo
  echo "e.g., sh ./upload_rdma_nic_metrics.sh -l 24 -i 5 "
  echo
  echo "Supported releases: 2.10.3+"
  echo
}
43+
# Parse command-line overrides for the config-file defaults.
# -l overrides hoursAgoFromNow, -i overrides metricsCollectionIntervalInMinute,
# -h prints help and exits.
if [ "$#" -gt "0" ]
then
  while getopts "l:i:h" option
  do
    case $option in
      l) hours=${OPTARG};;
      i) interval=${OPTARG};;
      h) dis_help
         exit;;
      \?) # Invalid option: report on stderr and fail with non-zero status
          # so automation does not mistake a typo for a successful run.
         echo "Error: Invalid option" >&2
         exit 1;;
    esac
  done
fi
60+
monitoring_folder="${folder}/../monitoring"

# Metrics exist only on clusters where the monitoring stack was activated.
if [ -f "${monitoring_folder}/activated" ]
then
  timestamp=$(date +%s)
  # Read the PAR URL once; every upload below appends the object name to it.
  # (The original read it inside the device loop only, leaving $par unset for
  # the infiniband section when no device uploaded anything.)
  par=$(cat "${par_filename}")

  for i in {0..16}
  do
    measurementname="infiniband_mlx5_${i}_hw_counters"
    measurementnameBackup="infiniband_mlx5_${i}_hw_counters_backup"
    echo "Checking device mlx5_${i} for RDMA HW metrics...."
    # Downsample the raw counters into a temporary backup measurement:
    # one MEAN() sample per ${interval} minutes over the last ${hours} hours.
    query="SELECT MEAN(*) INTO ${measurementnameBackup} FROM ${measurementname} WHERE time < now() AND time > now() - ${hours}h GROUP BY time(${interval}m)"
    rows=$(influx -database 'telegraf' -execute "${query}" -format json | jq '.results[0].series[0].values[0][1]')

    # jq prints "null" when the source measurement has no series at all;
    # guard it before the numeric comparison.
    if [ -z "${rows}" ] || [ "${rows}" = "null" ] || [ "${rows}" -eq 0 ]; then
      echo "Device mlx5_${i} does not have metrics to collect"
      echo "......................................................"
      continue
    fi

    filename="infiniband_mlx5_${i}_${timestamp}"
    filename_csv="${filename}.csv"
    filename_zip="${filename}.zip"
    filename_csv_path="${folder}/${filename_csv}"
    filename_zip_path="${folder}/${filename_zip}"

    echo "Collecting RDMA HW metrics of device mlx5_${i}...."
    query="SELECT * FROM ${measurementnameBackup}"
    # Write next to the script (not the cwd) so the existence checks below
    # look at the same path the file was written to.
    influx -database 'telegraf' -execute "${query}" -format csv > "${filename_csv_path}"
    if [ ! -f "${filename_csv_path}" ]
    then
      echo "ERROR:${filename_csv_path} was not created."
      continue
    fi

    # -j stores only the basename in the archive, matching the object name.
    zip -j "${filename_zip_path}" "${filename_csv_path}"
    rm "${filename_csv_path}"
    if [ ! -f "${filename_zip_path}" ]
    then
      echo "ERROR:${filename_zip_path} was not created."
      continue
    fi

    echo "Uploading RDMA HW Metrics to Object Storage for device mlx5_${i}"
    curl -X PUT --data-binary @"${filename_zip_path}" "${par}${filename_zip}"
    echo "Uploaded RDMA HW metrics to Object Storage for device mlx5_${i}"
    echo "Object storage URL for device mlx5_${i}: ${par}${filename_zip}"

    # Drop the temporary backup measurement so re-runs start clean.
    sqldelete="DELETE FROM ${measurementnameBackup}"
    influx -database 'telegraf' -execute "${sqldelete}"
    echo "......................................................"
  done

  # Same flow for the aggregate "infiniband" counter measurement.
  measurementname="infiniband"
  measurementnameBackup="infiniband_backup"

  echo "Checking for Infiniband counter metrics...."
  query="SELECT MEAN(*) INTO ${measurementnameBackup} FROM ${measurementname} WHERE time < now() AND time > now() - ${hours}h GROUP BY time(${interval}m)"
  rows=$(influx -database 'telegraf' -execute "${query}" -format json | jq '.results[0].series[0].values[0][1]')

  # Structured if/else replaces the original 'continue' statements, which are
  # invalid outside a loop and aborted this section with a bash error.
  if [ -z "${rows}" ] || [ "${rows}" = "null" ] || [ "${rows}" -eq 0 ]; then
    echo "It does not have Infiniband counter metrics to collect"
  else
    filename="infiniband_${timestamp}"
    filename_csv="${filename}.csv"
    filename_zip="${filename}.zip"
    filename_csv_path="${folder}/${filename_csv}"
    filename_zip_path="${folder}/${filename_zip}"

    echo "Collecting Infiniband counter metrics...."
    query="SELECT * FROM ${measurementnameBackup}"
    influx -database 'telegraf' -execute "${query}" -format csv > "${filename_csv_path}"
    if [ ! -f "${filename_csv_path}" ]
    then
      echo "ERROR:${filename_csv_path} was not created."
    else
      zip -j "${filename_zip_path}" "${filename_csv_path}"
      rm "${filename_csv_path}"
      if [ ! -f "${filename_zip_path}" ]
      then
        echo "ERROR:${filename_zip_path} was not created."
      else
        echo "Uploading Infiniband counter metrics to Object Storage"
        curl -X PUT --data-binary @"${filename_zip_path}" "${par}${filename_zip}"
        echo "Uploaded Infiniband counter metrics to Object Storage"
        echo "Object storage URL for Infiniband counter metrics: ${par}${filename_zip}"
      fi

      sqldelete="DELETE FROM ${measurementnameBackup}"
      influx -database 'telegraf' -execute "${sqldelete}"
    fi
  fi
fi

schema.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ variableGroups:
3535
- ${bastion_memory}
3636
- ${bastion_boot_volume_size}
3737
- ${bastion_boot_volume_backup}
38+
- ${bastion_object_storage_par}
3839
- title: "Compute node options"
3940
variables:
4041
- ${use_multiple_ads}
@@ -366,7 +367,12 @@ variables:
366367
- and:
367368
- ${bastion_custom_memory}
368369
required: true
369-
370+
bastion_object_storage_par:
371+
title: Create Object Storage PAR
372+
description: "Create a PAR (i.e. Pre-Authenticated Request), so that user could use that PAR to upload monitoring metrics
373+
to Object Storage and share the URL with OCI service teams."
374+
type: boolean
375+
default: true
370376
use_standard_image:
371377
type: boolean
372378
title: "use standard bastion image"
@@ -1605,4 +1611,4 @@ variables:
16051611
- ${use_marketplace_image_login}
16061612
- ${use_old_marketplace_image_login}
16071613
- not:
1608-
- ${use_standard_image_login}
1614+
- ${use_standard_image_login}

variables.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ variable "use_custom_name" { default = false }
1515
variable "cluster_name" { default = "" }
1616
variable "bastion_ad" {}
1717
variable "bastion_shape" { default = "VM.Standard2.4" }
18+
variable "bastion_object_storage_par" { default = true }
1819
variable "use_standard_image" { default= true }
1920
variable "use_standard_image_login" { default= true }
2021
variable "custom_bastion_image" {

0 commit comments

Comments
 (0)