Skip to content

Commit 5241b3d

Browse files
Add GPU and RDMA monitoring
1 parent b2f1d73 commit 5241b3d

File tree

5 files changed

+53
-5
lines changed

5 files changed

+53
-5
lines changed

autoscaling/tf_init/cluster-network-configuration.tf

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
4141
name = "Compute HPC RDMA Auto-Configuration"
4242
desired_state = plugins_config.value
4343
}
44-
44+
}
45+
dynamic plugins_config {
46+
for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
47+
content {
48+
name = "Compute RDMA GPU Monitoring"
49+
desired_state = plugins_config.value
50+
}
4551
}
4652
}
4753
dynamic "platform_config" {

autoscaling/tf_init/instance-pool-configuration.tf

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
1818
user_data = base64encode(data.template_file.config.rendered)
1919
}
2020
agent_config {
21-
is_management_disabled = true
21+
22+
are_all_plugins_disabled = false
23+
is_management_disabled = true
24+
is_monitoring_disabled = false
25+
26+
plugins_config {
27+
desired_state = "DISABLED"
28+
name = "OS Management Service Agent"
29+
}
30+
dynamic plugins_config {
31+
for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
32+
content {
33+
name = "Compute RDMA GPU Monitoring"
34+
desired_state = plugins_config.value
35+
}
2236
}
37+
}
2338
shape = var.instance_pool_shape
2439

2540
dynamic "shape_config" {

cluster-network-configuration.tf

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
4545
name = "Compute HPC RDMA Auto-Configuration"
4646
desired_state = plugins_config.value
4747
}
48-
48+
}
49+
dynamic plugins_config {
50+
for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
51+
content {
52+
name = "Compute RDMA GPU Monitoring"
53+
desired_state = plugins_config.value
54+
}
4955
}
5056
}
5157

compute-nodes.tf

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,13 @@ resource "oci_core_instance" "compute_cluster_instances" {
4848
name = "Compute HPC RDMA Auto-Configuration"
4949
desired_state = plugins_config.value
5050
}
51-
51+
}
52+
dynamic plugins_config {
53+
for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
54+
content {
55+
name = "Compute RDMA GPU Monitoring"
56+
desired_state = plugins_config.value
57+
}
5258
}
5359
}
5460

instance-pool-configuration.tf

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
2222
user_data = base64encode(data.template_file.config.rendered)
2323
}
2424
agent_config {
25-
is_management_disabled = true
25+
26+
are_all_plugins_disabled = false
27+
is_management_disabled = true
28+
is_monitoring_disabled = false
29+
30+
plugins_config {
31+
desired_state = "DISABLED"
32+
name = "OS Management Service Agent"
33+
}
34+
dynamic plugins_config {
35+
for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
36+
content {
37+
name = "Compute RDMA GPU Monitoring"
38+
desired_state = plugins_config.value
39+
}
2640
}
41+
}
2742
shape = var.instance_pool_shape
2843

2944
dynamic "shape_config" {

0 commit comments

Comments
 (0)