Skip to content

Commit 9e3772a

Browse files
Merge pull request #50 from oracle-quickstart/add-amd-to-main-readme
Update stack to add AMD RCCL tests
2 parents 55dd971 + 041549f commit 9e3772a

File tree

8 files changed

+154
-82
lines changed

8 files changed

+154
-82
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,11 @@ kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke
176176
kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/nccl-tests/BM.GPU.B4.8-nccl-test.yaml
177177
```
178178

179+
##### BM.GPU.MI300X.8
180+
```
181+
kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/rccl-tests/BM.GPU.MI300X.8.yaml
182+
```
183+
179184
The initial pull of the container image will take a long time. Once the master pod `nccl-allreduce-job0-mpimaster-0` starts running, you can check its logs for the NCCL test result.
180185

181186
```sh
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
apiVersion: batch.volcano.sh/v1alpha1
2+
kind: Job
3+
metadata:
4+
annotations:
5+
name: rccl-tests-job0
6+
spec:
7+
minAvailable: 0
8+
plugins:
9+
ssh: []
10+
svc: []
11+
queue: default
12+
schedulerName: volcano
13+
tasks:
14+
- name: mpimaster
15+
policies:
16+
- action: CompleteJob
17+
event: TaskCompleted
18+
replicas: 1
19+
template:
20+
metadata:
21+
spec:
22+
containers:
23+
- command:
24+
- /bin/bash
25+
- -c
26+
- |
27+
sysctl --system
28+
NUM_GPUS=8
29+
NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host)
30+
NP=$(($NUM_HOSTS*$NUM_GPUS))
31+
mpirun --allow-run-as-root \
32+
-mca plm_rsh_args "-p 2222" \
33+
--bind-to numa \
34+
--mca oob_tcp_if_exclude docker,lo \
35+
--mca btl ^openib \
36+
-x NCCL_DEBUG=VERSION \
37+
-x NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9 \
38+
-x NCCL_SOCKET_IFNAME=eth0 \
39+
-x NCCL_IB_TC=41 \
40+
-x NCCL_IB_SL=0 \
41+
-x NCCL_IB_GID_INDEX=3 \
42+
-x NCCL_IB_QPS=2 \
43+
-x NCCL_IB_SPLIT_DATA_ON_QPS=4 \
44+
-x NCCL_ALGO=Ring \
45+
-hostfile /etc/volcano/mpiworker.host \
46+
-N 8 -np $NP \
47+
/workspace/rccl-tests/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1
48+
ports:
49+
- { name: mpijob-port, containerPort: 2222, protocol: TCP }
50+
image: iad.ocir.io/hpc_limited_availability/oke/rccl-tests:rocm-6.2.1-ofed-5.9-0.5.6.0.127
51+
imagePullPolicy: Always
52+
name: mpimaster
53+
resources:
54+
limits:
55+
ephemeral-storage: 32Gi
56+
requests:
57+
cpu: 8
58+
ephemeral-storage: 32Gi
59+
memory: 2Gi
60+
securityContext:
61+
privileged: true
62+
capabilities:
63+
add: [IPC_LOCK, SYS_PTRACE]
64+
volumeMounts:
65+
- { mountPath: /dev/shm, name: shm }
66+
workingDir: /workspace
67+
dnsPolicy: ClusterFirstWithHostNet
68+
hostNetwork: true
69+
restartPolicy: OnFailure
70+
terminationGracePeriodSeconds: 2
71+
volumes:
72+
- { name: shm, emptyDir: { medium: Memory, sizeLimit: 128Gi }}
73+
- minAvailable: 0
74+
name: mpiworker
75+
replicas: 2
76+
template:
77+
spec:
78+
containers:
79+
- command:
80+
- /bin/bash
81+
- -c
82+
- sysctl --system; mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222
83+
ports:
84+
- { name: mpijob-port, containerPort: 2222, protocol: TCP }
85+
image: iad.ocir.io/hpc_limited_availability/oke/rccl-tests:rocm-6.2.1-ofed-5.9-0.5.6.0.127
86+
imagePullPolicy: Always
87+
name: mpiworker
88+
resources:
89+
limits:
90+
ephemeral-storage: 32Gi
91+
amd.com/gpu: 8
92+
requests:
93+
cpu: 200
94+
ephemeral-storage: 32Gi
95+
memory: 1024Gi
96+
amd.com/gpu: 8
97+
securityContext:
98+
privileged: true
99+
capabilities:
100+
add: [IPC_LOCK, SYS_PTRACE]
101+
volumeMounts:
102+
- { mountPath: /dev/shm, name: shm }
103+
workingDir: /workspace
104+
dnsPolicy: ClusterFirstWithHostNet
105+
hostNetwork: true
106+
restartPolicy: OnFailure
107+
terminationGracePeriodSeconds: 2
108+
tolerations:
109+
- { key: amd.com/gpu, operator: Exists }
110+
volumes:
111+
- { name: shm, emptyDir: { medium: Memory, sizeLimit: 128Gi }}

terraform/files/grafana/dashboards/rdma-data.json renamed to terraform/files/grafana/dashboards/rdma.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@
316316
"uid": "prometheus"
317317
},
318318
"editorMode": "code",
319-
"expr": "irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\",device=~\"mlx5_.+\"}[1m])",
319+
"expr": "sum by (device) (irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.H100.8\", device=~\"mlx5_0|mlx5_1|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_12|mlx5_13|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.H200.8\", device=~\"mlx5_0|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_9|mlx5_10|mlx5_11\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.B4.8\", device=~\"mlx5_1|mlx5_2|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_11|mlx5_12|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.A100-v2.8\", device=~\"mlx5_1|mlx5_2|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_11|mlx5_12|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU4.8\", device=~\"mlx5_0|mlx5_1|mlx5_2|mlx5_3|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_11|mlx5_12|mlx5_13|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_received_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.MI300X.8\", device=~\"mlx5_0|mlx5_1|mlx5_2|mlx5_3|mlx5_6|mlx5_7|mlx5_8|mlx5_9\"}[1m]))\n",
320320
"interval": "",
321321
"intervalFactor": 1,
322322
"legendFormat": "{{device}}",
@@ -416,7 +416,7 @@
416416
"uid": "prometheus"
417417
},
418418
"editorMode": "code",
419-
"expr": "irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\",device=~\"mlx5_.+\"}[1m])",
419+
"expr": "sum by (device) (irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.H100.8\", device=~\"mlx5_0|mlx5_1|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_12|mlx5_13|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.H200.8\", device=~\"mlx5_0|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_9|mlx5_10|mlx5_11\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.B4.8\", device=~\"mlx5_1|mlx5_2|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_11|mlx5_12|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.A100-v2.8\", device=~\"mlx5_1|mlx5_2|mlx5_3|mlx5_4|mlx5_5|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_11|mlx5_12|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU4.8\", device=~\"mlx5_0|mlx5_1|mlx5_2|mlx5_3|mlx5_6|mlx5_7|mlx5_8|mlx5_9|mlx5_10|mlx5_11|mlx5_12|mlx5_13|mlx5_14|mlx5_15|mlx5_16|mlx5_17\"}[1m])) \nor \nsum by (device) (irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$node\", instance_shape=\"BM.GPU.MI300X.8\", device=~\"mlx5_0|mlx5_1|mlx5_2|mlx5_3|mlx5_6|mlx5_7|mlx5_8|mlx5_9\"}[1m]))\n",
420420
"interval": "",
421421
"intervalFactor": 1,
422422
"legendFormat": "{{device}}",
@@ -458,7 +458,7 @@
458458
},
459459
"definition": "",
460460
"includeAll": false,
461-
"label": "Host",
461+
"label": "Instance",
462462
"multi": true,
463463
"name": "node",
464464
"options": [],
@@ -478,6 +478,6 @@
478478
"timezone": "browser",
479479
"title": "RDMA Received/Transmitted bytes",
480480
"uid": "rdma-data",
481-
"version": 13,
481+
"version": 2,
482482
"weekStart": ""
483483
}

terraform/grafana.tf

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ resource "kubernetes_config_map_v1" "grafana_alerts" {
4848

4949

5050
resource "random_password" "grafana_admin_password" {
51-
length = 12
52-
special = true
51+
length = 16
52+
min_lower = 1
53+
min_upper = 1
54+
min_numeric = 1
55+
min_special = 1
5356
override_special = "!#$%&*()-_=+[]:?"
5457
}

terraform/kube-prometheus-stack.tf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
# # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
33

44
resource "helm_release" "prometheus" {
5-
count = var.install_node_problem_detector_kube_prometheus_stack ? 1 : 0
6-
depends_on = [
5+
count = var.install_node_problem_detector_kube_prometheus_stack ? 1 : 0
6+
depends_on = [
77
module.oke,
88
time_sleep.wait_for_lb_termination
9-
]
9+
]
1010
namespace = var.monitoring_namespace
1111
name = "kube-prometheus-stack"
1212
chart = "kube-prometheus-stack"
@@ -26,5 +26,5 @@ resource "helm_release" "prometheus" {
2626
}
2727

2828
resource "time_sleep" "wait_for_lb_termination" {
29-
destroy_duration = "30s"
29+
destroy_duration = "60s"
3030
}

terraform/oke-cluster.tf

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ module "oke" {
5454
"NvidiaGpuPlugin" = {
5555
remove_addon_resources_on_delete = true
5656
override_existing = true
57-
configurations = [
57+
configurations = [
5858
{
5959
key = "isDcgmExporterDisabled"
6060
value = "true"
@@ -65,27 +65,26 @@ module "oke" {
6565
anytrue([
6666
var.worker_rdma_shape == "BM.GPU.MI300X.8",
6767
var.worker_gpu_shape == "BM.GPU.MI300X.8"
68-
]) ? {
68+
]) ? {
6969
"AmdGpuPlugin" = {
7070
remove_addon_resources_on_delete = true
7171
override_existing = true
7272
}
7373
} : {}
7474
)
75-
cni_type = var.cni_type == "VCN-Native Pod Networking" ? "npn" : "flannel"
76-
control_plane_allowed_cidrs = flatten(tolist([var.control_plane_allowed_cidrs]))
77-
control_plane_is_public = true
78-
create_bastion = var.create_bastion
79-
create_cluster = true
80-
create_iam_defined_tags = false
81-
create_iam_resources = false
82-
create_iam_tag_namespace = false
83-
create_operator = var.create_operator
84-
create_vcn = var.create_vcn
85-
kubernetes_version = var.kubernetes_version
86-
load_balancers = "internal"
87-
lockdown_default_seclist = true
88-
# TODO input variable + schema for image selection
75+
cni_type = var.cni_type == "VCN-Native Pod Networking" ? "npn" : "flannel"
76+
control_plane_allowed_cidrs = flatten(tolist([var.control_plane_allowed_cidrs]))
77+
control_plane_is_public = true
78+
create_bastion = var.create_bastion
79+
create_cluster = true
80+
create_iam_defined_tags = false
81+
create_iam_resources = false
82+
create_iam_tag_namespace = false
83+
create_operator = var.create_operator
84+
create_vcn = var.create_vcn
85+
kubernetes_version = var.kubernetes_version
86+
load_balancers = "internal"
87+
lockdown_default_seclist = true
8988
operator_image_type = "platform"
9089
operator_image_os = "Canonical Ubuntu" # Ignored when bastion_image_type = "custom"
9190
operator_image_os_version = "22.04"
@@ -103,7 +102,7 @@ module "oke" {
103102
boot_volume_size = var.operator_shape_boot
104103
}
105104
output_detail = true
106-
pods_cidr = "10.240.0.0/12" # TODO input var (but keep expanded default)
105+
pods_cidr = "10.240.0.0/12"
107106
# services_cidr = "10.96.0.0/16"
108107
#preferred_load_balancer = "internal"
109108
ssh_public_key = trimspace(var.ssh_public_key)
@@ -139,7 +138,7 @@ module "oke" {
139138
}
140139
allow_rules_internal_lb = {
141140
"Allow TCP ingress to internal load balancers from internal VCN/DRG" = {
142-
protocol = "all", port = -1, source = "10.0.0.0/8", source_type = "CIDR_BLOCK",
141+
protocol = "all", port = -1, source = var.vcn_cidrs, source_type = "CIDR_BLOCK",
143142
}
144143
}
145144
}

terraform/output.tf

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,7 @@
33

44
# Terraform
55
output "state_id" { value = module.oke.state_id }
6-
7-
# Identity
8-
# output "dynamic_group_ids" { value = module.oke.dynamic_group_ids }
9-
# output "policy_statements" { value = module.oke.policy_statements }
6+
output "stack_version" { value = "v25.3.1" }
107

118
# Network
129
output "vcn_id" { value = module.oke.vcn_id }
@@ -35,33 +32,27 @@ output "cluster_id" { value = module.oke.cluster_id }
3532
output "cluster_name" { value = local.cluster_name }
3633
output "cluster_public_endpoint" { value = local.cluster_public_endpoint }
3734
output "cluster_private_endpoint" { value = local.cluster_private_endpoint }
38-
output "cluster_kubeconfig" { value = module.oke.cluster_kubeconfig }
3935
output "cluster_ca_cert" { value = base64decode(module.oke.cluster_ca_cert) }
4036
output "control_plane_subnet_id" { value = module.oke.control_plane_subnet_id }
4137
output "control_plane_subnet_cidr" { value = module.oke.control_plane_subnet_cidr }
4238
output "control_plane_nsg_id" { value = module.oke.control_plane_nsg_id }
4339
output "int_lb_subnet_id" { value = module.oke.int_lb_subnet_id }
4440
output "int_lb_subnet_cidr" { value = module.oke.int_lb_subnet_cidr }
41+
output "int_lb_nsg_id" { value = module.oke.int_lb_nsg_id }
4542
output "pub_lb_subnet_id" { value = module.oke.pub_lb_subnet_id }
4643
output "pub_lb_subnet_cidr" { value = module.oke.pub_lb_subnet_cidr }
44+
output "pub_lb_nsg_id" { value = module.oke.pub_lb_nsg_id }
45+
4746

4847
# Workers
4948
output "worker_subnet_id" { value = module.oke.worker_subnet_id }
5049
output "worker_nsg_id" { value = module.oke.worker_nsg_id }
50+
output "worker_subnet_cidr" { value = module.oke.worker_subnet_cidr }
5151
output "worker_ops_pool_id" { value = lookup(module.oke.worker_pool_ids, "oke-ops", null) }
5252
output "worker_cpu_pool_id" { value = lookup(module.oke.worker_pool_ids, "oke-cpu", null) }
5353
output "worker_gpu_pool_id" { value = lookup(module.oke.worker_pool_ids, "oke-gpu", null) }
5454
output "worker_rdma_pool_id" { value = lookup(module.oke.worker_pool_ids, "oke-rdma", null) }
5555

56-
# Storage
57-
#output "fss_ad" { value = oci_file_storage_file_system.fss.0.availability_domain }
58-
#output "fss_filesystem_id" { value = oci_file_storage_file_system.fss.0.id }
59-
#output "fss_volume_name" { value = local.fss_volume_name }
60-
#output "fss_nsg_id" { value = local.fss_nsg_id }
61-
#output "fss_subnet_id" { value = local.fss_subnet_id }
62-
#output "fss_mount_target_id" { value = oci_file_storage_mount_target.fss.0.id }
63-
#output "fss_export_set_id" { value = oci_file_storage_export_set.fss.0.id }
64-
6556
# Monitoring
6657
output "grafana_public_ip" {
6758
value = format("kubectl get svc -n %v -l app.kubernetes.io/instance=kube-prometheus-stack,app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}'", var.monitoring_namespace)

0 commit comments

Comments
 (0)