Skip to content

Commit 0177652

Browse files
Adding AMD plugin and metrics tracking for MI300x. (#101)
* Adding AMD plugin and metrics tracking to support MI300x. - Added example blueprint of MI300x with Llama4 Maverick. - Added example blueprint of MI300x shared node pool. - Updated API documentation to include local_filesystem and input_file_system - Added MI300x specs to RDMA table with link to HPC image * Renamed local_directory_path to node_directory_path. * Added AMD metrics exporter version to software versions, and added the bring your own pattern. * Cleanup.
1 parent 714ede9 commit 0177652

File tree

17 files changed

+292
-32
lines changed

17 files changed

+292
-32
lines changed

cluster_creation_terraform/oke.tf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ resource "oci_containerengine_cluster" "oke_cluster" {
3535
count = 1
3636
}
3737

38+
resource "oci_containerengine_addon" "amd_operator_plugin" {
39+
cluster_id = oci_containerengine_cluster.oke_cluster[0].id
40+
addon_name = "AmdGpuPlugin"
41+
remove_addon_resources_on_delete = true
42+
override_existing = true
43+
}
44+
3845
resource "oci_containerengine_node_pool" "oke_node_pool" {
3946
cluster_id = oci_containerengine_cluster.oke_cluster[0].id
4047
compartment_id = local.oke_compartment_ocid

cluster_creation_terraform/variables.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ variable "k8s_version" {
114114
description = "Kubernetes version installed on your master and worker nodes"
115115
}
116116
variable "num_pool_workers" {
117-
default = 6
117+
default = 3
118118
description = "The number of worker nodes in the node pool. If select Cluster Autoscaler, will assume the minimum number of nodes configured"
119119
}
120120

docs/about.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,10 @@
4545

4646
This repository provides comprehensive Terraform scripts that provision and configure:
4747

48-
1. An ATP database instance
49-
2. Grafana & Prometheus for monitoring
50-
3. MLFlow for experiment tracking
51-
4. KEDA for dynamic auto-scaling
52-
5. The OCI AI Blueprints front-end and back-end in an OKE cluster of your choice
48+
1. Grafana & Prometheus for monitoring
49+
2. MLFlow for experiment tracking
50+
3. KEDA for dynamic auto-scaling
51+
4. The OCI AI Blueprints front-end and back-end in an OKE cluster of your choice
5352

5453
Once installed, you can:
5554

docs/api_documentation.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
| recipe_shared_memory_volume_size_limit_in_mb | int | Yes | ???. Recommend entering 100.?? |
3232
| input_object_storage | object | Yes | Name of bucket to mount at location “mount_location”. Mount size will be `volume_size_in_gbs`. Will copy all objects in bucket to mount location. Store your LLM model (and in the case of fine-tuning blueprints, your input dataset as well) in this bucket. Example: `[{"bucket_name": "corrino_hf_oss_models", "mount_location": "/models", "volume_size_in_gbs": 500}]` |
3333
| output_object_storage | object | No | Required for fine-tuning deployments. Name of bucket to mount at location “mount_location”. Mount size will be “volume_size_in_gbs”. Will copy all items written here during program runtime to bucket on program completion. Example: `[{“bucket_name”: “output”,“mount_location”: “/output”,“volume_size_in_gbs”: 500}]` |
34+
| input_file_system | object | No | Required for shared storage. This is both input and output storage. OCI File System OCID, Mount Target OCID will be used to mount the file system at "mount location". Mount size will be “volume_size_in_gbs”. This works as an NFS, so any data written will persist to file storage. Example: `[{“file_system_ocid”: “ocid..._”,“mount_target_ocid”: “ocid...”,"mount_location": "/models",“volume_size_in_gbs”: 500}]` |
35+
| local_filesystem | object | No | Local filesystem path to mount to container. This will be read / write path, and is local to the node the container runs on. Any written data will persist to node, and will be subject to available storage on node. Example: `[{"mount_location": "/models","node_directory_path": “/mnt/nvme/models”}]` |
3436
| recipe_image_uri | string | Yes | Location of the recipe container image. Each recipe points to a specific container image. See the recipe.json examples below. Example: `iduyx1qnmway/corrino-devops-repository:vllmv0901` |
3537
| recipe_container_command_args | string | No | Container init arguments to pass. Each recipe has specific container arguments that it expects. See the Blueprint Arguments section below for details. Example: `["--model","$(Model_Path)","--tensor-parallel-size","$(tensor_parallel_size)"]` |
3638
| recipe_container_env | string | No | Values of the recipe container init arguments. See the Blueprint Arguments section below for details. Example: `[{"key": "tensor_parallel_size","value": "2"},{"key": "model_name","value": "NousResearch/Meta-Llama-3.1-8B-Instruct"},{"key": "Model_Path","value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}]` |

docs/custom_blueprints/blueprint_json_schema.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,37 @@
434434
}
435435
}
436436
},
437+
"local_filesystem": {
438+
"type": "array",
439+
"description": "Local filesystem path to mount to container. This will be read / write path, and is local to the node the container runs on. Any written data will persist to node, and will be subject to available storage on node.",
440+
"items": {
441+
"additionalProperties": false,
442+
"required": [
443+
"node_directory_path",
444+
"mount_location"
445+
],
446+
"properties": {
447+
"node_directory_path": {
448+
"type": "string",
449+
"description": "The actual directory path on the node to mount to the container.",
450+
"examples": ["/mnt/nvme/models"]
451+
},
452+
"mount_location": {
453+
"type": "string",
454+
"description": "The mount location in the container.",
455+
"examples": ["/models"]
456+
}
457+
}
458+
},
459+
"examples": [
460+
[
461+
{
462+
"node_directory_path": "/mnt/nvme/models",
463+
"mount_location": "/models"
464+
}
465+
]
466+
]
467+
},
437468
"output_object_storage": {
438469
"type": "array",
439470
"items": {

docs/sample_blueprints/other/using_rdma_enabled_node_pools/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ RDMA is currently supported for:
3232
- BM.GPU.H200.8
3333
- BM.GPU.B200.8
3434
- BM.GPU.B4.8
35+
- BM.GPU.MI300X.8
3536

3637
Additional shape support is coming soon.
3738

@@ -69,6 +70,7 @@ One of the images in the table below must be imported into your tenancy in the c
6970
- Once the image is done importing (30 minutes to an hour), it will be usable during cluster deployment
7071
- To use the image in recipes, you will need to retrieve the image OCID
7172

73+
**Note**: Clicking any of the links below will download a large image file to your computer (~20GB). It is best to copy the link to paste directly into the console when importing the custom image.
7274

7375
**Note**: B200 requires Driver version 570 and CUDA >= 12.8. Ensure correct PAR for compatibility with B200.
7476

@@ -81,6 +83,11 @@ One of the images in the table below must be imported into your tenancy in the c
8183
| Ubuntu 24.04 | H200, H100, A100 | 560 | 12.6 | DOCA-OFED-2.10.0 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-24.04-2025.05.20-0-DOCA-OFED-2.10.0-GPU-560-CUDA-12.6-2025.06.25-0) |
8284

8385

86+
**Note**: Table for AMD Systems
87+
| Operating System | Shape Compatibility | AMD Driver Version | ROCm Version | Mellanox OFED Driver Version | Image PAR Link |
88+
| :--------------: | :-----------------: | :-------------------: | :----------: | :--------------------------: | :------------: |
89+
| Ubuntu 22.04 | MI300X | 6.10.5 | 6.3.2-66 | v24.10-1.1.4.0 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-AMD-ROCM-632-2025.03.26-0) |
90+
8491

8592
[This doc](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/custom-images-import.htm#listing-custom-images) provides complete details for all image importing options.
8693

docs/sample_blueprints/platform_features/shared_node_pools/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ Shared node pools are compatible with any blueprint and support all OCI compute
1414
1. Specifying the Availability Domain of the instance type
1515
2. Specifying the custom image OCID to use for the node
1616

17+
**Note**: Clicking the Link in the table below will download a large image file to your computer (~20GB). It is best to copy the link and paste it in your console to import the image as described in [This document section](../../other/using_rdma_enabled_node_pools/README.md).
18+
19+
| Shape Name | Image PAR |
20+
| :--------: | :-------: |
21+
| BM.GPU.B200.8 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2025.05.20-0-OFED-24.10-1.1.4.0-GPU-570-OPEN-CUDA-12.8-2025.06.07-0) |
22+
| BM.GPU.MI300X.8 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-AMD-ROCM-632-2025.03.26-0) |
23+
24+
1725
Additional required fields:
1826

1927
```json

docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_B200_BM.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
"shared_node_pool_shape": "BM.GPU.B200.8",
66
"shared_node_pool_boot_volume_size_in_gbs": 1000,
77
"recipe_availability_domain": "TrcQ:US-ASHBURN-AD-2",
8-
"recipe_node_image_ocid": "ocid1.image.oc1.iad.aaaaaaaasbjq55p7d6mmbbvgt6r22fh6mko7jmh2lpaxw7rsyjqg6cpfgs2a"
8+
"recipe_node_image_ocid": "ocid1.image.oc1.iad.aaaaaaaa____2a"
99
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"deployment_name": "MI300x-bp",
3+
"recipe_mode": "shared_node_pool",
4+
"shared_node_pool_size": 1,
5+
"shared_node_pool_shape": "BM.GPU.MI300X.8",
6+
"shared_node_pool_boot_volume_size_in_gbs": 1000,
7+
"skip_capacity_validation": true,
8+
"recipe_node_image_ocid": "ocid1.image.oc1.iad.aaaaaaaap___ea",
9+
"recipe_availability_domain": "TrcQ:US-ASHBURN-AD-3",
10+
"recipe_public_ssh_key": "ssh-rsa AAAAB3NzaC___= dkennetz"
11+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
{
2+
"recipe_id": "llm_inference_amd",
3+
"recipe_mode": "service",
4+
"deployment_name": "maverick",
5+
"recipe_image_uri": "docker.io/rocm/vllm-dev:llama4-20250514",
6+
"recipe_node_shape": "BM.GPU.MI300X.8",
7+
"recipe_replica_count": 1,
8+
"recipe_container_port": "8000",
9+
"recipe_amd_gpu_count": 4,
10+
"recipe_ephemeral_storage_size": 400,
11+
"recipe_shared_memory_volume_size_limit_in_mb": 16384,
12+
"recipe_use_shared_node_pool": true,
13+
"recipe_prometheus_enabled": true,
14+
"local_filesystem": [
15+
{
16+
"mount_location": "/models",
17+
"node_directory_path": "/mnt/nvme/models"
18+
}
19+
],
20+
"recipe_container_env": [
21+
{
22+
"key": "VLLM_USE_V1",
23+
"value": "1"
24+
},
25+
{
26+
"key": "VLLM_ROCM_USE_AITER",
27+
"value": "1"
28+
},
29+
{
30+
"key": "VLLM_WORKER_MULTIPROC_METHOD",
31+
"value": "spawn"
32+
},
33+
{
34+
"key": "SAFETENSORS_FAST_GPU",
35+
"value": "1"
36+
}
37+
],
38+
"recipe_container_command_args": [
39+
"python3",
40+
"-m",
41+
"vllm.entrypoints.openai.api_server",
42+
"--model",
43+
"/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
44+
"--tensor-parallel-size",
45+
"4",
46+
"--disable-log-requests",
47+
"--max_num_batched_tokens",
48+
"32768",
49+
"--max-num-seqs",
50+
"1024",
51+
"--max-model-len",
52+
"36000",
53+
"--served-model-name",
54+
"Llama-4-Maverick-17B-128E-Instruct-FP8"
55+
],
56+
"recipe_readiness_probe_params": {
57+
"endpoint_path": "/health",
58+
"port": 8000,
59+
"scheme": "HTTP",
60+
"initial_delay_seconds": 20,
61+
"period_seconds": 30,
62+
"success_threshold": 1,
63+
"timeout_seconds": 10
64+
},
65+
"recipe_liveness_probe_params": {
66+
"failure_threshold": 3,
67+
"endpoint_path": "/health",
68+
"port": 8000,
69+
"scheme": "HTTP",
70+
"initial_delay_seconds": 1200,
71+
"period_seconds": 60,
72+
"success_threshold": 1,
73+
"timeout_seconds": 10
74+
}
75+
}

0 commit comments

Comments
 (0)