Skip to content

Commit 0177652

Browse files
Adding AMD plugin and metrics tracking for MI300x. (#101)
* Adding AMD plugin and metrics tracking to support MI300x. - Added example blueprint of MI300x with Llama4 Maverick. - Added example blueprint of MI300x shared node pool. - Updated API documentation to include local_filesystem and input_file_system - Added MI300x specs to RDMA table with link to HPC image * Renamed local_directory_path to node_directory_path. * Added AMD metrics exporter version to software versions, and added the bring your own pattern. * Cleanup.
1 parent 714ede9 commit 0177652

File tree

17 files changed

+292
-32
lines changed

17 files changed

+292
-32
lines changed

cluster_creation_terraform/oke.tf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ resource "oci_containerengine_cluster" "oke_cluster" {
3535
count = 1
3636
}
3737

38+
resource "oci_containerengine_addon" "amd_operator_plugin" {
39+
cluster_id = oci_containerengine_cluster.oke_cluster[0].id
40+
addon_name = "AmdGpuPlugin"
41+
remove_addon_resources_on_delete = true
42+
override_existing = true
43+
}
44+
3845
resource "oci_containerengine_node_pool" "oke_node_pool" {
3946
cluster_id = oci_containerengine_cluster.oke_cluster[0].id
4047
compartment_id = local.oke_compartment_ocid

cluster_creation_terraform/variables.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ variable "k8s_version" {
114114
description = "Kubernetes version installed on your master and worker nodes"
115115
}
116116
variable "num_pool_workers" {
117-
default = 6
117+
default = 3
118118
description = "The number of worker nodes in the node pool. If select Cluster Autoscaler, will assume the minimum number of nodes configured"
119119
}
120120

docs/about.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,10 @@
4545

4646
This repository provides comprehensive Terraform scripts that provision and configure:
4747

48-
1. An ATP database instance
49-
2. Grafana & Prometheus for monitoring
50-
3. MLFlow for experiment tracking
51-
4. KEDA for dynamic auto-scaling
52-
5. The OCI AI Blueprints front-end and back-end in an OKE cluster of your choice
48+
1. Grafana & Prometheus for monitoring
49+
2. MLFlow for experiment tracking
50+
3. KEDA for dynamic auto-scaling
51+
4. The OCI AI Blueprints front-end and back-end in an OKE cluster of your choice
5352

5453
Once installed, you can:
5554

docs/api_documentation.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
| recipe_shared_memory_volume_size_limit_in_mb | int | Yes | ???. Recommend entering 100.?? |
3232
| input_object_storage | object | Yes | Name of bucket to mount at location “mount_location”. Mount size will be `volume_size_in_gbs`. Will copy all objects in bucket to mount location. Store your LLM model (and in the case of fine-tuning blueprints, your input dataset as well) in this bucket. Example: `[{"bucket_name": "corrino_hf_oss_models", "mount_location": "/models", "volume_size_in_gbs": 500}]` |
3333
| output_object_storage | object | No | Required for fine-tuning deployments. Name of bucket to mount at location “mount_location”. Mount size will be “volume_size_in_gbs”. Will copy all items written here during program runtime to bucket on program completion. Example: `[{“bucket_name”: “output”,“mount_location”: “/output”,“volume_size_in_gbs”: 500}]` |
34+
| input_file_system | object | No | Required for shared storage. This is both input and output storage. OCI File System OCID, Mount Target OCID will be used to mount the file system at "mount location". Mount size will be “volume_size_in_gbs”. This works as an NFS, so any data written will persist to file storage. Example: `[{“file_system_ocid”: “ocid..._”,“mount_target_ocid”: “ocid...”,"mount_location": "/models",“volume_size_in_gbs”: 500}]` |
35+
| local_filesystem | object | No | Local filesystem path to mount to container. This will be read / write path, and is local to the node the container runs on. Any written data will persist to node, and will be subject to available storage on node. Example: `[{"mount_location": "/models","node_directory_path": “/mnt/nvme/models”}]` |
3436
| recipe_image_uri | string | Yes | Location of the recipe container image. Each recipe points to a specific container image. See the recipe.json examples below. Example: `iduyx1qnmway/corrino-devops-repository:vllmv0901` |
3537
| recipe_container_command_args | string | No | Container init arguments to pass. Each recipe has specific container arguments that it expects. See the Blueprint Arguments section below for details. Example: `["--model","$(Model_Path)","--tensor-parallel-size","$(tensor_parallel_size)"]` |
3638
| recipe_container_env | string | No | Values of the recipe container init arguments. See the Blueprint Arguments section below for details. Example: `[{"key": "tensor_parallel_size","value": "2"},{"key": "model_name","value": "NousResearch/Meta-Llama-3.1-8B-Instruct"},{"key": "Model_Path","value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}]` |

docs/custom_blueprints/blueprint_json_schema.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,37 @@
434434
}
435435
}
436436
},
437+
"local_filesystem": {
438+
"type": "array",
439+
"description": "Local filesystem path to mount to container. This will be read / write path, and is local to the node the container runs on. Any written data will persist to node, and will be subject to available storage on node.",
440+
"items": {
441+
"additionalProperties": false,
442+
"required": [
443+
"node_directory_path",
444+
"mount_location"
445+
],
446+
"properties": {
447+
"node_directory_path": {
448+
"type": "string",
449+
"description": "The actual directory path on the node to mount to the container.",
450+
"examples": ["/mnt/nvme/models"]
451+
},
452+
"mount_location": {
453+
"type": "string",
454+
"description": "The mount location in the container.",
455+
"examples": ["/models"]
456+
}
457+
}
458+
},
459+
"examples": [
460+
[
461+
{
462+
"node_directory_path": "/mnt/nvme/models",
463+
"mount_location": "/models"
464+
}
465+
]
466+
]
467+
},
437468
"output_object_storage": {
438469
"type": "array",
439470
"items": {

docs/sample_blueprints/other/using_rdma_enabled_node_pools/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ RDMA is currently supported for:
3232
- BM.GPU.H200.8
3333
- BM.GPU.B200.8
3434
- BM.GPU.B4.8
35+
- BM.GPU.MI300X.8
3536

3637
Additional shape support is coming soon.
3738

@@ -69,6 +70,7 @@ One of the images in the table below must be imported into your tenancy in the c
6970
- Once the image is done importing (30 minutes to an hour), it will be usable during cluster deployment
7071
- To use the image in recipes, you will need to retrieve the image OCID
7172

73+
**Note**: Clicking any of the links below will download a large image file to your computer (~20GB). It is best to copy the link to paste directly into the console when importing the custom image.
7274

7375
**Note**: B200 requires Driver version 570 and CUDA >= 12.8. Ensure correct PAR for compatibility with B200.
7476

@@ -81,6 +83,11 @@ One of the images in the table below must be imported into your tenancy in the c
8183
| Ubuntu 24.04 | H200, H100, A100 | 560 | 12.6 | DOCA-OFED-2.10.0 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-24.04-2025.05.20-0-DOCA-OFED-2.10.0-GPU-560-CUDA-12.6-2025.06.25-0) |
8284

8385

86+
**Note**: Table for AMD Systems
87+
| Operating System | Shape Compatibility | AMD Driver Version | ROCm Version | Mellanox OFED Driver Version | Image PAR Link |
88+
| :--------------: | :-----------------: | :-------------------: | :----------: | :--------------------------: | :------------: |
89+
| Ubuntu 22.04 | MI300X | 6.10.5 | 6.3.2-66 | v24.10-1.1.4.0 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-AMD-ROCM-632-2025.03.26-0) |
90+
8491

8592
[This doc](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/custom-images-import.htm#listing-custom-images) provides complete details for all image importing options.
8693

docs/sample_blueprints/platform_features/shared_node_pools/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ Shared node pools are compatible with any blueprint and support all OCI compute
1414
1. Specifying the Availability Domain of the instance type
1515
2. Specifying the custom image OCID to use for the node
1616

17+
**Note**: Clicking the Link in the table below will download a large image file to your computer (~20GB). It is best to copy the link and paste it in your console to import the image as described in [This document section](../../other/using_rdma_enabled_node_pools/README.md).
18+
19+
| Shape Name | Image PAR |
20+
| :--------: | :-------: |
21+
| BM.GPU.B200.8 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2025.05.20-0-OFED-24.10-1.1.4.0-GPU-570-OPEN-CUDA-12.8-2025.06.07-0) |
22+
| BM.GPU.MI300X.8 | [Link](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-AMD-ROCM-632-2025.03.26-0) |
23+
24+
1725
Additional required fields:
1826

1927
```json

docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_B200_BM.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
"shared_node_pool_shape": "BM.GPU.B200.8",
66
"shared_node_pool_boot_volume_size_in_gbs": 1000,
77
"recipe_availability_domain": "TrcQ:US-ASHBURN-AD-2",
8-
"recipe_node_image_ocid": "ocid1.image.oc1.iad.aaaaaaaasbjq55p7d6mmbbvgt6r22fh6mko7jmh2lpaxw7rsyjqg6cpfgs2a"
8+
"recipe_node_image_ocid": "ocid1.image.oc1.iad.aaaaaaaa____2a"
99
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"deployment_name": "MI300x-bp",
3+
"recipe_mode": "shared_node_pool",
4+
"shared_node_pool_size": 1,
5+
"shared_node_pool_shape": "BM.GPU.MI300X.8",
6+
"shared_node_pool_boot_volume_size_in_gbs": 1000,
7+
"skip_capacity_validation": true,
8+
"recipe_node_image_ocid": "ocid1.image.oc1.iad.aaaaaaaap___ea",
9+
"recipe_availability_domain": "TrcQ:US-ASHBURN-AD-3",
10+
"recipe_public_ssh_key": "ssh-rsa AAAAB3NzaC___= dkennetz"
11+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
{
2+
"recipe_id": "llm_inference_amd",
3+
"recipe_mode": "service",
4+
"deployment_name": "maverick",
5+
"recipe_image_uri": "docker.io/rocm/vllm-dev:llama4-20250514",
6+
"recipe_node_shape": "BM.GPU.MI300X.8",
7+
"recipe_replica_count": 1,
8+
"recipe_container_port": "8000",
9+
"recipe_amd_gpu_count": 4,
10+
"recipe_ephemeral_storage_size": 400,
11+
"recipe_shared_memory_volume_size_limit_in_mb": 16384,
12+
"recipe_use_shared_node_pool": true,
13+
"recipe_prometheus_enabled": true,
14+
"local_filesystem": [
15+
{
16+
"mount_location": "/models",
17+
"node_directory_path": "/mnt/nvme/models"
18+
}
19+
],
20+
"recipe_container_env": [
21+
{
22+
"key": "VLLM_USE_V1",
23+
"value": "1"
24+
},
25+
{
26+
"key": "VLLM_ROCM_USE_AITER",
27+
"value": "1"
28+
},
29+
{
30+
"key": "VLLM_WORKER_MULTIPROC_METHOD",
31+
"value": "spawn"
32+
},
33+
{
34+
"key": "SAFETENSORS_FAST_GPU",
35+
"value": "1"
36+
}
37+
],
38+
"recipe_container_command_args": [
39+
"python3",
40+
"-m",
41+
"vllm.entrypoints.openai.api_server",
42+
"--model",
43+
"/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
44+
"--tensor-parallel-size",
45+
"4",
46+
"--disable-log-requests",
47+
"--max_num_batched_tokens",
48+
"32768",
49+
"--max-num-seqs",
50+
"1024",
51+
"--max-model-len",
52+
"36000",
53+
"--served-model-name",
54+
"Llama-4-Maverick-17B-128E-Instruct-FP8"
55+
],
56+
"recipe_readiness_probe_params": {
57+
"endpoint_path": "/health",
58+
"port": 8000,
59+
"scheme": "HTTP",
60+
"initial_delay_seconds": 20,
61+
"period_seconds": 30,
62+
"success_threshold": 1,
63+
"timeout_seconds": 10
64+
},
65+
"recipe_liveness_probe_params": {
66+
"failure_threshold": 3,
67+
"endpoint_path": "/health",
68+
"port": 8000,
69+
"scheme": "HTTP",
70+
"initial_delay_seconds": 1200,
71+
"period_seconds": 60,
72+
"success_threshold": 1,
73+
"timeout_seconds": 10
74+
}
75+
}

0 commit comments

Comments
 (0)