Changes from all commits
974 commits
1ceb3b9
Merge remote-tracking branch 'origin/main' into aiter-mla-integration
vllmellm Apr 3, 2025
20a3f07
fix mypy error on Iterable typing error
vllmellm Apr 3, 2025
7153046
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Apr 3, 2025
e3f03b7
Disable fp8_out_scale on V1
gshtras Apr 3, 2025
eaecf03
Merge remote-tracking branch 'embedded/aiter-mla-integration' into up…
gshtras Apr 3, 2025
c045f59
Merge pull request #499 from ROCm/upstream_merge_2025_04_02
gshtras Apr 3, 2025
b101125
Bump aiter version (#500)
gshtras Apr 3, 2025
6d258fa
Adding 2stage MoE support separately until it is added upstream (#501)
gshtras Apr 3, 2025
732455b
Fused FP8 conversion in attention for v1 (#502)
gshtras Apr 7, 2025
f657987
Merge remote-tracking branch 'upstream/main'
gshtras Apr 7, 2025
2b6e9c9
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Apr 7, 2025
d17d4df
Merge pull request #503 from ROCm/upstream_merge_2025_04_07
gshtras Apr 7, 2025
8826599
Fix fused moe (#506)
gshtras Apr 7, 2025
97b78bf
Update moe_tune_script.sh (#507)
divakar-amd Apr 8, 2025
f68829f
doubled size to wa issue and preserve CAR perf (#510)
maleksan85 Apr 10, 2025
b8498bc
re-enable custom paged attention for V0 (#511)
charlifu Apr 10, 2025
f4b308f
Add gfx950 to the attention archs
jpvillam-amd Apr 3, 2025
e201e58
Linter
jpvillam-amd Apr 10, 2025
c43debd
Updated README.md with April 10 results (#512)
Mcirino1 Apr 14, 2025
9025082
Update README.md (#514)
faisalgulfam32 Apr 16, 2025
1c0a1ae
update base image (#515)
charlifu Apr 17, 2025
44c9580
Merge remote-tracking branch 'upstream/main'
gshtras Apr 21, 2025
40f2157
Update test-template.j2 to enable building (#517)
Alexei-V-Ivanov-AMD Apr 21, 2025
60cd57b
Update test-template.j2 to fix new location of run-amd-test.sh (#518)
Alexei-V-Ivanov-AMD Apr 21, 2025
e26141f
Rocm 6.4 docker (#519)
gshtras Apr 22, 2025
8ad1c44
Update README.md (#521)
t-parry Apr 22, 2025
49e4719
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Apr 23, 2025
a9af7a9
Remove leftovers from 2stage
gshtras Apr 23, 2025
105e655
Re-add 2stage moe
gshtras Apr 23, 2025
ae144d6
custom all-reduce, gfx950
seungrokj Apr 24, 2025
cfda5b3
Merge remote-tracking branch 'origin/main' into upstream_merge_2025_0…
gshtras Apr 24, 2025
c5b41dc
Missing parameter for sdpa
gshtras Apr 24, 2025
c383e6c
Update README.md (#523)
t-parry Apr 24, 2025
cfc530a
Merge branch 'main' into upstream_merge_2025_04_21
gshtras Apr 24, 2025
c3f61dd
Merge pull request #522 from ROCm/upstream_merge_2025_04_21
gshtras Apr 24, 2025
8c211e5
Merge remote-tracking branch 'upstream/main'
gshtras Apr 25, 2025
a9e7a00
Fix API typo and remove FP8 on V1 restriction
gshtras Apr 25, 2025
28007b0
Upstream merge 2025 04 25 (#524)
gshtras Apr 25, 2025
8bd7ee1
Bump hiblaslt (#528)
gshtras Apr 28, 2025
328b04d
Merge branch 'main' into jpvillam/fa_gfx950
jpvillam-amd Apr 28, 2025
550b072
Update rocm.py
jpvillam-amd Apr 28, 2025
1fbb019
Restrict setuptools version (#529)
gshtras Apr 28, 2025
ad806ba
Linter
jpvillam-amd Apr 28, 2025
dc6c46b
lint
gshtras Apr 28, 2025
1f4e00c
Revert aiter commit (#530)
gshtras Apr 29, 2025
e8766c6
Merge remote-tracking branch 'upstream/main'
gshtras Apr 29, 2025
7a9f58a
Update README.md (#531)
t-parry Apr 30, 2025
41b85b6
Restrict ray version due to https://github.com/ray-project/ray/issues…
gshtras Apr 30, 2025
8e45f88
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Apr 30, 2025
285ac51
Merge remote-tracking branch 'upstream/main' into jpvillam/fa_gfx950
gshtras Apr 30, 2025
0bc1d7c
No vllm.vllm_flash_attn.layers.rotary on ROCm
gshtras Apr 30, 2025
134d285
Merge remote-tracking branch 'origin/rocm_fix' into upstream_merge_20…
gshtras Apr 30, 2025
2921150
Merge remote-tracking branch 'origin/jpvillam/fa_gfx950' into upstrea…
gshtras Apr 30, 2025
f3a5bf0
Restore the function that is used elsewhere
gshtras Apr 30, 2025
8334e54
Merge remote-tracking branch 'origin/jpvillam/fa_gfx950' into upstrea…
gshtras Apr 30, 2025
c1cb05e
Fix Quark API use
gshtras May 1, 2025
2c68ff9
Merge branch 'main' into upstream_merge_2025_04_29
gshtras May 2, 2025
29241ca
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras May 2, 2025
0b8eaec
Re-fix Quark API
gshtras May 2, 2025
f3f620a
Using the right torch API
gshtras May 2, 2025
2fea69f
Merge pull request #536 from ROCm/upstream_merge_2025_04_29
gshtras May 2, 2025
d283632
Merge remote-tracking branch 'upstream/main'
gshtras May 6, 2025
8e62073
Fix for the condition to accept empty encoder inputs for mllama
gshtras May 6, 2025
a0b4ef2
Cherry-pick skinny gemm fix
gshtras May 6, 2025
166d0ef
Merge pull request #538 from ROCm/upstream_merge_2025_05_06
gshtras May 6, 2025
d483fc2
integrate aiter
fsx950223 May 8, 2025
4f85566
add env variable
fsx950223 May 8, 2025
ae85e79
rename function
fsx950223 May 9, 2025
87ea0ba
optimize kernels with small query lens
fsx950223 May 9, 2025
db4bc55
change condition
fsx950223 May 9, 2025
b526478
Aiter mla cherrypick (#543)
gshtras May 9, 2025
c791a85
Cherry pick skinny gemms (#544)
gshtras May 9, 2025
efe59bd
Merge remote-tracking branch 'upstream/main' into fa_upstream
fsx950223 May 12, 2025
40654e4
add rocm aiter backend
fsx950223 May 12, 2025
59f1b15
add gfx950 support for skinny gemms
charlifu May 12, 2025
5b1895e
Merge branch 'main' into amd/gfx950_skinny_gemm
charlifu May 13, 2025
6f5df79
Merge remote-tracking branch 'upstream/main'
gshtras May 13, 2025
6b08324
Merge remote-tracking branch 'origin/main'
gshtras May 13, 2025
d9da93f
fix on_mi3xx
charlifu May 14, 2025
bb1f213
Merge remote-tracking branch 'upstream/main'
gshtras May 15, 2025
0c6ce45
Merge remote-tracking branch 'upstream/main'
gshtras May 15, 2025
222fa01
Remove gradlib
gshtras May 15, 2025
34483a3
Fix P3L Arg parser
gshtras May 15, 2025
c13eddf
pre-commit
gshtras May 15, 2025
8dd236d
new fa impl
fsx950223 May 16, 2025
1466c79
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras May 16, 2025
ccd96e8
Toggle for v1 attention
gshtras May 16, 2025
262ed1e
Merge pull request #547 from ROCm/upstream_merge_2025_05_15
gshtras May 16, 2025
d1d3ff9
Remove gradlib mention from pyproject (#549)
gshtras May 16, 2025
db892e7
Fix input layer norm mismatch for Eagle Speculative Decoding compatib…
mmkamani7 May 16, 2025
16d2b92
Updated README.md (#546)
Mcirino1 May 19, 2025
662127a
Merge remote-tracking branch 'upstream/main'
gshtras May 19, 2025
9b131ae
Caching the env variable in the __init__
gshtras May 19, 2025
e34fd18
Restrict FP8 attention output to non unified backend until the accura…
gshtras May 19, 2025
e94c760
Merge pull request #550 from ROCm/upstream_merge_2025_05_19
gshtras May 19, 2025
8a67a53
Reduce diff from upstream (#551)
gshtras May 20, 2025
e950b15
Fixing a bug from transformers==4.52. config.head_dim is now explicit…
gshtras May 20, 2025
258d2d3
Remove the option to compile cython during the docker build. It hasn'…
gshtras May 20, 2025
a31e5d8
Fixing pre-commit in github. Not sure why this issue does not affect …
gshtras May 20, 2025
16af49c
Merge remote-tracking branch 'upstream/main'
gshtras May 21, 2025
91a5600
Fused FP8 attention output is now only possible for both flash and pa…
gshtras May 21, 2025
7c1213e
Remove incorrect env value
gshtras May 21, 2025
e995fdc
update api
fsx950223 May 21, 2025
a501ff0
optimize performance
fsx950223 May 26, 2025
3ad9e3a
merge
fsx950223 May 26, 2025
05e460f
remove try catch
fsx950223 May 15, 2025
1c450a5
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras May 27, 2025
d5e35a9
Merge remote-tracking branch 'origin/main' into upstream_merge_2025_0…
gshtras May 27, 2025
1900335
Upstream merge 2025 05 27 (#557)
gshtras May 27, 2025
307d8bc
Removing redundant parameters from the MIs side and fixing Navi build…
gshtras May 27, 2025
12447b9
Merge branch 'main' into amd/gfx950_skinny_gemm
charlifu May 28, 2025
630ed84
cache get_lds_size()
charlifu May 28, 2025
0a337a6
clean code
fsx950223 May 29, 2025
f4a992c
Removing RPD in favor of torch profiler for V1 (#558)
gshtras May 29, 2025
bee14ca
Merge remote-tracking branch 'upstream/main'
gshtras May 29, 2025
7bb0618
Added benchmark results and commit hash (#556)
Mcirino1 May 29, 2025
0286875
Merge branch 'main' into amd/gfx950_skinny_gemm
charlifu May 29, 2025
7bf92f9
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras May 29, 2025
421c498
Merge leftover
gshtras May 29, 2025
628db8d
Merge remote-tracking branch 'origin/amd/gfx950_skinny_gemm' into ups…
gshtras May 29, 2025
d92c04b
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Jun 2, 2025
9c22cdd
Remove redundant configs
gshtras Jun 2, 2025
9d4c238
Merge branch 'main' into upstream_merge_2025_06_02
gshtras Jun 2, 2025
3712649
Merge pull request #565 from ROCm/upstream_merge_2025_06_02
gshtras Jun 2, 2025
ab92741
Merge remote-tracking branch 'upstream/main'
gshtras Jun 3, 2025
aee731f
cleanup
gshtras Jun 3, 2025
8cde510
Merge pull request #566 from ROCm/upstream_merge_2025_06_03
gshtras Jun 3, 2025
06efc40
add split kv version of unified triton kernel
jvlunteren Jun 3, 2025
8377189
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Jun 3, 2025
5070c4b
remove type cast
fsx950223 Jun 4, 2025
4d69fde
formatting
jvlunteren Jun 4, 2025
29bef2c
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Jun 4, 2025
c233c3d
address suggestions by gemini-code-assist
jvlunteren Jun 5, 2025
71cbfe5
Fix attention fp8 output fusion for split attention path in v1 (#569)
gshtras Jun 5, 2025
ccfa3b8
Merge remote-tracking branch 'origin/main' into upstream_merge_2025_0…
gshtras Jun 5, 2025
bcbb7a6
Merge remote-tracking branch 'upstream/main' into upstream_merge_2025…
gshtras Jun 5, 2025
1a254d8
Merge pull request #570 from ROCm/upstream_merge_2025_06_05
gshtras Jun 5, 2025
cdfe72b
Rocm 6.4.1 as base (#571)
gshtras Jun 5, 2025
a9abba3
Merge remote-tracking branch 'origin/fa_upstream3' into 0610_rc2
gshtras Jun 5, 2025
149943e
new aiter commit introduced new enum
tjtanaa May 31, 2025
0721687
Fix the use of a deprecated function
gshtras Jun 5, 2025
8f2462f
Merge remote-tracking branch 'jvlunteren/jvl-splitkv-triton-unif-attn…
gshtras Jun 5, 2025
ccf25ca
[Bugfix] Add padding for block-scale fused-moe weights for AITER lib
qli88 Jun 5, 2025
45604cc
[Bugfix] Add None check for optional list
qli88 Jun 5, 2025
1c2adb5
Make sure block quant is used before doing possible padding
qli88 Jun 5, 2025
bda243e
Replace block_quant with raw check to stop CI complain
qli88 Jun 5, 2025
57e5540
yapf
qli88 Jun 5, 2025
f0c789c
Cherry pick https://github.com/vllm-project/vllm/pull/19234
gshtras Jun 5, 2025
71faa18
Cherry-pick https://github.com/vllm-project/vllm/pull/19158
gshtras Jun 5, 2025
d4b681f
[ROCm][Build] Clean up the ROCm build (#19040) (#567)
gshtras Jun 3, 2025
959a21c
[AMD] Update compatible packaging version (#19309) (#573)
gshtras Jun 9, 2025
e4587a0
[ROCm][Build] Clean up the ROCm build (#19040) (#567)
gshtras Jun 3, 2025
b9836a3
MI350 enablement for fp16 and fp8 models V0/V1 (#576)
maleksan85 Jun 16, 2025
9c6587b
Gathering missed fp4 changes together into this stream (#582)
maleksan85 Jun 25, 2025
710c8c9
Merge remote-tracking branch 'origin/amd/out_of_place_layernorm' into…
gshtras Jun 30, 2025
191d264
Merge remote-tracking branch 'origin/attention_fusion_v1' into 0715_rc1
gshtras Jun 30, 2025
b2a3219
Merge remote-tracking branch 'origin/qiang_qwen3_moe_padding' into 07…
gshtras Jun 30, 2025
c0f6542
Fetching upstream to pull the tags for the correct version
gshtras Jun 20, 2025
2eab60f
Default config values to enable torch compile passes
gshtras Jun 30, 2025
b381e56
Merge remote-tracking branch 'origin/amd/out_of_place_layernorm' into…
gshtras Jul 1, 2025
646a0f4
warpSize is being made non constexpr in ROCm 7.0 (#588)
gshtras Jul 1, 2025
d78b542
Conditional defaults on V1. Adding custom ops
gshtras Jul 1, 2025
b432b7a
Disable explicit commandr compilation to work around GPU crashes
gshtras Jul 1, 2025
f80b3c8
V1 for fp4 (#584)
maleksan85 Jul 8, 2025
97c32fc
Correct the logic for PYTORCH_ROCM_ARCH (#593)
pramenku Jul 11, 2025
dfbc6e2
Merge remote-tracking branch 'origin/0715_rc1' into ROCm-7.0
gshtras Jul 15, 2025
5ccdbd6
wip
gshtras Jul 15, 2025
8bbd5d4
cleanup
gshtras Jul 15, 2025
9d97adb
Add contiguous as a partial workaround
gshtras Jul 16, 2025
9fcf6d5
Using the shuffle fix from Ali
gshtras Jul 16, 2025
b9b2d82
Fixes in rope and aiter0.1.4 adjustment
gshtras Jul 18, 2025
7a1858d
Using aiter branch sith compiler fix
gshtras Jul 18, 2025
8b8f1c3
fix torch.compile issue
charlifu Jul 28, 2025
ed105af
Building triton on top of ROCm7.0 RC1; Apply fix from SWDEV-546201
gshtras Jul 29, 2025
f8dc6d4
Using triton from the triton-gfx950-launch branch
gshtras Jul 29, 2025
8d2ebe3
Using the workaround for the AITER API dtype. Using the workaround fr…
gshtras Jul 30, 2025
f15735b
Base image
gshtras Jul 30, 2025
2ee43db
update triton
kiran-thumma Aug 1, 2025
60f3f4b
update triton new commit
kiran-thumma Aug 1, 2025
891e379
Merge remote-tracking branch 'rocm/amd/out_of_place_layernorm'
gshtras Aug 5, 2025
8487cda
Merge remote-tracking branch 'origin/attention_fusion_v1' into 0715_rc1
gshtras Jun 30, 2025
396d6c4
Merge leftovers and build.fix
gshtras Aug 5, 2025
51b1469
Revert "[Front-end] microbatch tokenization (#19334)"
gshtras Aug 5, 2025
016c25b
RC specific changes - commandr compilation; CAR size increase; tags f…
gshtras Aug 5, 2025
e085291
Merge remote-tracking branch 'origin/0812_rc1' into 355_0805_rc1
gshtras Aug 5, 2025
76ff141
Rope merge leftover
gshtras Aug 5, 2025
c5aaf8c
clean up unneeded proxy methods
Aug 7, 2025
2a9a2af
Missing import
gshtras Aug 7, 2025
a852d7b
RC specific changes - commandr compilation; CAR size increase; tags f…
gshtras Aug 5, 2025
e760056
Revert "[Front-end] microbatch tokenization (#19334)"
gshtras Aug 8, 2025
340ea86
Update dockerfile. Use AITER commit with DS fix
gshtras Aug 8, 2025
4b08ad2
fp8 kv cache support for fp4 llama 3.1 405B
Aug 12, 2025
b82e6cb
merge branch 355_0805_rc1 into 355_wip
Aug 12, 2025
aa8021a
merge branch 0812_rc2 into 355_wip
Aug 12, 2025
f9748d3
restoring fp4 gemm asm aiter kernel correctness as per PR 630
Aug 13, 2025
f22b077
switching to well tested rocm base 24_ubuntu22.04_py3.10_pytorch_lw_r…
Aug 14, 2025
7518839
Use torch/triton versions for OSS support
gshtras Aug 14, 2025
2170b0d
Merge remote-tracking branch 'upstream/main' into 355_wip_oss_base_build
gshtras Aug 14, 2025
9eea278
Config changes refactor adjustment
gshtras Aug 14, 2025
b71976e
Merge fixes
gshtras Aug 14, 2025
38e3f1f
Building triton_kernels as part of building triton. Porting over mxfp…
gshtras Aug 14, 2025
67c53ad
restoring rupport for QuarkW8A8Fp8, QuarkW8A8Int8 after QuarkW4A4MXFP…
Aug 15, 2025
bc91ec3
Update triton_attn.py to remove output_scale
hongxiayang Aug 15, 2025
ee90cd2
Update triton_attn.py: put back output_scale (for broader context)
hongxiayang Aug 15, 2025
25be3d7
Move aiter to Dockerfile.rocm
Aug 17, 2025
3dbb8de
Move commit to aiter branch (355_wip)
Aug 18, 2025
602562b
Merge remote-tracking branch 'origin/355_0805_rc1_upstream' into 355_wip
Aug 20, 2025
68f2e92
[FP4] Padding FP4 gemm output dim0 to align 32
zejunchen-zejun Aug 20, 2025
2b43479
Update to ROCm 7.0 RC4 base image
Aug 20, 2025
a07ff61
Fix env for chunked prefill
fsx950223 Aug 20, 2025
1b1e7cd
add sink arg
fsx950223 Aug 20, 2025
4f43dae
Fix ARCH arguments
Aug 21, 2025
02a90d0
Update to new RC4 build with updated HIPBLASLT
Aug 21, 2025
ffaeba9
Allow Flash attention to build
Aug 21, 2025
95fa2bd
Remove copy of pytorch and vision modules
Aug 21, 2025
8ae5dee
Manually build pytorch/rocm7.1_internal_testing branch of triton
Aug 22, 2025
8eb058e
Add new ROCm image with triton_kernels installed
dllehr-amd Aug 23, 2025
0f04865
Add Day 0 vllm changes to 355_wip branch
Aug 25, 2025
a45886a
Update rocm_base to rocm/vllm-private:355_wip_base_image_0823x
dllehr-amd Aug 25, 2025
4d63faf
add AITER Triton RoPE as a registered ops with VLLM_USE_AITER_TRITON_…
k50112113 Aug 26, 2025
bcc4e69
Add triton gemm calls for unquantized gemms
Aug 26, 2025
25e4e87
remove duplicate aiter_GEMM_check function
Aug 26, 2025
93ee1c5
Add handling for non-contiguous x
rebklee Aug 27, 2025
1f732c1
Updated README.md for June 10 release (#574)
Mcirino1 Jun 11, 2025
ad2771e
Cleanup
gshtras Jun 12, 2025
a82a6f1
New typos checker
gshtras Jun 12, 2025
f0a71e6
Merge leftover
gshtras Jun 17, 2025
9ea0c6e
Cherry pick https://github.com/vllm-project/vllm/pull/19158
gshtras Jun 17, 2025
37897a6
Remove unused vars
gshtras Jun 20, 2025
26a85c7
Updated README.md for June 24 Docker release (#589)
Mcirino1 Jul 7, 2025
d61a61c
Minor changes to command line examples (#594)
Mcirino1 Jul 16, 2025
c15dc3d
Update test-pipeline.yaml (#599)
Alexei-V-Ivanov-AMD Jul 18, 2025
ec540fa
cleanup
gshtras Jul 29, 2025
f4a4bdb
support ck-tile blockquant gemm in vllm
Jul 31, 2025
5800181
Rebase the ck_tile_gemm branch to rocm/355_wip
eliotwang Sep 2, 2025
09ec68f
add triton fp8 gemm support
k50112113 Sep 2, 2025
48dc133
add fused_kv_cache support for llama fp8
k50112113 Sep 3, 2025
fc0dbad
Merge remote-tracking branch 'rocm/0909_rc2' into 355_wip_0909_rc2
Sep 4, 2025
25f843d
Merge branch '355_wip' into ck_tile_gemm
eliotwang Sep 4, 2025
f77bfba
sync with 0909_rc2 changes
Sep 4, 2025
867c2b3
sync with 0909_rc2 changes
Sep 4, 2025
caea443
Integrate mxfp4 MoE native kernels
mawong-amd Aug 15, 2025
176244a
Merge pull request #642 from eliotwang/ck_tile_gemm
charyang-ai Sep 5, 2025
f83d4df
Fix Qwen accuracy fix by not sending quant_config to MOE self.gate RLU
Sep 5, 2025
676b200
clean up
Sep 5, 2025
fb3d439
Merge branch '355_wip' into 355_wip_0909_rc2
Sep 5, 2025
0f826a6
merge artefact correction
Sep 8, 2025
b193a40
updated logic for attn selection with default split attn
Sep 9, 2025
78aa33e
updated logic for attn selection with default split attn and increase…
Sep 9, 2025
7a7123f
[BugFix][AMD][Deepseek] fix a dtype mismatch error for deepseek runni…
KingsleyZhang123 Aug 29, 2025
8 changes: 4 additions & 4 deletions .buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -10,7 +10,7 @@ export PYTHONPATH=".."
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then
echo "GPUs state is \"clean\""
break
fi
@@ -49,18 +49,18 @@ cleanup_docker

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state
echo "reset" > ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}

while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then
echo "GPUs state is \"clean\""
break
fi
done

echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

47 changes: 47 additions & 0 deletions .buildkite/test-template.j2
@@ -0,0 +1,47 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "vllm/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}

steps:
- label: ":docker: build image"
depends_on: ~
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
agents:
queue: amd-cpu
soft_fail: false

{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
depends_on:
- "amd-build"
agents:
{% if step.amd_gpus and step.amd_gpus==8%}
queue: amd_gpu
{% elif step.amd_gpus and step.amd_gpus==4%}
queue: amd_gpu
{% elif step.amd_gpus and step.amd_gpus==2%}
queue: amd_gpu
{% else%}
queue: amd_gpu
{% endif%}
commands:
- bash .buildkite/scripts/hardware_ci/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
priority: 100
soft_fail: false
{% endif %}
{% endfor %}
54 changes: 0 additions & 54 deletions .github/workflows/reminder_comment.yml

This file was deleted.

14 changes: 6 additions & 8 deletions .github/workflows/scripts/build.sh
@@ -1,22 +1,20 @@
#!/bin/bash
set -eux

python_executable=python$1
cuda_home=/usr/local/cuda-$2
python_executable=python3

# Update paths
PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
$python_executable -m pip install -r requirements/rocm.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"

rm -f "$(which sccache)"

bash tools/check_repo.sh
export MAX_JOBS=32

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
19 changes: 19 additions & 0 deletions ROCm_performance.md
@@ -0,0 +1,19 @@
# Overview of the optional performance features unique to https://github.com/ROCm/vllm

## Triton attention
The default attention backend on ROCm is the Triton attention kernel. To fall back to the https://github.com/ROCm/flash-attention implementation, set the following environment variable:
`VLLM_USE_TRITON_FLASH_ATTN=0`
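A minimal usage sketch, assuming a standard `vllm serve` launch (the model name is illustrative):

```bash
# Default: Triton attention kernel.
vllm serve meta-llama/Llama-3.1-8B-Instruct

# Fall back to the ROCm flash-attention implementation for this run only.
VLLM_USE_TRITON_FLASH_ATTN=0 vllm serve meta-llama/Llama-3.1-8B-Instruct
```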

## Tunable ops
PyTorch tunable ops are supported.
Set `PYTORCH_TUNABLEOP_ENABLED=1` to enable both runtime tuning and the subsequent use of tuned results. To use only the existing tuned results without tuning newly encountered shapes, also set `PYTORCH_TUNABLEOP_TUNING=0`.
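A possible two-phase workflow, assuming the vLLM latency benchmark as the workload (any model or workload would do):

```bash
# Phase 1: tune GEMM shapes at runtime and record the tuned results.
PYTORCH_TUNABLEOP_ENABLED=1 \
    python benchmarks/benchmark_latency.py --model <model>

# Phase 2: reuse the recorded results; do not tune newly encountered shapes.
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 \
    python benchmarks/benchmark_latency.py --model <model>
```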

## Custom PagedAttention

On ROCm, a custom paged attention kernel is available for better performance; it is controlled by the environment variable `VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1`.
This variable is currently enabled by default. To fall back to the PagedAttention v2 kernel, set it to 0.
The custom PagedAttention kernel is used for dtypes bf16 and fp16, block-size=16, head-size=128, max context length <= 16k, and a GQA ratio (num_heads//num_kv_heads) between 1 and 16. In all other cases, vLLM falls back to the PagedAttention v2 kernel.
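For example (a sketch; the serving command and model placeholder are assumptions), forcing the PagedAttention v2 path:

```bash
# The custom kernel is on by default; set to 0 to force PagedAttention v2,
# e.g. when a model falls outside the supported dtypes, head size, or GQA ratio.
VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=0 vllm serve <model>
```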

## NCCL Performance environment variable

For MI300X, setting the environment variable `NCCL_MIN_NCHANNELS=112` is expected to improve performance.
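A hedged example of a multi-GPU launch with this setting (the model and tensor-parallel size are illustrative):

```bash
# Raise the minimum number of channels used by NCCL/RCCL collectives on MI300X.
NCCL_MIN_NCHANNELS=112 vllm serve <model> --tensor-parallel-size 8
```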