vllm-project
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
Lines changed: 176 additions & 90 deletions
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
Lines changed: 6 additions & 0 deletions
@@ -9,26 +9,35 @@ steps:
     agents:
       queue: "cpu_queue_premerge"
 
-  # - label: "Test on NPU"
-  #   depends_on: ~
-  #   key: npu-test
-  #   commands:
-  #     - ".buildkite/scripts/hardware_ci/run_npu_test.sh"
-  #   agents:
-  #     queue: "ascend"
+  - label: "Upload Nightly Pipeline"
+    depends_on: image-build
+    if: build.env("NIGHTLY") == "1"
+    commands:
+      - buildkite-agent pipeline upload .buildkite/test-nightly.yaml
+    agents:
+      queue: "cpu_queue_premerge"
 
   - label: "Simple Unit Test"
-    depends_on: ~
+    depends_on: image-build
     commands:
-      - ".buildkite/scripts/simple_test.sh"
+      - "pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"
     agents:
-      queue: "cpu_queue_premerge"
+      queue: "gpu_1_queue"
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Diffusion Model Test"
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
+      - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model"
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
@@ -46,6 +55,7 @@ steps:
     depends_on: image-build
     commands:
       - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+      - pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
@@ -79,7 +89,7 @@ steps:
     timeout_in_minutes: 15
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+      - pytest -s -v -m 'core_model and cache and diffusion and not distributed_cuda and L4'
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
@@ -96,7 +106,7 @@ steps:
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
@@ -110,11 +120,11 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Diffusion Tensor Parallelism Test"
+  - label: "Diffusion GPU Worker Test"
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+      - pytest -s -v tests/diffusion/test_diffusion_worker.py
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
@@ -128,109 +138,185 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Diffusion GPU Worker Test"
+
+  # - label: "Benchmark&Engine Test"
+  #   timeout_in_minutes: 15
+  #   depends_on: image-build
+  #   commands:
+  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  #     - pytest -s -v tests/benchmarks/test_serve_cli.py
+  #     - pytest -s -v tests/engine/test_async_omni_engine_abort.py
+  #   agents:
+  #     queue: "mithril-h100-pool"
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           containers:
+  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #               resources:
+  #                 limits:
+  #                   nvidia.com/gpu: 2
+  #               volumeMounts:
+  #                 - name: devshm
+  #                   mountPath: /dev/shm
+  #                 - name: hf-cache
+  #                   mountPath: /root/.cache/huggingface
+  #               env:
+  #                 - name: HF_HOME
+  #                   value: /root/.cache/huggingface
+  #           nodeSelector:
+  #             node.kubernetes.io/instance-type: gpu-h100-sxm
+  #           volumes:
+  #             - name: devshm
+  #               emptyDir:
+  #                 medium: Memory
+  #             - name: hf-cache
+  #               hostPath:
+  #                 path: /mnt/hf-cache
+  #                 type: DirectoryOrCreate
+
+  - label: "Omni Model Test"
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
-      - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
+      - export VLLM_LOGGING_LEVEL=DEBUG
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
       - docker#v5.2.0:
           image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
           always-pull: true
           propagate-environment: true
-          shm-size: "8gb"
           environment:
             - "HF_HOME=/fsx/hf_cache"
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Omni Model Test"
-    timeout_in_minutes: 15
+  # - label: "Omni Model Test with H100"
+  #   timeout_in_minutes: 20
+  #   depends_on: image-build
+  #   commands:
+  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  #     - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+  #     - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+  #     - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
+  #   agents:
+  #     queue: "mithril-h100-pool"
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           containers:
+  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #               resources:
+  #                 limits:
+  #                   nvidia.com/gpu: 2
+  #               volumeMounts:
+  #                 - name: devshm
+  #                   mountPath: /dev/shm
+  #                 - name: hf-cache
+  #                   mountPath: /root/.cache/huggingface
+  #               env:
+  #                 - name: HF_HOME
+  #                   value: /root/.cache/huggingface
+  #           nodeSelector:
+  #             node.kubernetes.io/instance-type: gpu-h100-sxm
+  #           volumes:
+  #             - name: devshm
+  #               emptyDir:
+  #                 medium: Memory
+  #             - name: hf-cache
+  #               hostPath:
+  #                 path: /mnt/hf-cache
+  #                 type: DirectoryOrCreate
+
+  - label: "Qwen3-TTS E2E Test"
+    timeout_in_minutes: 20
     depends_on: image-build
     commands:
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+      - pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py
     agents:
-      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
+      queue: "gpu_4_queue"
     plugins:
       - docker#v5.2.0:
           image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
           always-pull: true
           propagate-environment: true
+          shm-size: "8gb"
           environment:
             - "HF_HOME=/fsx/hf_cache"
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Omni Model Test with H100"
-    timeout_in_minutes: 30
-    depends_on: image-build
-    commands:
-      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_async_omni.py
-    agents:
-      queue: "mithril-h100-pool"
-    plugins:
-      - kubernetes:
-          podSpec:
-            containers:
-              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                resources:
-                  limits:
-                    nvidia.com/gpu: 2
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                  - name: hf-cache
-                    mountPath: /root/.cache/huggingface
-                env:
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-            nodeSelector:
-              node.kubernetes.io/instance-type: gpu-h100-sxm
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-              - name: hf-cache
-                hostPath:
-                  path: /mnt/hf-cache
-                  type: DirectoryOrCreate
+  # - label: "Diffusion Image Edit Test with H100 (1 GPU)"
+  #   timeout_in_minutes: 20
+  #   depends_on: image-build
+  #   commands:
+  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  #     - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
+  #   agents:
+  #     queue: "mithril-h100-pool"
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           containers:
+  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #               resources:
+  #                 limits:
+  #                   nvidia.com/gpu: 1
+  #               volumeMounts:
+  #                 - name: devshm
+  #                   mountPath: /dev/shm
+  #                 - name: hf-cache
+  #                   mountPath: /root/.cache/huggingface
+  #               env:
+  #                 - name: HF_HOME
+  #                   value: /root/.cache/huggingface
+  #           nodeSelector:
+  #             node.kubernetes.io/instance-type: gpu-h100-sxm
+  #           volumes:
+  #             - name: devshm
+  #               emptyDir:
+  #                 medium: Memory
+  #             - name: hf-cache
+  #               hostPath:
+  #                 path: /mnt/hf-cache
+  #                 type: DirectoryOrCreate
 
-  - label: "Diffusion Image Edit Test with H100 (1 GPU)"
-    timeout_in_minutes: 20
-    depends_on: image-build
-    commands:
-      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py
-    agents:
-      queue: "mithril-h100-pool"
-    plugins:
-      - kubernetes:
-          podSpec:
-            containers:
-              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                resources:
-                  limits:
-                    nvidia.com/gpu: 1
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                  - name: hf-cache
-                    mountPath: /root/.cache/huggingface
-                env:
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-            nodeSelector:
-              node.kubernetes.io/instance-type: gpu-h100-sxm
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-              - name: hf-cache
-                hostPath:
-                  path: /mnt/hf-cache
-                  type: DirectoryOrCreate
+  # - label: "Bagel Text2Img Model Test with H100"
+  #   timeout_in_minutes: 30
+  #   depends_on: image-build
+  #   commands:
+  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  #     - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
+  #   agents:
+  #     queue: "mithril-h100-pool"
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           containers:
+  #             - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #               resources:
+  #                 limits:
+  #                   nvidia.com/gpu: 1
+  #               volumeMounts:
+  #                 - name: devshm
+  #                   mountPath: /dev/shm
+  #                 - name: hf-cache
+  #                   mountPath: /root/.cache/huggingface
+  #               env:
+  #                 - name: HF_HOME
+  #                   value: /root/.cache/huggingface
+  #           nodeSelector:
+  #             node.kubernetes.io/instance-type: gpu-h100-sxm
+  #           volumes:
+  #             - name: devshm
+  #               emptyDir:
+  #                 medium: Memory
+  #             - name: hf-cache
+  #               hostPath:
+  #                 path: /mnt/hf-cache
+  #                 type: DirectoryOrCreate
@@ -116,6 +116,9 @@ if [[ $commands == *"--shard-id="* ]]; then
         --shm-size=16gb \
         --group-add "$render_gid" \
         --rm \
+        -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+        -e MIOPEN_DEBUG_CONV_GEMM=0 \
+        -e VLLM_ROCM_USE_AITER=1 \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
@@ -148,6 +151,9 @@ else
           --shm-size=16gb \
           --group-add "$render_gid" \
           --rm \
+          -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+          -e MIOPEN_DEBUG_CONV_GEMM=0 \
+          -e VLLM_ROCM_USE_AITER=1 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \