Move the mirrored AMD Buildkite steps to MI325 agents.

buildkite/test-template-ci.j2
  * Relabel mirrored AMD steps from "AMD MI300" to "AMD MI325".
  * Extend the per-label queue routing: listed labels go to amd_mi325_8,
    amd_mi325_4 or amd_mi325_2; every other label falls through to amd_mi325_1.
  * Emit "soft_fail: false" only for the step labelled "Regression Test";
    all other mirrored AMD steps get "soft_fail: true".

buildkite/test-template-fastcheck.j2
  * Relabel the "Basic Correctness Test" block and its step as MI325 and run
    the step on the amd_mi325_1 queue.

NOTE(review): leading whitespace inside the hunks and the position of the one
blank context line in the first and third hunk were reconstructed from a
whitespace-collapsed copy of this patch. The post-image blob ids on the
"index" lines were not recomputed. Verify the context against the target
files before applying.

diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2
index 15324965..196517ab 100644
--- a/buildkite/test-template-ci.j2
+++ b/buildkite/test-template-ci.j2
@@ -495,14 +495,14 @@ steps:
 
   {% for step in steps %}
   {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
-  - label: "AMD MI300: {{ step.label }}"
+  - label: "AMD MI325: {{ step.label }}"
     depends_on: amd-build
     agents:
-    {% if step.label and step.label=="Benchmarks" or step.label=="Kernels Attention Test %N" or step.label=="LoRA Test %N" or step.label=="Kernels Quantization Test %N" %}
+    {% if step.label=="LoRA Test %N" or step.label=="Kernels Attention Test %N" or step.label=="Kernels Quantization Test %N" or step.label=="Kernels MoE Test %N" or step.label=="Benchmarks" or step.label=="Benchmarks CLI Test" or step.label=="Basic Models Tests (Extra Initialization) %N" or step.label=="Language Models Tests (Hybrid) %N" %}
       queue: amd_mi325_8
-    {% elif step.label=="Distributed Tests (4 GPUs)" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" %}
+    {% elif step.label=="Distributed Tests (4 GPUs)" or step.label=="EPLB Execution Test" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" or step.label=="Pipeline + Context Parallelism Test" %}
       queue: amd_mi325_4
-    {% elif step.label=="Distributed Comm Ops Test" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Weight Loading Multiple GPU Test" or step.label=="Weight Loading Multiple GPU Test - Large Models" %}
+    {% elif step.label=="Metrics, Tracing Test" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Weight Loading Multiple GPU Test" or step.label=="Weight Loading Multiple GPU Test - Large Models" %}
       queue: amd_mi325_2
     {% else %}
       queue: amd_mi325_1
@@ -511,7 +511,11 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
     priority: 100
+    {% if step.label and step.label=="Regression Test" %}
     soft_fail: false
+    {% else %}
+    soft_fail: true
+    {% endif %}
   {% endif %}
   {% endfor %}
   {% for step in steps %}
diff --git a/buildkite/test-template-fastcheck.j2 b/buildkite/test-template-fastcheck.j2
index 49c3d593..229a1408 100644
--- a/buildkite/test-template-fastcheck.j2
+++ b/buildkite/test-template-fastcheck.j2
@@ -346,14 +346,14 @@ steps:
   {% for step in steps %}
   {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
   {% if step.label and step.label=="Basic Correctness Test" %}
-  - block: "Run AMD MI300: {{ step.label }} with {{mirror_hw}}"
+  - block: "Run AMD MI325: {{ step.label }} with {{mirror_hw}}"
     key: block-amd-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
     depends_on: amd-build
 
-  - label: "AMD MI300: {{ step.label }} with {{mirror_hw}}"
+  - label: "AMD MI325: {{ step.label }} with {{mirror_hw}}"
     depends_on: block-amd-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
     agents:
-      queue: amd_mi300_1
+      queue: amd_mi325_1
     command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
     env:
       DOCKER_BUILDKIT: "1"