
Commit c4258f4

Merge pull request ROCm#581 from ROCm/upstream_merge_2025_06_23 ("Upstream merge 2025 06 23")

2 parents c786fce + 324b18a

File tree

278 files changed: +13248 −4951 lines changed

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 2 additions & 1 deletion

@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     --name "${container_name}" \
     ${image_name} \
     /bin/bash -c "
+        set -e; # Exit on first error
         python3 /workspace/vllm/examples/offline_inference/neuron.py;
         python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
         for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-            echo 'Running test file: '$f;
+            echo \"Running test file: \$f\";
             python3 -m pytest \$f -v --capture=tee-sys;
         done
     "

.buildkite/test-pipeline.yaml

Lines changed: 10 additions & 1 deletion

@@ -89,7 +89,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill

@@ -271,6 +271,15 @@ steps:
   commands:
   - pytest -v -s prefix_caching
 
+
+- label: Platform Tests (CUDA)
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
 - label: Samplers Test # 36min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:

.github/mergify.yml

Lines changed: 28 additions & 0 deletions

@@ -45,6 +45,7 @@ pull_request_rules:
       - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
       - files~=^vllm/model_executor/models/.*llama.*\.py
       - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+      - title~=(?i)llama
   actions:
     label:
       add:

@@ -65,6 +66,33 @@ pull_request_rules:
       add:
         - multi-modality
 
+- name: label-performance
+  description: Automatically apply performance label
+  conditions:
+    - or:
+      - files~=^benchmarks/
+      - files~=^vllm/benchmarks/
+      - files~=^tests/benchmarks/
+      - files~=^\.buildkite/nightly-benchmarks/
+  actions:
+    label:
+      add:
+        - performance
+
+- name: label-qwen
+  description: Automatically apply qwen label
+  conditions:
+    - or:
+      - files~=^examples/.*qwen.*\.py
+      - files~=^tests/.*qwen.*\.py
+      - files~=^vllm/model_executor/models/.*qwen.*\.py
+      - files~=^vllm/reasoning/.*qwen.*\.py
+      - title~=(?i)Qwen
+  actions:
+    label:
+      add:
+        - qwen
+
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
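The two new title~= conditions extend the path-based rules so that a pull request whose title mentions the model family also gets the corresponding label. Mergify's ~= operator is a regular-expression match, and the inline (?i) flag makes it case-insensitive. A small illustration of that matching behaviour, assuming Mergify follows standard regex semantics (the sample titles below are invented, not taken from this commit):

import re

# Hypothetical PR titles, used only to show case-insensitive matching.
titles = ["Support Qwen3 reranker", "fix qwen tokenizer", "[Bugfix] Llama rope scaling"]
for title in titles:
    labels = []
    if re.search(r"(?i)qwen", title):
        labels.append("qwen")
    if re.search(r"(?i)llama", title):
        labels.append("llama")
    print(title, "->", labels)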

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions

@@ -115,6 +115,11 @@ repos:
       entry: python tools/check_spdx_header.py
       language: python
       types: [python]
+    - id: check-root-lazy-imports
+      name: Check root lazy imports
+      entry: python tools/check_init_lazy_imports.py
+      language: python
+      types: [python]
     - id: check-filenames
       name: Check for spaces in all filenames
       entry: bash
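The contents of tools/check_init_lazy_imports.py are not part of this diff, so the following is only a hypothetical sketch of the kind of check such a hook might run: scan a package's root __init__.py for module-level imports that should instead be deferred. The allowlist and default path here are assumptions made purely for illustration.

import ast
import sys

# Hypothetical allowlist of modules considered cheap enough to import eagerly.
ALLOWED_ROOTS = {"typing", "importlib", "os"}

def module_level_imports(path: str) -> list[str]:
    """Return top-level imported module names found in the given file."""
    tree = ast.parse(open(path, encoding="utf-8").read(), filename=path)
    found: list[str] = []
    for node in tree.body:  # only module-level statements, not nested ones
        if isinstance(node, ast.Import):
            found.extend(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module:
            found.append(node.module)
    return [m for m in found if m.split(".")[0] not in ALLOWED_ROOTS]

if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else "vllm/__init__.py"
    offenders = module_level_imports(target)
    if offenders:
        print("Non-lazy root imports found:", ", ".join(offenders))
        sys.exit(1)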

CMakeLists.txt

Lines changed: 3 additions & 3 deletions

@@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
+  # require CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"

README.md

Lines changed: 3 additions & 1 deletion

@@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Contact Us
 
+<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
-- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [[email protected]](mailto:[email protected])
+<!-- --8<-- [end:contact-us] -->
 
 ## Media Kit
 

benchmarks/backend_request_func.py

Lines changed: 7 additions & 1 deletion

@@ -404,8 +404,14 @@ async def async_request_openai_chat_completions(
                     chunk_bytes = chunk_bytes.strip()
                     if not chunk_bytes:
                         continue
+                    chunk_bytes = chunk_bytes.decode("utf-8")
+                    # NOTE: SSE comments (often used as pings) start with a colon.
+                    # These are not JSON data payload and should be skipped.
+                    if chunk_bytes.startswith(":"):
+                        continue
+
+                    chunk = chunk_bytes.removeprefix("data: ")
 
-                    chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                     if chunk != "[DONE]":
                         timestamp = time.perf_counter()
                         data = json.loads(chunk)
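The new check relies on a detail of Server-Sent Events framing: a line beginning with a colon is a comment, commonly emitted as a keep-alive ping, and carries no JSON payload, so it must be skipped before stripping the "data: " prefix and parsing. A minimal, self-contained sketch of the same parsing logic (the helper name is illustrative, not the benchmark's own):

import json

def parse_sse_line(raw: bytes):
    line = raw.strip().decode("utf-8")
    if not line or line.startswith(":"):
        return None                      # blank line or comment/ping: skip
    payload = line.removeprefix("data: ")
    if payload == "[DONE]":
        return None                      # end-of-stream marker
    return json.loads(payload)

print(parse_sse_line(b": ping - keep-alive"))      # None
print(parse_sse_line(b'data: {"choices": []}'))    # {'choices': []}
print(parse_sse_line(b"data: [DONE]"))             # None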

benchmarks/benchmark_dataset.py

Lines changed: 1 addition & 1 deletion

@@ -353,7 +353,7 @@ def sample(
                 : input_lens[i]
             ]
             prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = prefix_len + int(input_lens[i])
+            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
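The reason len(re_encoded_sequence) is the more accurate measure is that decoding a slice of token ids and re-encoding the resulting text does not always round-trip to the same number of tokens, so prefix_len + int(input_lens[i]) can drift from the length of the prompt actually sent. A small sketch of that drift, assuming a Hugging Face tokenizer (the model name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer

ids = tokenizer("The quick brown fox jumps over the lazy dog").input_ids
sliced = ids[:5]                                   # stand-in for the sampled prompt ids
re_encoded = tokenizer(tokenizer.decode(sliced)).input_ids

# The two lengths can differ because token boundaries may merge or split
# across the decode/encode round trip; the re-encoded length is what the
# server actually receives.
print(len(sliced), len(re_encoded))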

benchmarks/benchmark_throughput.py

Lines changed: 1 addition & 1 deletion

@@ -97,7 +97,7 @@ def run_vllm(
         assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
-        output_len = requests[0][2]
+        output_len = requests[0].expected_output_len
         for request in requests:
            assert request.expected_output_len == output_len
         start = time.perf_counter()
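The old expression requests[0][2] indexed the request as if it were a tuple; the benchmark's requests are SampleRequest objects, so the expected output length has to be read as an attribute. A minimal sketch with a trimmed-down, hypothetical stand-in for that class (the real dataclass has more fields):

from dataclasses import dataclass

@dataclass
class SampleRequest:          # illustrative stand-in, not the benchmark's full class
    prompt: str
    prompt_len: int
    expected_output_len: int

requests = [SampleRequest(prompt="Hello", prompt_len=1, expected_output_len=128)]

# requests[0][2] would raise TypeError: 'SampleRequest' object is not subscriptable
output_len = requests[0].expected_output_len
print(output_len)  # 128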

New file

Lines changed: 159 additions & 0 deletions

@@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools

import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
    moe_align_block_size_triton,
)
from vllm.triton_utils import triton


def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
    return torch.stack(
        [
            torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
            for _ in range(num_tokens)
        ]
    )


def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
    """
    Verifies vllm vs. Triton
    """
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

    # 1. malloc space for triton and vllm
    # malloc enough space (max_num_tokens_padded) for the sorted ids
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids_triton = torch.empty(
        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
    )
    sorted_ids_triton.fill_(topk_ids.numel())  # fill with sentinel value
    expert_ids_triton = torch.zeros(
        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
    )
    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")

    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
    sorted_ids_vllm.fill_(topk_ids.numel())
    expert_ids_vllm = torch.zeros_like(expert_ids_triton)
    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)

    # 2. run implementations
    moe_align_block_size_triton(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids_triton,
        expert_ids_triton,
        num_tokens_post_pad_triton,
    )

    ops.moe_align_block_size(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids_vllm,
        expert_ids_vllm,
        num_tokens_post_pad_vllm,
    )
    print(f"✅ VLLM implementation works with {num_experts} experts!")

    # 3. compare results
    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
    ):
        print("✅ Triton and VLLM implementations match.")
    else:
        print("❌ Triton and VLLM implementations DO NOT match.")
        print("Triton expert_ids:", expert_ids_triton)
        print("VLLM expert_ids:", expert_ids_vllm)
        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)


# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens", "num_experts", "topk"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["vllm", "triton"],  # "triton"
        line_names=["VLLM", "Triton"],  # "Triton"
        plot_name="moe-align-block-size-performance",
        args={},
    )
)
def benchmark(num_tokens, num_experts, topk, provider):
    """Benchmark function for Triton."""
    block_size = 256
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
    sorted_ids.fill_(topk_ids.numel())
    max_num_m_blocks = max_num_tokens_padded // block_size
    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")

    quantiles = [0.5, 0.2, 0.8]

    if provider == "vllm":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: ops.moe_align_block_size(
                topk_ids,
                num_experts,
                block_size,
                sorted_ids.clone(),
                expert_ids.clone(),
                num_tokens_post_pad.clone(),
            ),
            quantiles=quantiles,
        )
    elif provider == "triton":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: moe_align_block_size_triton(
                topk_ids,
                num_experts,
                block_size,
                sorted_ids.clone(),
                expert_ids.clone(),
                num_tokens_post_pad.clone(),
            ),
            quantiles=quantiles,
        )

    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num_experts",
        type=int,
        default=64,
        choices=[8, 16, 32, 64, 128, 256],
    )
    parser.add_argument(
        "--topk",
        type=int,
        default=8,
        choices=[2, 4, 8],
        help="Top-k value for correctness check.",
    )
    args = parser.parse_args()

    print("Running correctness check...")
    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
    benchmark.run(print_data=True, show_plots=True)
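For context on what is being benchmarked: moe_align_block_size groups the flattened (token, expert) assignments by expert and pads each expert's group up to a multiple of block_size, producing the grouped slot indices, one expert id per block, and the padded total. The following is a simplified, hypothetical pure-PyTorch reference of that layout, written only to illustrate the outputs the script compares; the real CUDA and Triton kernels preallocate max_num_tokens_padded entries and may differ in edge cases.

import torch

def moe_align_block_size_ref(topk_ids: torch.Tensor, num_experts: int, block_size: int):
    """Illustrative reference: group token slots by expert, pad each group to block_size."""
    flat = topk_ids.flatten()
    sentinel = flat.numel()  # same padding value the benchmark fills in above
    sorted_chunks, expert_ids = [], []
    for expert in range(num_experts):
        slots = torch.nonzero(flat == expert, as_tuple=False).flatten().to(torch.int32)
        padded_len = -(-slots.numel() // block_size) * block_size  # ceil to a block multiple
        chunk = torch.full((padded_len,), sentinel, dtype=torch.int32)
        chunk[: slots.numel()] = slots
        sorted_chunks.append(chunk)
        expert_ids.extend([expert] * (padded_len // block_size))
    sorted_ids = torch.cat(sorted_chunks)
    num_tokens_post_pad = torch.tensor([sorted_ids.numel()], dtype=torch.int32)
    return sorted_ids, torch.tensor(expert_ids, dtype=torch.int32), num_tokens_post_pad

# Tiny CPU example: 4 tokens, top-2 routing over 4 experts, block_size=4.
topk_ids = torch.tensor([[0, 1], [1, 2], [0, 3], [2, 3]], dtype=torch.int32)
sorted_ids, expert_ids, total = moe_align_block_size_ref(topk_ids, num_experts=4, block_size=4)
print(sorted_ids, expert_ids, total)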
