vllm-project
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmark/benchmark_reshape_and_cache.py
Lines changed: 180 additions & 0 deletions b/‎benchmark/benchmark_reshape_and_cache.py
Lines changed: 180 additions & 0 deletions
diff --git a/‎benchmark/benchmark_reshape_and_cache_flash.py
Lines changed: 181 additions & 0 deletions b/‎benchmark/benchmark_reshape_and_cache_flash.py
Lines changed: 181 additions & 0 deletions
@@ -146,6 +146,7 @@ endif()
 
 if(VLLM_GPU_LANG STREQUAL "SYCL")
   set(VLLM_EXT_SRC
+    "csrc/xpu/cache.cpp"
     "csrc/xpu/layernorm.cpp"
     "csrc/xpu/torch_bindings.cpp"
   )
 
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from tests import register_ops as ops
+from tests.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    num_iters: int,
+    device: str = "xpu",
+) -> float:
+    """Measure the mean per-call latency of ``ops.reshape_and_cache``.
+
+    Builds random key/value tensors of shape
+    ``[num_tokens, num_heads, head_size]``, a slot mapping holding
+    ``num_tokens`` distinct slots and a single-layer KV cache, then times
+    ``num_iters`` back-to-back calls after a 3-call warm-up.
+
+    Args:
+        num_tokens: Number of tokens written per call.
+        num_heads: Second dimension of the key/value tensors.
+        head_size: Last dimension of the key/value tensors.
+        block_size: Slots per cache block
+            (total slots = ``block_size * num_blocks``).
+        num_blocks: Number of cache blocks.
+        dtype: dtype of the key/value tensors.
+        kv_cache_dtype: Cache dtype string, forwarded both to
+            ``create_kv_caches_with_random`` and to the op.
+        num_iters: Number of timed calls.
+        device: Device the tensors are created on.
+            NOTE(review): synchronization and cache cleanup always go
+            through ``torch.xpu``, whatever ``device`` is.
+
+    Returns:
+        Wall-clock seconds per call, averaged over ``num_iters`` calls.
+
+    Raises:
+        ValueError: If ``kv_cache_dtype`` is ``"fp8"`` and ``head_size`` is
+            not a multiple of 16, or if ``num_tokens`` exceeds
+            ``block_size * num_blocks``.
+    """
+
+    # Only the exact string "fp8" is checked here; the "fp8_e4m3" and
+    # "fp8_e5m2" variants are not covered by this guard.
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError(
+            "fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    # Fixed seeds so every run draws the same tensors and slot mapping.
+    seed = 42
+    random.seed(seed)
+    torch.manual_seed(seed)
+    # NOTE: this changes torch's default device and is not restored on
+    # return.
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache
+    # (random.sample draws without replacement).
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError(
+            "num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)
+
+    num_layers = 1  # for simplicity, we use a single layer
+    key_caches, value_caches = create_kv_caches_with_random(
+        num_blocks,
+        block_size,
+        num_layers,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    # Only one layer was requested, so take its key and value cache.
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    # They are computed and passed to the op for every kv_cache_dtype.
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    def run_xpu_benchmark(n_iters: int) -> float:
+        """Time ``n_iters`` back-to-back calls; return mean seconds/call."""
+        # The names below are only read here, never rebound.
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        # Drain pending XPU work so it is not charged to the timed region.
+        torch.xpu.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            ops.reshape_and_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
+        # Wait for the queued calls to finish before reading the clock.
+        torch.xpu.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up: 3 calls whose timing is discarded
+    run_xpu_benchmark(3)
+
+    lat = run_xpu_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.xpu.empty_cache()
+
+    return lat
+
+
+def main(args):
+    """Sweep ``num_tokens`` over powers of two and print a latency table.
+
+    Calls ``run_benchmark`` for 2, 4, ..., 2048 tokens and prints one table
+    row per sweep point, with the latency converted to microseconds.
+
+    The sweep stops early once the token count would exceed the number of
+    cache slots (``block_size * num_blocks``). ``run_benchmark`` raises
+    ``ValueError`` in that case, which would otherwise discard every row
+    already measured before the table is printed.
+
+    Args:
+        args: Parsed command-line namespace providing ``num_heads``,
+            ``head_size``, ``block_size``, ``num_blocks``, ``dtype``,
+            ``kv_cache_dtype`` and ``iters``.
+    """
+    # Largest token count run_benchmark accepts for this cache geometry.
+    max_tokens = args.block_size * args.num_blocks
+
+    rows = []
+    for exp in range(1, 12):
+        n_tok = 2**exp
+        if n_tok > max_tokens:
+            # Keep the rows collected so far instead of crashing mid-sweep.
+            print(f"Stopping sweep at {n_tok} tokens: only {max_tokens} "
+                  "cache slots are available.")
+            break
+        lat = run_benchmark(
+            num_tokens=n_tok,
+            num_heads=args.num_heads,
+            head_size=args.head_size,
+            block_size=args.block_size,
+            num_blocks=args.num_blocks,
+            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+            kv_cache_dtype=args.kv_cache_dtype,
+            num_iters=args.iters,
+            device="xpu",
+        )
+        rows.append([
+            n_tok,
+            args.num_heads,
+            args.head_size,
+            args.block_size,
+            args.num_blocks,
+            args.dtype,
+            args.kv_cache_dtype,
+            f"{lat * 1e6:.3f}",  # seconds -> microseconds
+        ])
+    print(
+        tabulate(
+            rows,
+            headers=[
+                "num_tokens",
+                "num_heads",
+                "head_size",
+                "block_size",
+                "num_blocks",
+                "dtype",
+                "kv_cache_dtype",
+                "latency (us)",
+            ],
+        ))
+
+
+if __name__ == "__main__":
+    # argparse is only needed when this file is run as a script.
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    # Geometry of the key/value tensors and of the KV cache.
+    parser.add_argument("--num-heads", default=8, type=int)
+    parser.add_argument("--head-size",
+                        default=128,
+                        type=int,
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256])
+    parser.add_argument(
+        "--block-size",
+        default=64,
+        type=int,
+        choices=[16, 32, 64],
+    )
+    parser.add_argument("--num-blocks", default=1024, type=int)
+
+    # dtype of the key/value tensors and of the cache.
+    parser.add_argument("--dtype",
+                        default="half",
+                        type=str,
+                        choices=["half", "bfloat16"])
+    parser.add_argument("--kv-cache-dtype",
+                        default="auto",
+                        type=str,
+                        choices=["auto", "fp8", "fp8_e4m3", "fp8_e5m2"])
+
+    # Number of timed calls per sweep point.
+    parser.add_argument("--iters", default=100, type=int)
+
+    args = parser.parse_args()
+    main(args)
@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from tests import register_ops as ops
+from tests.utils import (STR_DTYPE_TO_TORCH_DTYPE,
+                         create_kv_caches_with_random_flash)
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    num_iters: int,
+    device: str = "xpu",
+) -> float:
+    """Measure the mean per-call latency of ``ops.reshape_and_cache_flash``.
+
+    Builds random key/value tensors of shape
+    ``[num_tokens, num_heads, head_size]``, a slot mapping holding
+    ``num_tokens`` distinct slots and a single-layer KV cache, then times
+    ``num_iters`` back-to-back calls after a 3-call warm-up.
+
+    Args:
+        num_tokens: Number of tokens written per call.
+        num_heads: Second dimension of the key/value tensors.
+        head_size: Last dimension of the key/value tensors.
+        block_size: Slots per cache block
+            (total slots = ``block_size * num_blocks``).
+        num_blocks: Number of cache blocks.
+        dtype: dtype of the key/value tensors.
+        kv_cache_dtype: Cache dtype string, forwarded both to
+            ``create_kv_caches_with_random_flash`` and to the op.
+        num_iters: Number of timed calls.
+        device: Device the tensors are created on.
+            NOTE(review): synchronization and cache cleanup always go
+            through ``torch.xpu``, whatever ``device`` is.
+
+    Returns:
+        Wall-clock seconds per call, averaged over ``num_iters`` calls.
+
+    Raises:
+        ValueError: If ``kv_cache_dtype`` is ``"fp8"`` and ``head_size`` is
+            not a multiple of 16, or if ``num_tokens`` exceeds
+            ``block_size * num_blocks``.
+    """
+
+    # Only the exact string "fp8" is checked here; the "fp8_e4m3" and
+    # "fp8_e5m2" variants are not covered by this guard.
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError(
+            "fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    # Fixed seeds so every run draws the same tensors and slot mapping.
+    seed = 42
+    random.seed(seed)
+    torch.manual_seed(seed)
+    # NOTE: this changes torch's default device and is not restored on
+    # return.
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache
+    # (random.sample draws without replacement).
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError(
+            "num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst,
+                                dtype=torch.long,
+                                device=device)
+
+    num_layers = 1  # for simplicity, we use a single layer
+    key_caches, value_caches = create_kv_caches_with_random_flash(
+        num_blocks,
+        block_size,
+        num_layers,
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    # Only one layer was requested, so take its key and value cache.
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    # They are computed and passed to the op for every kv_cache_dtype.
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    def run_xpu_benchmark(n_iters: int) -> float:
+        """Time ``n_iters`` back-to-back calls; return mean seconds/call."""
+        # The names below are only read here, never rebound.
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        # Drain pending XPU work so it is not charged to the timed region.
+        torch.xpu.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
+        # Wait for the queued calls to finish before reading the clock.
+        torch.xpu.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up: 3 calls whose timing is discarded
+    run_xpu_benchmark(3)
+
+    lat = run_xpu_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.xpu.empty_cache()
+
+    return lat
+
+
+def main(args):
+    """Sweep ``num_tokens`` over powers of two and print a latency table.
+
+    Calls ``run_benchmark`` for 2, 4, ..., 2048 tokens and prints one table
+    row per sweep point, with the latency converted to microseconds.
+
+    The sweep stops early once the token count would exceed the number of
+    cache slots (``block_size * num_blocks``). ``run_benchmark`` raises
+    ``ValueError`` in that case, which would otherwise discard every row
+    already measured before the table is printed.
+
+    Args:
+        args: Parsed command-line namespace providing ``num_heads``,
+            ``head_size``, ``block_size``, ``num_blocks``, ``dtype``,
+            ``kv_cache_dtype`` and ``iters``.
+    """
+    # Largest token count run_benchmark accepts for this cache geometry.
+    max_tokens = args.block_size * args.num_blocks
+
+    rows = []
+    for exp in range(1, 12):
+        n_tok = 2**exp
+        if n_tok > max_tokens:
+            # Keep the rows collected so far instead of crashing mid-sweep.
+            print(f"Stopping sweep at {n_tok} tokens: only {max_tokens} "
+                  "cache slots are available.")
+            break
+        lat = run_benchmark(
+            num_tokens=n_tok,
+            num_heads=args.num_heads,
+            head_size=args.head_size,
+            block_size=args.block_size,
+            num_blocks=args.num_blocks,
+            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+            kv_cache_dtype=args.kv_cache_dtype,
+            num_iters=args.iters,
+            device="xpu",
+        )
+        rows.append([
+            n_tok,
+            args.num_heads,
+            args.head_size,
+            args.block_size,
+            args.num_blocks,
+            args.dtype,
+            args.kv_cache_dtype,
+            f"{lat * 1e6:.3f}",  # seconds -> microseconds
+        ])
+    print(
+        tabulate(
+            rows,
+            headers=[
+                "num_tokens",
+                "num_heads",
+                "head_size",
+                "block_size",
+                "num_blocks",
+                "dtype",
+                "kv_cache_dtype",
+                "latency (us)",
+            ],
+        ))
+
+
+if __name__ == "__main__":
+    # argparse is only needed when this file is run as a script.
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    # Geometry of the key/value tensors and of the KV cache.
+    parser.add_argument("--num-heads", default=8, type=int)
+    parser.add_argument("--head-size",
+                        default=128,
+                        type=int,
+                        choices=[64, 80, 96, 112, 120, 128, 192, 256])
+    parser.add_argument(
+        "--block-size",
+        default=64,
+        type=int,
+        choices=[16, 32, 64],
+    )
+    parser.add_argument("--num-blocks", default=512, type=int)
+
+    # dtype of the key/value tensors and of the cache.
+    parser.add_argument("--dtype",
+                        default="half",
+                        type=str,
+                        choices=["half", "bfloat16"])
+    parser.add_argument("--kv-cache-dtype",
+                        default="auto",
+                        type=str,
+                        choices=["auto", "fp8", "fp8_e4m3", "fp8_e5m2"])
+
+    # Number of timed calls per sweep point.
+    parser.add_argument("--iters", default=100, type=int)
+
+    args = parser.parse_args()
+    main(args)
Original file line number	Diff line number	Diff line change
`@@ -146,6 +146,7 @@ endif()`
`146`	`146`
`147`	`147`	`if(VLLM_GPU_LANG STREQUAL "SYCL")`
`148`	`148`	`set(VLLM_EXT_SRC`
	`149`	`+ "csrc/xpu/cache.cpp"`
`149`	`150`	`"csrc/xpu/layernorm.cpp"`
`150`	`151`	`"csrc/xpu/torch_bindings.cpp"`
`151`	`152`	`)`