From c225f961e7a281ce6d1fc72fb6b78787964dc3a5 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 11 Nov 2025 11:07:25 -0800
Subject: [PATCH 1/7] init

---
 backends/aoti/common_shims.cpp                    |   4 +
 backends/aoti/common_shims.h                      |   1 +
 backends/aoti/utils.h                             |   2 +
 backends/cuda/cuda_backend.py                     |   6 +-
 backends/cuda/runtime/shims/memory.cpp            |  90 +++++
 backends/cuda/runtime/shims/memory.h              |  25 ++
 backends/cuda/runtime/utils.h                     |   5 +-
 .../optimized_sdpa_triton.py                      | 284 ++++++++++++++++
 custom_triton_playground/sdpa_triton.py           | 317 ++++++++++++++++++
 .../test-sdpa-with-custom-kernel.py               | 151 +++++++++
 extension/runner_util/inputs.h                    |   2 +-
 11 files changed, 880 insertions(+), 7 deletions(-)
 create mode 100644 custom_triton_playground/optimized_sdpa_triton.py
 create mode 100644 custom_triton_playground/sdpa_triton.py
 create mode 100644 custom_triton_playground/test-sdpa-with-custom-kernel.py

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index deb10478778..f1a6cc031b8 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -164,6 +164,10 @@ int32_t aoti_torch_layout_strided() {
 }
 
 // Dtype constants - these return the PyTorch dtype codes
+int32_t aoti_torch_dtype_float16() {
+  return 5; // PyTorch's float16 dtype code
+}
+
 int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 91bb785b684..6600d05b1f5 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -57,6 +57,7 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 // Utility functions for device and layout information
 int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
+int32_t aoti_torch_dtype_float16();
 int32_t aoti_torch_dtype_float32();
 int32_t aoti_torch_dtype_bfloat16();
 int32_t aoti_torch_dtype_int8();
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 8f64bdbe7da..f457b3229c5 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -43,6 +43,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
       return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
+    case 5: // PyTorch's float16 (half) dtype code
+      return executorch::aten::ScalarType::Half;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 11: // PyTorch's bool dtype code
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index f8482835ea5..86571baa47a 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -162,11 +162,7 @@ def preprocess(
             "max_autotune_conv_backends": "TRITON",
         }
 
-        with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
-            [
-                SDPBackend.MATH  # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
-            ]
-        ), torch.no_grad():
+        with collect_unsupported_fallback_kernels(), torch.no_grad():
             # torch._logging.set_logs(post_grad_graphs=True)
             # Here we should expect 1 so file and 1 weight blob in the same directory.
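             # A sketch of the expected artifacts (illustrative filenames only):
             # aot_compile returns a list of output paths, from which the later
             # packaging code picks the compiled kernel library (.so) and the
             # on-disk weight blob, e.g.:
             #   paths -> [".../model.so", ".../aoti_cuda_blob.ptd"]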
             paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 46b8d448a3a..ccaf2538dca 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -582,6 +582,96 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) {
   return Error::Ok;
 }
 
+AOTITorchError aoti_torch_new_tensor_handle(
+    Tensor* orig_handle,
+    Tensor** new_handle) {
+  // Validate input parameters
+  ET_CHECK_OR_RETURN_ERROR(
+      orig_handle != nullptr,
+      InvalidArgument,
+      "aoti_torch_new_tensor_handle failed: orig_handle is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      new_handle != nullptr,
+      InvalidArgument,
+      "aoti_torch_new_tensor_handle failed: new_handle is null");
+
+  // Get metadata from the original tensor
+  int64_t* sizes_ptr;
+  int64_t* strides_ptr;
+  int32_t dtype;
+  int32_t device_type;
+  int32_t device_index;
+
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(orig_handle, &sizes_ptr));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_strides(orig_handle, &strides_ptr));
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(orig_handle, &dtype));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_device_type(orig_handle, &device_type));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_device_index(orig_handle, &device_index));
+
+  int64_t ndim = orig_handle->dim();
+
+  // Validate dtype
+  ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
+
+  // Ensure device_index is always 0
+  ET_CHECK_OR_RETURN_ERROR(
+      device_index == 0,
+      InvalidArgument,
+      "device_index must be 0, got: %d",
+      device_index);
+
+  // Get the original data pointer from the source tensor
+  void* data_ptr = orig_handle->mutable_data_ptr();
+  ET_CHECK_OR_RETURN_ERROR(
+      data_ptr != nullptr,
+      InvalidArgument,
+      "Source tensor has null data pointer");
+
+  // Check if the given memory is in the map
+  auto memory_it = memory_to_n_tensor.find(data_ptr);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it != memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is not being tracked by reference counting system",
+      data_ptr);
+
+  // Convert sizes and strides to vectors
+  std::vector<int64_t> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
+  std::vector<int64_t> strides =
+      convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
+
+  // Create new tensor that shares the same memory as the original
+  // This is similar to PyTorch's Tensor copy constructor - creates a new
+  // tensor object that shares the same underlying storage
+  std::shared_ptr<Tensor> tensor = make_tensor(
+      sizes, // Same sizes as original
+      data_ptr, // Share the same memory from source tensor
+      {}, // dim_order (empty, will be auto-generated)
+      strides, // Same strides as original
+      dtype_to_scalar_type(dtype) // Same dtype as original
+  );
+
+  ET_CHECK_OR_RETURN_ERROR(
+      tensor != nullptr, InvalidArgument, "Failed to create new tensor handle");
+
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
+
+  *new_handle = tensor.get();
+
+  // Increment the reference count for this memory address only if it is owned
+  // by tensor
+  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
+      ? NOT_OWN
+      : memory_to_n_tensor[data_ptr] + 1;
+
+  return Error::Ok;
+}
+
 AOTITorchError aoti_torch__reinterpret_tensor(
     Tensor* self,
     int64_t ndim,
diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
index 7a8d4c3609b..3411f47cda5 100644
--- a/backends/cuda/runtime/shims/memory.h
+++ b/backends/cuda/runtime/shims/memory.h
@@ -114,6 +114,31 @@ AOTITorchError aoti_torch__reinterpret_tensor(
     int64_t storage_offset,
     Tensor** ret_new_tensor);
 
+/**
+ * Creates a new tensor handle from an existing one.
+ *
+ * This function creates a new tensor object that shares the same underlying
+ * memory as the original tensor. Similar to PyTorch's Tensor copy constructor,
+ * it creates a new handle/reference to the same data without performing a deep
+ * copy.
+ *
+ * The new tensor will:
+ * - Share the same memory/storage as the original tensor
+ * - Have the same shape, strides, and dtype as the original
+ * - Increment the reference count for the underlying memory (if owned)
+ *
+ * @param orig_handle Original tensor to create a new handle from (must not be
+ * null)
+ * @param new_handle Output pointer to store the new tensor handle (must not be
+ * null)
+ *
+ * @return Error::Ok on success, appropriate error code on failure:
+ *         - Error::InvalidArgument: null pointers or invalid parameters
+ */
+AOTITorchError aoti_torch_new_tensor_handle(
+    Tensor* orig_handle,
+    Tensor** new_handle);
+
 /**
  * Copies data from source tensor to destination tensor.
  *
diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
index 4474f8cf57e..544322a7fee 100644
--- a/backends/cuda/runtime/utils.h
+++ b/backends/cuda/runtime/utils.h
@@ -61,6 +61,7 @@ enum class SupportedDTypes : int32_t {
   INT16 = 2, // PyTorch's int16 dtype code
   INT32 = 3, // PyTorch's int32 dtype code
   INT64 = 4, // PyTorch's int64 dtype code
+  FLOAT16 = 5, // PyTorch's float16 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   BOOL = 11, // PyTorch's bool dtype code
   BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
@@ -84,6 +85,7 @@ inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
     case static_cast<int32_t>(SupportedDTypes::INT16):
     case static_cast<int32_t>(SupportedDTypes::INT32):
     case static_cast<int32_t>(SupportedDTypes::INT64):
+    case static_cast<int32_t>(SupportedDTypes::FLOAT16):
    case static_cast<int32_t>(SupportedDTypes::FLOAT32):
     case static_cast<int32_t>(SupportedDTypes::BOOL):
     case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
@@ -98,12 +100,13 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
   ET_CHECK_OR_RETURN_ERROR(
       is_dtype_supported_in_et_cuda(dtype),
       InvalidArgument,
-      "Unsupported dtype: %d. Supported dtypes: %d (int8), %d (int16), %d (int32), %d (int64), %d (float32), %d (bool), %d (bfloat16)",
+      "Unsupported dtype: %d. Supported dtypes: %d (int8), %d (int16), %d (int32), %d (int64), %d (float16), %d (float32), %d (bool), %d (bfloat16)",
       dtype,
       static_cast<int32_t>(SupportedDTypes::INT8),
       static_cast<int32_t>(SupportedDTypes::INT16),
       static_cast<int32_t>(SupportedDTypes::INT32),
       static_cast<int32_t>(SupportedDTypes::INT64),
+      static_cast<int32_t>(SupportedDTypes::FLOAT16),
       static_cast<int32_t>(SupportedDTypes::FLOAT32),
       static_cast<int32_t>(SupportedDTypes::BOOL),
       static_cast<int32_t>(SupportedDTypes::BFLOAT16));
diff --git a/custom_triton_playground/optimized_sdpa_triton.py b/custom_triton_playground/optimized_sdpa_triton.py
new file mode 100644
index 00000000000..df267202d29
--- /dev/null
+++ b/custom_triton_playground/optimized_sdpa_triton.py
@@ -0,0 +1,284 @@
+import math
+from typing import Any, Optional
+
+import torch
+import triton
+import triton.language as tl
+from torch.library import triton_op, wrap_triton
+
+
+@triton.autotune(
+    configs=[
+        # Favor configs tuned for HEAD_DIM=64 and L up to ~1500
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_stages=4, num_warps=8),
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 256}, num_stages=4, num_warps=8),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 256}, num_stages=4, num_warps=4),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_stages=4, num_warps=4),
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_stages=3, num_warps=4),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=3, num_warps=4),
+    ],
+    key=["L", "HEAD_DIM"],
+)
+@triton.jit
+def _sdpa_fwd_kernel(
+    q_ptr,
+    k_ptr,
+    v_ptr,
+    o_ptr,
+    B,
+    H,
+    L,
+    HEAD_DIM,
+    stride_qb,
+    stride_qh,
+    stride_ql,
+    stride_qd,
+    stride_kb,
+    stride_kh,
+    stride_kl,
+    stride_kd,
+    stride_vb,
+    stride_vh,
+    stride_vl,
+    stride_vd,
+    stride_ob,
+    stride_oh,
+    stride_ol,
+    stride_od,
+    sm_scale,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    HEAD_DIM_CE: tl.constexpr,
+):
+    # Program IDs
+    pid_m = tl.program_id(axis=0)  # along query length
+    pid_hz = tl.program_id(axis=1)  # flattened batch*head
+
+    off_b = pid_hz // H
+    off_h = pid_hz % H
+
+    # Compute ranges
+    start_m = pid_m * BLOCK_M
+    offs_m = start_m + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, HEAD_DIM_CE)
+    mask_m = offs_m < L
+
+    # Base pointers for this (b, h)
+    q_base = q_ptr + off_b * stride_qb + off_h * stride_qh
+    k_base = k_ptr + off_b * stride_kb + off_h * stride_kh
+    v_base = v_ptr + off_b * stride_vb + off_h * stride_vh
+    o_base = o_ptr + off_b * stride_ob + off_h * stride_oh
+
+    # Make head-dim addresses compiler-friendly
+    offs_d_ctg = tl.max_contiguous(tl.multiple_of(offs_d, 16), HEAD_DIM_CE)
+
+    # Load Q tile [BLOCK_M, HEAD_DIM] - coalesced along HEAD_DIM
+    q_ptrs = q_base + (offs_m[:, None] * stride_ql + offs_d_ctg[None, :] * stride_qd)
+    q = tl.load(q_ptrs, mask=mask_m[:, None], other=0.0)
+    q = q.to(tl.bfloat16)
+
+    # Initialize accumulators and softmax stats
+    acc = tl.zeros((BLOCK_M, HEAD_DIM_CE), dtype=tl.float32)
+    m_i = tl.full((BLOCK_M,), -float("inf"), dtype=tl.float32)
+    l_i = tl.zeros((BLOCK_M,), dtype=tl.float32)
+
+    # Convert to base-2 scale for exp2
+    qk_scale = sm_scale * 1.4426950408889634
+
+    # Loop over keys/values along sequence length in tiles of BLOCK_N
+    # Load K as [BLOCK_N, HEAD_DIM] for coalesced reads, then use tl.trans(K) in dot
+    for start_n in tl.range(0, L, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
+        mask_n = offs_n < L
+
+        # Load K tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM)
+        k_ptrs = k_base + (
+            offs_n[:, None] * stride_kl + offs_d_ctg[None, :] * stride_kd
+        )
+        k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0)
+        k = 
k.to(tl.bfloat16) + + # Compute attention logits [BLOCK_M, BLOCK_N] = Q[BM,D] @ K[BN,D]^T + qk = tl.dot(q, tl.trans(k)).to(tl.float32) # accumulator in fp32 + qk = qk * qk_scale + + # Apply OOB masks for both rows and cols to keep stability + qk = tl.where(mask_n[None, :], qk, -float("inf")) + qk = tl.where(mask_m[:, None], qk, -float("inf")) + + # Online softmax + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + p = tl.math.exp2(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + alpha = tl.math.exp2(m_i - m_ij) + + # Load V tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM) + v_ptrs = v_base + ( + offs_n[:, None] * stride_vl + offs_d_ctg[None, :] * stride_vd + ) + v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0) + v = v.to(tl.bfloat16) + + # Update accumulator + acc = acc * alpha[:, None] + # Cast p to bf16 to use tensor-cores in tl.dot; accumulate in fp32 + p_bf16 = p.to(tl.bfloat16) + acc = tl.dot(p_bf16, v, acc) + + # Update softmax stats + l_i = l_i * alpha + l_ij + m_i = m_ij + + # Normalize accumulator by softmax denominator + acc = acc / l_i[:, None] + + # Store output [BLOCK_M, HEAD_DIM] + o_ptrs = o_base + (offs_m[:, None] * stride_ol + offs_d_ctg[None, :] * stride_od) + tl.store(o_ptrs, acc.to(tl.bfloat16), mask=mask_m[:, None]) + + +@triton_op("custom::optimized_triton_scaled_dot_product_attention", mutates_args={}) +def optimized_triton_scaled_dot_product_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: float = 0.0, + enable_gqa: bool = False, +) -> torch.Tensor: + """ + Triton fused Scaled Dot-Product Attention (forward, no causal, no dropout). + Expected shapes (tested): [B=1, H=20, L<=1500, D=64], dtype bfloat16. + + Args: + query: Query tensor [B, H, L, D] + key: Key tensor [B, H, L, D] + value: Value tensor [B, H, L, D] + attn_mask: must be None (not supported) + dropout_p: must be 0.0 (not supported) + is_causal: must be False (not supported) + scale: must be 0.0 (not supported) + enable_gqa: must be False (not supported) + + Returns: + Output tensor [B, H, L, D] + """ + # Validate inputs + if not (query.is_cuda and key.is_cuda and value.is_cuda): + raise RuntimeError("Q, K, V must be CUDA tensors.") + if ( + query.dtype != torch.bfloat16 + or key.dtype != torch.bfloat16 + or value.dtype != torch.bfloat16 + ): + raise RuntimeError("Expected bfloat16 inputs") + if query.shape != key.shape or query.shape != value.shape: + raise RuntimeError( + f"Q, K, V must have identical shapes; got query={query.shape}, key={key.shape}, value={value.shape}." + ) + if query.dim() != 4: + raise RuntimeError( + f"Expected 4D tensors shaped [B, H, L, D]; got {query.dim()}D." + ) + + # Enforce that only default values are accepted for these arguments + if attn_mask is not None: + raise RuntimeError( + "attn_mask must be None (not supported in this implementation)." + ) + if dropout_p != 0.0: + raise RuntimeError( + "dropout_p must be 0.0 (not supported in this implementation)." + ) + if is_causal is not False: + raise RuntimeError( + "is_causal must be False (not supported in this implementation)." + ) + if scale != 0.0: + raise RuntimeError("scale must be 0.0 (not supported in this implementation).") + if enable_gqa is not False: + raise RuntimeError( + "enable_gqa must be False (not supported in this implementation)." 
+ ) + + B, H, L, D = query.shape + # Allocate output + out = torch.empty_like(query) + + # Element-wise strides (in elements) + sqb, sqh, sql, sqd = query.stride() + skb, skh, skl, skd = key.stride() + svb, svh, svl, svd = value.stride() + sob, soh, sol, sod = out.stride() + + # Grid: tile queries (M) and batch*heads axis + def grid(META): + return ( + triton.cdiv(L, META["BLOCK_M"]), + B * H, + ) + + # Scale factor for SDPA + sm_scale = 1.0 / math.sqrt(D) + + # Launch kernel using wrap_triton to avoid tracing issues during export/compile + # Note: wrap_triton returns a callable that can be indexed with grid + wrap_triton(_sdpa_fwd_kernel)[grid]( + query, + key, + value, + out, + B, + H, + L, + D, + sqb, + sqh, + sql, + sqd, + skb, + skh, + skl, + skd, + svb, + svh, + svl, + svd, + sob, + soh, + sol, + sod, + sm_scale, + HEAD_DIM_CE=D, + ) + + return out + + +# Register the abstract/fake implementation for torch.export +# This is critical to avoid accessing real tensor data during export +@optimized_triton_scaled_dot_product_attention.register_fake +def _optimized_triton_sdpa_abstract( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale=None, + enable_gqa=False, +) -> torch.Tensor: + """ + Abstract/fake implementation for torch.export. + This just returns an empty tensor with the correct shape/dtype/device. + No actual computation happens here - this is only for shape inference during export. + """ + # Validate shapes match + assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" + assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" + + # Output has the same shape and dtype as query + # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation + return torch.empty_like(query, dtype=query.dtype, device=query.device) diff --git a/custom_triton_playground/sdpa_triton.py b/custom_triton_playground/sdpa_triton.py new file mode 100644 index 00000000000..09233e557bd --- /dev/null +++ b/custom_triton_playground/sdpa_triton.py @@ -0,0 +1,317 @@ +# kernel.py +import math +from typing import Any, Optional + +import torch +import triton +import triton.language as tl +from torch.library import triton_op, wrap_triton + + +""" +Fused Scaled Dot-Product Attention (SDPA) implemented in a single Triton kernel. + +This module provides a transparent replacement for torch.nn.functional.scaled_dot_product_attention +using a custom Triton kernel. The replacement is automatic - no model code changes needed! + +How it works: +1. We register a custom implementation using torch.library +2. When torch.nn.functional.scaled_dot_product_attention is called, + PyTorch's dispatch mechanism routes it to our implementation during AOTI compilation +3. The model code remains unchanged + +What is fused: +- We fuse QK^T matmul, numerically-stable online softmax, and the final + multiplication by V into one streaming kernel. No intermediate attention + matrix is materialized in memory. + +Design notes: +- We tile along the query (sequence) dimension with BLOCK_M rows and iterate + over the key/value sequence dimension in BLOCK_N columns. +- For each (batch, head) pair and query tile, we: + * Load a tile of Q once and keep it in registers. 
+ * Stream over K/V in blocks: compute qk = Q @ K^T, update running row-wise + softmax statistics (m_i, l_i) and the output accumulator acc = sum(p * V) + using the "online softmax" algorithm: + m_new = max(m_old, max(qk)) + p = exp(qk - m_new) + acc = acc * exp(m_old - m_new) + p @ V + l_new = l_old * exp(m_old - m_new) + sum(p) + m_old = m_new + * Finally, write O = acc / l_i. +- All accumulation is done in fp32 for numerical stability; inputs/outputs are fp16. +- Boundary conditions are handled with masks. +- The Python wrapper only validates inputs, allocates outputs, configures the grid, + and launches the Triton kernel. All math is inside the Triton kernel. + +Runtime constraints respected: +- No torch.nn or torch.nn.functional is used in the execution path. +- No PyTorch compute ops are used to implement the algorithm; all math happens + in Triton via tl.load/tl.store/tl.dot/tl.exp/tl.max/tl.sum. +""" + + +@triton.jit +def _sdpa_fwd_kernel( + q_ptr, + k_ptr, + v_ptr, + o_ptr, + B, + H, + S, + D, # shapes + stride_qb, + stride_qh, + stride_qs, + stride_qd, + stride_kb, + stride_kh, + stride_ks, + stride_kd, + stride_vb, + stride_vh, + stride_vs, + stride_vd, + stride_ob, + stride_oh, + stride_os, + stride_od, + scale, # 1/sqrt(D) + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + HEAD_DIM: tl.constexpr, +): + # Program IDs + pid_m = tl.program_id(0) # along sequence dimension (queries) + pid_bh = tl.program_id(1) # across batch*heads + + b = pid_bh // H + h = pid_bh % H + + # Offsets for this block of queries + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM) + + # Base pointers for this (b, h) + q_bh = q_ptr + b * stride_qb + h * stride_qh + k_bh = k_ptr + b * stride_kb + h * stride_kh + v_bh = v_ptr + b * stride_vb + h * stride_vh + o_bh = o_ptr + b * stride_ob + h * stride_oh + + # Load Q tile: [BLOCK_M, HEAD_DIM] + q_ptrs = q_bh + (offs_m[:, None] * stride_qs + offs_d[None, :] * stride_qd) + q_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D) + q = tl.load(q_ptrs, mask=q_mask, other=0.0) + + # Initialize online-softmax stats and output accumulator + m_i = tl.full([BLOCK_M], -float("inf"), dtype=tl.float32) + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32) + + # Iterate over keys/values in blocks of BLOCK_N + for start_n in tl.range(0, S, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + kv_mask_cols = offs_n < S + + # Load K in a layout suitable for qk = q @ kT: + # k_ptrs produces a tensor of shape [HEAD_DIM, BLOCK_N] + k_ptrs = k_bh + (offs_n[None, :] * stride_ks + offs_d[:, None] * stride_kd) + k = tl.load( + k_ptrs, mask=(offs_d[:, None] < D) & (kv_mask_cols[None, :]), other=0.0 + ) + + # qk = [BLOCK_M, BLOCK_N] in fp32 + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, qk) + qk = qk * scale # scale by 1/sqrt(D) + + # Mask out-of-bounds columns so they don't affect max/sum + qk = tl.where(kv_mask_cols[None, :], qk, -float("inf")) + + # Online softmax update + m_ij = tl.maximum(m_i, tl.max(qk, axis=1)) + p = tl.exp(qk - m_ij[:, None]) # fp32 + alpha = tl.exp(m_i - m_ij) + l_i = l_i * alpha + tl.sum(p, axis=1) + m_i = m_ij + + # Load V tile: [BLOCK_N, HEAD_DIM] + v_ptrs = v_bh + (offs_n[:, None] * stride_vs + offs_d[None, :] * stride_vd) + v = tl.load( + v_ptrs, mask=(kv_mask_cols[:, None]) & (offs_d[None, :] < D), other=0.0 + ) + + # Update output accumulator: acc = acc * alpha + p @ v + acc = acc * alpha[:, None] + # Use fp16 inputs for tl.dot with fp32 
accumulation
+        acc = tl.dot(p.to(tl.float16), v.to(tl.float16), acc)
+
+    # Normalize: O = acc / l_i[:, None]
+    o = acc / l_i[:, None]
+    # Store O in fp16
+    o_ptrs = o_bh + (offs_m[:, None] * stride_os + offs_d[None, :] * stride_od)
+    o_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D)
+    tl.store(o_ptrs, o.to(tl.float16), mask=o_mask)
+
+
+@triton_op("custom::scaled_dot_product_attention", mutates_args={})
+def triton_scaled_dot_product_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale: float = 0.0,
+    enable_gqa: bool = False,
+) -> torch.Tensor:
+    """
+    Fused Scaled Dot-Product Attention registered as a custom op:
+        O = softmax(Q @ K^T / sqrt(D)) @ V
+    where Q, K, V are shaped [batch, heads, seq_len, head_dim].
+
+    This function is registered with @triton_op so AOTI can discover and use it
+    during compilation as a replacement for torch.nn.functional.scaled_dot_product_attention.
+
+    Wrapper responsibilities:
+    - Validate input tensors (dtype/device/shapes)
+    - Allocate output tensor
+    - Configure grid and launch the Triton kernel
+    - No math is done here beyond basic scalar setup; all heavy compute runs in the Triton kernel.
+
+    Fusion details:
+    - This launches a single kernel that computes QK^T, performs online softmax,
+      and multiplies by V to produce O, all in one pass over K/V blocks.
+    - No intermediate attention matrix is written to global memory.
+
+    Args:
+        query: Query tensor [B, H, S, D]
+        key: Key tensor [B, H, S, D]
+        value: Value tensor [B, H, S, D]
+        attn_mask: has to be None
+        dropout_p: has to be 0.0
+        is_causal: has to be False
+        scale: has to be 0.0
+        enable_gqa: has to be False
+
+    Returns:
+        Output tensor [B, H, S, D]
+    """
+    # Basic validation
+    if not (query.is_cuda and key.is_cuda and value.is_cuda):
+        raise RuntimeError("Q, K, V must be CUDA tensors.")
+    if (
+        query.dtype != torch.float16
+        or key.dtype != torch.float16
+        or value.dtype != torch.float16
+    ):
+        raise RuntimeError("This reference implementation expects float16 tensors.")
+    if query.shape != key.shape or query.shape != value.shape:
+        raise RuntimeError(
+            f"Q, K, V must have identical shapes; got Q={query.shape}, K={key.shape}, V={value.shape}."
+        )
+    if query.dim() != 4:
+        raise RuntimeError(
+            f"Expected 4D tensors shaped [B, H, S, D]; got {query.dim()}D."
+        )
+
+    # Enforce that only default values are accepted for these arguments
+    if attn_mask is not None:
+        raise RuntimeError(
+            "attn_mask must be None (not supported in this implementation)."
+        )
+
+    if dropout_p != 0.0:
+        raise RuntimeError(
+            "dropout_p must be 0.0 (not supported in this implementation)."
+        )
+    if is_causal is not False:
+        raise RuntimeError(
+            "is_causal must be False (not supported in this implementation)."
+        )
+    if scale != 0:
+        raise RuntimeError("scale must be 0.0 (not supported in this implementation).")
+    if enable_gqa is not False:
+        raise RuntimeError(
+            "enable_gqa must be False (not supported in this implementation)."
+ ) + + B, H, S, D = query.shape + + # Allocate output + O = torch.empty_like(query) + + # Choose tiling parameters (powers of two, coalesced-friendly) + # Conservative sizes to keep register/SMEM pressure reasonable for D=1024 + BLOCK_M = 16 + BLOCK_N = 32 + + # Compute softmax scale on host (scalar) - this is setup, not heavy math + scale = 1.0 / math.sqrt(float(D)) + + # Grid: one program per (query block, batch*head) + grid = (triton.cdiv(S, BLOCK_M), B * H) + + # Launch kernel using wrap_triton to avoid tracing issues during export/compile + # Note: wrap_triton returns a callable that can be indexed with grid + wrap_triton(_sdpa_fwd_kernel)[grid]( + query, + key, + value, + O, + B, + H, + S, + D, + query.stride(0), + query.stride(1), + query.stride(2), + query.stride(3), + key.stride(0), + key.stride(1), + key.stride(2), + key.stride(3), + value.stride(0), + value.stride(1), + value.stride(2), + value.stride(3), + O.stride(0), + O.stride(1), + O.stride(2), + O.stride(3), + scale, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + HEAD_DIM=D, + num_warps=4, + num_stages=2, + ) + + return O + + +# Register the abstract/fake implementation for torch.export +# This is critical to avoid accessing real tensor data during export +@triton_scaled_dot_product_attention.register_fake +def _triton_sdpa_abstract( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale=None, + enable_gqa=False, +) -> torch.Tensor: + """ + Abstract/fake implementation for torch.export. + This just returns an empty tensor with the correct shape/dtype/device. + No actual computation happens here - this is only for shape inference during export. + """ + # Validate shapes match + assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" + assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" + + # Output has the same shape and dtype as query + # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation + return torch.empty_like(query, dtype=query.dtype, device=query.device) diff --git a/custom_triton_playground/test-sdpa-with-custom-kernel.py b/custom_triton_playground/test-sdpa-with-custom-kernel.py new file mode 100644 index 00000000000..16883bc83d9 --- /dev/null +++ b/custom_triton_playground/test-sdpa-with-custom-kernel.py @@ -0,0 +1,151 @@ +# ============================================================================ +# IMPORTANT: Import sdpa_triton BEFORE defining the model +# This automatically enables the custom Triton kernel via monkey-patching +# ============================================================================ +import argparse +import os +from contextlib import nullcontext + +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower +from optimized_sdpa_triton import optimized_triton_scaled_dot_product_attention +from sdpa_triton import triton_scaled_dot_product_attention +from torch.export import Dim, export +from torch.nn.attention import SDPBackend + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, query, key, value): + # This is the ORIGINAL code - we're NOT changing it! 
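+        # (A minimal sketch of the swap performed by init_sdpa_kernel() below,
+        # assuming the "optimized_triton" mode was selected:
+        #   torch.nn.functional.scaled_dot_product_attention = (
+        #       optimized_triton_scaled_dot_product_attention
+        #   )
+        # so this unchanged call resolves to the custom Triton op at trace time.)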
+        # But it will automatically use our custom Triton kernel
+        # because we imported sdpa_triton above
+        out = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        return out
+
+
+sdpa_ctx = nullcontext()
+
+
+# Hacky helper to replace the system SDPA with one of our Triton implementations
+def init_sdpa_kernel(custom_triton):
+    global sdpa_ctx
+    if custom_triton == "decomposed_kernel":
+        sdpa_ctx = torch.nn.attention.sdpa_kernel([SDPBackend.MATH])
+    elif custom_triton == "unoptimized_triton":
+        torch.nn.functional.scaled_dot_product_attention = (
+            triton_scaled_dot_product_attention
+        )
+    elif custom_triton == "optimized_triton":
+        torch.nn.functional.scaled_dot_product_attention = (
+            optimized_triton_scaled_dot_product_attention
+        )
+    else:
+        assert False, f"{custom_triton} is not supported yet"
+
+
+def main(kernel_type, output_dir, dtype):
+    print(f"Using kernel type: {kernel_type}")
+    print(f"Using dtype: {dtype}")
+    init_sdpa_kernel(kernel_type)
+
+    model = Model()
+    batch_size, num_heads, seq_len, head_dim = 1, 20, 1500, 64
+
+    # Map dtype string to torch dtype
+    dtype_map = {
+        "fp16": torch.float16,
+        "bf16": torch.bfloat16,
+    }
+    torch_dtype = dtype_map[dtype]
+
+    # Create inputs with specified dtype
+    inputs = (
+        torch.randn(
+            batch_size,
+            num_heads,
+            seq_len,
+            head_dim,
+            dtype=torch_dtype,
+            device="cuda",
+        ),
+        torch.randn(
+            batch_size,
+            num_heads,
+            seq_len,
+            head_dim,
+            dtype=torch_dtype,
+            device="cuda",
+        ),
+        torch.randn(
+            batch_size,
+            num_heads,
+            seq_len,
+            head_dim,
+            dtype=torch_dtype,
+            device="cuda",
+        ),
+    )
+
+    print("Testing model execution with custom kernel...")
+    with torch.no_grad():
+        output = model(*inputs)
+    print(f"✓ Model executed successfully. Output shape: {output.shape}\n")
+
+    print("Exporting model...")
+    exported_program = export(model, inputs)
+    print("✓ Model exported successfully\n")
+
+    print("Lowering to ExecuTorch CUDA backend (using AOTI)...")
+    with sdpa_ctx, torch.no_grad():
+        executorch_program = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[
+                CudaPartitioner(
+                    [CudaBackend.generate_method_name_compile_spec("forward")]
+                )
+            ],
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        ).to_executorch()
+    print("✓ Model lowered successfully with AOTI\n")
+
+    print("Saving model...")
+    os.makedirs(output_dir, exist_ok=True)
+    with open(os.path.join(output_dir, "model.pte"), "wb") as file:
+        file.write(executorch_program.buffer)
+
+    executorch_program.write_tensor_data_to_file(output_dir)
+    print(f"✓ PTE and PTD files have been successfully dumped to {output_dir}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Test SDPA with custom kernel")
+    parser.add_argument(
+        "--kernel_type",
+        type=str,
+        choices=["unoptimized_triton", "optimized_triton", "decomposed_kernel"],
+        help="Type of kernel to use",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=".",
+        help="Directory to save model.pte and tensor data (default: current directory)",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["fp16", "bf16"],
+        default="bf16",
+        help="Data type for model inputs (default: bf16)",
+    )
+
+    args = parser.parse_args()
+
+    main(args.kernel_type, args.output_dir, args.dtype)
diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h
index 1a30e2cc4df..b587628fd1d 100644
--- a/extension/runner_util/inputs.h
+++ b/extension/runner_util/inputs.h
@@ -64,7 +64,7 @@ struct PrepareInputTensorsOptions {
 * all inputs exceeds this, an error is returned. This prevents allocating too
 * much memory if the PTE file is malformed.
 */
-  size_t max_total_allocation_size = 1024 * 1024 * 1024;
+  size_t max_total_allocation_size = 10ULL * 1024 * 1024 * 1024;
 
  /**
   * The maximum number of inputs to allocate. If the number of inputs exceeds

From 3cd8bbf264ae39a368ced9840c8d85c22c269c21 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 12 Nov 2025 13:53:37 -0800
Subject: [PATCH 2/7] temp save

---
 backends/cuda/cuda_backend.py |  3 ++
 eval.sh                       | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 eval.sh

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 86571baa47a..bfafe75559c 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -152,6 +152,7 @@ def preprocess(
         # Separate weight constants from the .so file
         "aot_inductor.package": True,
         "aot_inductor.package_constants_in_so": False,
+        "aot_inductor.freezing": True,
         # Store weight constants on disk in a binary blob
         "aot_inductor.package_constants_on_disk_format": "binary_blob",
         # Enable maximum automatic tuning for optimal performance
@@ -187,6 +188,8 @@ def preprocess(
             f"Could not find required files in compiled paths, got {paths}"
         )
 
+        print("--- Generated .so lives at", so_path)
+
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
             so_data = f.read()
diff --git a/eval.sh b/eval.sh
new file mode 100644
index 00000000000..57f5740d446
--- /dev/null
+++ b/eval.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Usage: ./evaluate_kernel.sh <kernel_name> <n_eval>
+KERNEL_NAME=$1
+N_EVAL=$2
+
+# Path prefix
+BASE_PATH=~/kernel-gen/whisper-large-v3-turbo/${KERNEL_NAME}/
+
+MODEL_PATH=${BASE_PATH}model.pte
+DATA_PATH=${BASE_PATH}aoti_cuda_blob.ptd
+TOKENIZER_PATH=${BASE_PATH}
+AUDIO_PATH=${BASE_PATH}output.wav
+PROCESSOR_PATH=${BASE_PATH}whisper_preprocessor.pte
+
+CMD="cmake-out/examples/models/whisper/whisper_runner \
+  --model_path ${MODEL_PATH} \
+  --data_path ${DATA_PATH} \
+  --temperature 0 \
+  --tokenizer_path ${TOKENIZER_PATH} \
+  --audio_path ${AUDIO_PATH} \
+  --processor_path ${PROCESSOR_PATH}"
+
+rates=()
+for ((i=1; i<=N_EVAL; i++)); do
+  echo "Running evaluation $i/$N_EVAL..."
+  output=$($CMD 2>&1)
+  # awk is the recommended way to pull the rate out of the log line
+  rate=$(echo "$output" | grep "Generated 128 tokens:" | awk '{print $(NF-1)}')
+  echo "Generated token rate for run $i: $rate"
+  if [[ ! -z "$rate" ]]; then
+    rates+=($rate)
+  fi
+done
+
+# Compute the average
+sum=0
+count=0
+for r in "${rates[@]}"; do
+  # Only count non-empty values
+  if [[ ! -z "$r" ]]; then
+    sum=$(echo "$sum + $r" | bc)
+    count=$((count+1))
+  fi
+done
+
+if [[ $count -gt 0 ]]; then
+  avg=$(echo "scale=2; $sum / $count" | bc)
+  echo "Average Generated token rate over $count runs: $avg tokens/second"
+else
+  echo "No valid token rates found."
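+  # (illustrative) bc is what does the fixed-point averaging above, e.g.:
+  #   echo "scale=2; (438.36 + 430.98) / 2" | bc   # -> 434.67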
+fi

From ec8b0e8a4263c8222ce7805b9f64043f2d529bfb Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 12 Nov 2025 15:36:36 -0800
Subject: [PATCH 3/7] remove wrong freezing attribute

---
 backends/cuda/cuda_backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index bfafe75559c..b871eb75c1a 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -152,7 +152,6 @@ def preprocess(
         # Separate weight constants from the .so file
         "aot_inductor.package": True,
         "aot_inductor.package_constants_in_so": False,
-        "aot_inductor.freezing": True,
         # Store weight constants on disk in a binary blob
         "aot_inductor.package_constants_on_disk_format": "binary_blob",
         # Enable maximum automatic tuning for optimal performance

From 638ad62c49e747b3395b43c5fd82b92ccfaea26f Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 13 Nov 2025 11:19:28 -0800
Subject: [PATCH 4/7] temp save

---
 backends/aoti/common_shims.cpp |   7 -
 backends/aoti/common_shims.h   |   3 -
 backends/cuda/cuda_backend.py  |   8 +-
 benchmarking.py                | 272 +++++++++++++++++++++++++++
 decomposed-freezing-result.txt | 184 ++++++++++++++++++++
 triton-v2-freeze-result.txt    | 184 ++++++++++++++++++++
 triton-v3-result.txt           | 184 ++++++++++++++++++++
 7 files changed, 827 insertions(+), 15 deletions(-)
 create mode 100644 benchmarking.py
 create mode 100644 decomposed-freezing-result.txt
 create mode 100644 triton-v2-freeze-result.txt
 create mode 100644 triton-v3-result.txt

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index dcb4c7b37d4..5fa5c5beef5 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -242,13 +242,6 @@ aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) {
   return Error::Internal;
 }
 
-AOTI_SHIM_EXPORT AOTITorchError
-aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) {
-  (void)orig_handle;
-  (void)new_handle;
-  throw std::runtime_error("Not implemented");
-  return Error::Internal;
-}
 
 AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
     void* data_ptr,
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 5ae583b81a8..7b4b3202731 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -97,9 +97,6 @@ aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor);
 AOTI_SHIM_EXPORT AOTITorchError
 aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor);
 
-AOTI_SHIM_EXPORT AOTITorchError
-aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle);
-
 AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
     void* data_ptr,
     int64_t ndim,
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 8d3c7b3db50..487b0d64c1d 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -141,9 +141,8 @@ def preprocess(  # noqa: C901
                 user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
-            # Disable this to support sdpa decomposition
-            # TODO(gasoonjia): remove it after pin bump to latest pytorch
-            "loop_ordering_after_fusion": False,
+            # Freeze weights during inference for better performance and more optimizations such as kernel fusion
+            "freezing": True,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
@@ -163,7 +162,6 @@ def preprocess(  # noqa: C901
             "max_autotune_conv_backends": "TRITON",
         }
 
-        platform = "linux"
         shim_library_path = None
         for spec in
compile_specs: @@ -172,7 +170,7 @@ def preprocess( # noqa: C901 if spec.key == "shim_library_path": shim_library_path = spec.value.decode("utf-8") - assert platform == "linux" or platform == "windows" + assert platform == "linux" if platform == "windows" and shim_library_path is None: lib_dir = resources.files("executorch").joinpath("data/lib") shim_library_path = str(lib_dir) diff --git a/benchmarking.py b/benchmarking.py new file mode 100644 index 00000000000..d309c25bf9b --- /dev/null +++ b/benchmarking.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Benchmark script for Whisper ASR runner. +Runs the whisper_runner command multiple times and collects throughput metrics. +""" +import argparse +import json +import os +import statistics +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + + +@dataclass +class RunMetrics: + """Metrics from a single run.""" + + generated_tokens: int + tokens_per_sec: float + model_load_time_ms: float + inference_time_ms: float + prompt_eval_to_end_ms: float + first_token_latency_ms: float + + def __repr__(self): + return ( + f"Tokens: {self.generated_tokens}, " + f"Throughput: {self.tokens_per_sec:.2f} t/s, " + f"Model load: {self.model_load_time_ms:.0f}ms, " + f"Inference: {self.inference_time_ms:.0f}ms, " + f"First token: {self.first_token_latency_ms:.0f}ms" + ) + + +def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]: + """Parse PyTorchObserver JSON output and compute metrics.""" + try: + # Find the JSON part in the log line + if "PyTorchObserver" not in log_line: + return None + + json_str = log_line.split("PyTorchObserver")[1].strip() + data = json.loads(json_str) + + # Extract values + generated_tokens = data.get("generated_tokens", 0) + inference_end_ms = data.get("inference_end_ms", 0) + prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0) + first_token_ms = data.get("first_token_ms", 0) + model_load_start_ms = data.get("model_load_start_ms", 0) + model_load_end_ms = data.get("model_load_end_ms", 0) + + # Compute metrics + prompt_eval_to_end_ms = inference_end_ms - prompt_eval_end_ms + tokens_per_sec = ( + (generated_tokens / prompt_eval_to_end_ms * 1000) + if prompt_eval_to_end_ms > 0 + else 0 + ) + model_load_time_ms = model_load_end_ms - model_load_start_ms + inference_time_ms = inference_end_ms - prompt_eval_end_ms + first_token_latency_ms = first_token_ms - prompt_eval_end_ms + + return RunMetrics( + generated_tokens=generated_tokens, + tokens_per_sec=tokens_per_sec, + model_load_time_ms=model_load_time_ms, + inference_time_ms=inference_time_ms, + prompt_eval_to_end_ms=prompt_eval_to_end_ms, + first_token_latency_ms=first_token_latency_ms, + ) + except (json.JSONDecodeError, KeyError, ValueError) as e: + print(f"Error parsing PyTorchObserver log: {e}", file=sys.stderr) + return None + + +def run_whisper_benchmark( + command: str, num_runs: int = 5, verbose: bool = False +) -> List[RunMetrics]: + """ + Run the whisper_runner command multiple times and collect metrics. 
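+
+    Each run is expected to print one "PyTorchObserver {...}" JSON line; an
+    illustrative example of the line parse_pytorch_observer_log consumes
+    (field values are made up, keys match the parser above):
+        PyTorchObserver {"generated_tokens": 128, "model_load_start_ms": 0,
+        "model_load_end_ms": 1100, "prompt_eval_end_ms": 100,
+        "first_token_ms": 114, "inference_end_ms": 398}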
+ + Args: + command: Full command to run + num_runs: Number of times to run the command + verbose: Print detailed output + + Returns: + List of RunMetrics from each run + """ + results = [] + + for run_num in range(1, num_runs + 1): + print(f"\n[Run {run_num}/{num_runs}] Executing: {command}") + + try: + # Run command and capture output + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode != 0: + print( + f"Error: Command failed with return code {result.returncode}", + file=sys.stderr, + ) + if result.stderr: + print(f"stderr: {result.stderr}", file=sys.stderr) + continue + + # Search for PyTorchObserver line in output + observer_line = None + for line in result.stdout.split("\n"): + if "PyTorchObserver" in line: + observer_line = line + break + + if observer_line is None: + print( + f"Warning: No PyTorchObserver output found in run {run_num}", + file=sys.stderr, + ) + if verbose: + print(f"stdout:\n{result.stdout}", file=sys.stderr) + continue + + # Parse metrics + metrics = parse_pytorch_observer_log(observer_line) + if metrics is None: + print( + f"Warning: Failed to parse metrics from run {run_num}", + file=sys.stderr, + ) + continue + + results.append(metrics) + print(f"✓ {metrics}") + + except subprocess.TimeoutExpired: + print(f"Error: Command timed out on run {run_num}", file=sys.stderr) + except Exception as e: + print(f"Error on run {run_num}: {e}", file=sys.stderr) + + return results + + +def print_summary(results: List[RunMetrics]) -> None: + """Print summary statistics.""" + if not results: + print("No valid results to summarize.") + return + + tokens_per_sec_list = [r.tokens_per_sec for r in results] + model_load_times = [r.model_load_time_ms for r in results] + inference_times = [r.inference_time_ms for r in results] + first_token_latencies = [r.first_token_latency_ms for r in results] + + print("\n" + "=" * 70) + print("BENCHMARK SUMMARY") + print("=" * 70) + print(f"Total runs: {len(results)}") + print(f"Generated tokens per run: {results[0].generated_tokens}") + print() + + print("THROUGHPUT (tokens/sec):") + print(f" Min: {min(tokens_per_sec_list):.2f} t/s") + print(f" Max: {max(tokens_per_sec_list):.2f} t/s") + print(f" Mean: {statistics.mean(tokens_per_sec_list):.2f} t/s") + if len(tokens_per_sec_list) > 1: + print(f" Stdev: {statistics.stdev(tokens_per_sec_list):.2f} t/s") + print() + + print("MODEL LOAD TIME (ms):") + print(f" Min: {min(model_load_times):.0f} ms") + print(f" Max: {max(model_load_times):.0f} ms") + print(f" Mean: {statistics.mean(model_load_times):.0f} ms") + if len(model_load_times) > 1: + print(f" Stdev: {statistics.stdev(model_load_times):.0f} ms") + print() + + print("INFERENCE TIME (ms, prompt_eval_end to inference_end):") + print(f" Min: {min(inference_times):.0f} ms") + print(f" Max: {max(inference_times):.0f} ms") + print(f" Mean: {statistics.mean(inference_times):.0f} ms") + if len(inference_times) > 1: + print(f" Stdev: {statistics.stdev(inference_times):.0f} ms") + print() + + print("FIRST TOKEN LATENCY (ms):") + print(f" Min: {min(first_token_latencies):.0f} ms") + print(f" Max: {max(first_token_latencies):.0f} ms") + print(f" Mean: {statistics.mean(first_token_latencies):.0f} ms") + if len(first_token_latencies) > 1: + print(f" Stdev: {statistics.stdev(first_token_latencies):.0f} ms") + print("=" * 70) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser( + description="Benchmark Whisper ASR runner and collect 
throughput metrics"
+    )
+    parser.add_argument(
+        "num_runs",
+        type=int,
+        nargs="?",
+        default=50,
+        help="Number of benchmark runs (default: 50)",
+    )
+    parser.add_argument(
+        "--model_dir_name",
+        type=str,
+        default="decomposed",
+        help="Name of the directory that has the model .pte and .ptd files",
+    )
+    parser.add_argument(
+        "--processor_path",
+        type=str,
+        default="~/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte",
+        help="Path to the preprocessor/processor .pte file",
+    )
+    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
+
+    args = parser.parse_args()
+
+    base_path = "~/kernel-gen/whisper-large-v3-turbo/"
+    model_dir_path = os.path.join(base_path, args.model_dir_name)
+
+    # Expand user paths
+    model_path = os.path.expanduser(model_dir_path + "/model.pte")
+    data_path = os.path.expanduser(model_dir_path + "/aoti_cuda_blob.ptd")
+    tokenizer_path = os.path.expanduser(
+        "~/kernel-gen/whisper-large-v3-turbo/decomposed"
+    )
+    audio_path = os.path.expanduser(
+        "~/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav"
+    )
+    processor_path = os.path.expanduser(args.processor_path)
+
+    # Build command
+    command = (
+        "cmake-out/examples/models/whisper/whisper_runner "
+        f"--model_path {model_path} "
+        f"--data_path {data_path} "
+        f"--tokenizer_path {tokenizer_path} "
+        f"--audio_path {audio_path} "
+        f"--processor_path {processor_path} "
+        "--model_name whisper_large_v3 "
+        "--temperature 0 "
+    )
+
+    print(f"Running Whisper benchmark {args.num_runs} times...")
+    print(f"Command: {command}\n")
+
+    # Run benchmark
+    results = run_whisper_benchmark(
+        command, num_runs=args.num_runs, verbose=args.verbose
+    )
+
+    # Print summary
+    print_summary(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/decomposed-freezing-result.txt b/decomposed-freezing-result.txt
new file mode 100644
index 00000000000..375ee7c034b
--- /dev/null
+++ b/decomposed-freezing-result.txt
@@ -0,0 +1,184 @@
+Running Whisper benchmark 50 times...
+Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 + + +[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1138ms, Inference: 292ms, First token: 14ms + +[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1162ms, Inference: 297ms, First token: 14ms + +[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 407.64 t/s, Model load: 1096ms, Inference: 314ms, First token: 14ms + +[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1076ms, Inference: 301ms, First token: 14ms + +[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd 
--tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1113ms, Inference: 304ms, First token: 14ms + +[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 400.00 t/s, Model load: 1121ms, Inference: 320ms, First token: 21ms + +[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1149ms, Inference: 308ms, First token: 16ms + +[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1093ms, Inference: 294ms, First token: 14ms + +[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1093ms, Inference: 313ms, First token: 15ms + +[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav 
--processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1105ms, Inference: 298ms, First token: 14ms + +[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1117ms, Inference: 315ms, First token: 14ms + +[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1074ms, Inference: 327ms, First token: 14ms + +[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1086ms, Inference: 294ms, First token: 14ms + +[Run 14/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1133ms, Inference: 331ms, First token: 14ms + +[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, 
Throughput: 411.58 t/s, Model load: 1120ms, Inference: 311ms, First token: 15ms + +[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 393.85 t/s, Model load: 1118ms, Inference: 325ms, First token: 18ms + +[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1008ms, Inference: 309ms, First token: 19ms + +[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1074ms, Inference: 296ms, First token: 14ms + +[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1045ms, Inference: 297ms, First token: 14ms + +[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1077ms, Inference: 296ms, First token: 14ms + +[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1095ms, Inference: 298ms, First token: 14ms + +[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 423.84 t/s, Model load: 1019ms, Inference: 302ms, First token: 14ms + +[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1269ms, Inference: 313ms, First token: 14ms + +[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1039ms, Inference: 313ms, First token: 18ms + +[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1080ms, Inference: 297ms, First token: 14ms + +[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1099ms, Inference: 292ms, First token: 14ms + +[Run 27/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1200ms, Inference: 296ms, First token: 14ms + +[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 984ms, Inference: 305ms, First token: 14ms + +[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1110ms, Inference: 305ms, First token: 15ms + +[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1063ms, Inference: 299ms, First token: 14ms + +[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed 
--audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 382.09 t/s, Model load: 1117ms, Inference: 335ms, First token: 14ms + +[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1128ms, Inference: 310ms, First token: 14ms + +[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1173ms, Inference: 299ms, First token: 14ms + +[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 426.67 t/s, Model load: 1077ms, Inference: 300ms, First token: 14ms + +[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 436.86 t/s, Model load: 1078ms, Inference: 293ms, First token: 14ms + +[Run 36/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1003ms, Inference: 297ms, First token: 16ms + +[Run 37/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1103ms, Inference: 307ms, First token: 15ms + +[Run 38/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1053ms, Inference: 313ms, First token: 15ms + +[Run 39/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1091ms, Inference: 307ms, First token: 14ms + +[Run 40/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1850ms, Inference: 331ms, First token: 14ms + +[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 384.38 
t/s, Model load: 1017ms, Inference: 333ms, First token: 14ms + +[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1090ms, Inference: 303ms, First token: 14ms + +[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1154ms, Inference: 329ms, First token: 16ms + +[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1085ms, Inference: 342ms, First token: 13ms + +[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 426.67 t/s, Model load: 1098ms, Inference: 300ms, First token: 14ms + +[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1093ms, Inference: 312ms, First token: 14ms + +[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1099ms, Inference: 304ms, First token: 14ms + +[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1113ms, Inference: 303ms, First token: 15ms + +[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1089ms, Inference: 311ms, First token: 19ms + +[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1151ms, Inference: 303ms, First token: 15ms + +====================================================================== +BENCHMARK SUMMARY +====================================================================== +Total runs: 50 +Generated tokens per run: 128 + +THROUGHPUT (tokens/sec): + Min: 374.27 t/s + Max: 438.36 t/s + Mean: 416.41 t/s + Stdev: 16.45 t/s + +MODEL LOAD TIME (ms): + Min: 984 ms + Max: 1850 ms + Mean: 1112 ms + Stdev: 117 ms + +INFERENCE TIME (ms, prompt_eval_end to inference_end): + Min: 292 ms + Max: 342 ms + Mean: 308 ms + Stdev: 13 ms + +FIRST TOKEN LATENCY (ms): + Min: 13 ms + Max: 21 ms + Mean: 15 ms + Stdev: 2 ms +====================================================================== diff --git a/triton-v2-freeze-result.txt b/triton-v2-freeze-result.txt new file mode 100644 index 00000000000..16dfd1da218 --- /dev/null +++ b/triton-v2-freeze-result.txt @@ -0,0 +1,184 @@ +Running Whisper benchmark 50 times... 
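The per-run lines above follow a fixed format, and each run's throughput is simply tokens divided by inference time (e.g. 128 tokens / 304 ms ≈ 421.05 t/s, matching the log), so the BENCHMARK SUMMARY figures can be recomputed from the raw log. A minimal sketch, assuming the log is saved under the hypothetical name results.txt and that Stdev is the sample standard deviation:

# reproduce_summary.py -- parse the "Tokens: ..." result lines and recompute the stats
import re
import statistics

PATTERN = re.compile(
    r"Tokens: (\d+), Throughput: ([\d.]+) t/s, Model load: (\d+)ms, "
    r"Inference: (\d+)ms, First token: (\d+)ms"
)

metrics = {"throughput_tps": [], "load_ms": [], "inference_ms": [], "first_token_ms": []}
with open("results.txt") as f:  # hypothetical: one of the logs above, saved to disk
    for line in f:
        m = PATTERN.search(line)
        if m:
            metrics["throughput_tps"].append(float(m.group(2)))
            metrics["load_ms"].append(int(m.group(3)))
            metrics["inference_ms"].append(int(m.group(4)))
            metrics["first_token_ms"].append(int(m.group(5)))

for name, xs in metrics.items():
    print(f"{name}: min={min(xs):.2f} max={max(xs):.2f} "
          f"mean={statistics.mean(xs):.2f} stdev={statistics.stdev(xs):.2f}")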
+Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 + + +[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 1070ms, Inference: 326ms, First token: 13ms + +[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1036ms, Inference: 315ms, First token: 14ms + +[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1043ms, Inference: 327ms, First token: 13ms + +[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1027ms, Inference: 330ms, First token: 13ms + +[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 1078ms, Inference: 345ms, First token: 14ms + +[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 987ms, Inference: 331ms, First token: 13ms + +[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1112ms, Inference: 329ms, First token: 13ms + +[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 982ms, Inference: 337ms, First token: 15ms + +[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1040ms, Inference: 331ms, First token: 13ms + +[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1048ms, Inference: 334ms, First token: 13ms + +[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 974ms, Inference: 336ms, First token: 13ms + +[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 1059ms, Inference: 341ms, First token: 13ms + +[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1010ms, Inference: 330ms, First token: 12ms + +[Run 14/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1001ms, Inference: 331ms, First token: 13ms + +[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 1001ms, 
Inference: 337ms, First token: 13ms + +[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 398.75 t/s, Model load: 1093ms, Inference: 321ms, First token: 13ms + +[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 966ms, Inference: 326ms, First token: 12ms + +[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1022ms, Inference: 327ms, First token: 13ms + +[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1020ms, Inference: 334ms, First token: 12ms + +[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 987ms, Inference: 339ms, First token: 13ms + +[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte 
--data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1115ms, Inference: 352ms, First token: 16ms + +[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1012ms, Inference: 339ms, First token: 13ms + +[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 1079ms, Inference: 346ms, First token: 13ms + +[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1067ms, Inference: 301ms, First token: 14ms + +[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1147ms, Inference: 307ms, First token: 14ms + +[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1106ms, Inference: 315ms, First token: 15ms + +[Run 27/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 396.28 t/s, Model load: 1048ms, Inference: 323ms, First token: 12ms + +[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 433.90 t/s, Model load: 1098ms, Inference: 295ms, First token: 14ms + +[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 287.64 t/s, Model load: 1091ms, Inference: 445ms, First token: 14ms + +[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 18ms + +[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name 
whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1075ms, Inference: 304ms, First token: 21ms + +[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1029ms, Inference: 310ms, First token: 14ms + +[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1177ms, Inference: 313ms, First token: 14ms + +[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1188ms, Inference: 305ms, First token: 15ms + +[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 390.24 t/s, Model load: 999ms, Inference: 328ms, First token: 13ms + +[Run 36/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 1042ms, Inference: 338ms, First token: 13ms + +[Run 37/50] Executing: 
cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 15ms + +[Run 38/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1065ms, Inference: 308ms, First token: 18ms + +[Run 39/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1173ms, Inference: 304ms, First token: 14ms + +[Run 40/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1034ms, Inference: 303ms, First token: 16ms + +[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 245.21 t/s, Model load: 1118ms, Inference: 522ms, First token: 15ms + +[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 1043ms, Inference: 336ms, First token: 16ms + +[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 999ms, Inference: 330ms, First token: 13ms + +[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1262ms, Inference: 334ms, First token: 14ms + +[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1059ms, Inference: 308ms, First token: 14ms + +[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1019ms, Inference: 304ms, First token: 14ms + +[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1015ms, Inference: 296ms, First token: 14ms + +[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1074ms, Inference: 312ms, First token: 15ms + +[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1059ms, Inference: 311ms, First token: 15ms + +[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1064ms, Inference: 307ms, First token: 14ms + +====================================================================== +BENCHMARK SUMMARY +====================================================================== +Total runs: 50 +Generated tokens per run: 128 + +THROUGHPUT (tokens/sec): + Min: 245.21 t/s + Max: 433.90 t/s + Mean: 392.57 t/s + Stdev: 31.88 t/s + +MODEL LOAD TIME (ms): + Min: 966 ms + Max: 1262 ms + Mean: 1059 ms + Stdev: 59 ms + +INFERENCE TIME (ms, prompt_eval_end to inference_end): + Min: 295 ms + Max: 522 ms + Mean: 329 ms + Stdev: 36 ms + +FIRST TOKEN LATENCY (ms): + Min: 12 ms + Max: 21 ms + Mean: 14 ms + Stdev: 2 ms +====================================================================== diff --git a/triton-v3-result.txt b/triton-v3-result.txt new file mode 100644 index 00000000000..2c426a28eee --- /dev/null +++ b/triton-v3-result.txt @@ -0,0 +1,184 @@ +Running Whisper benchmark 50 times... 
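Note that the triton-v2-freeze mean above is pulled down by two slow outliers (runs 29 and 41, at 445 ms and 522 ms inference), which is why its throughput Stdev (31.88 t/s) is roughly double that of the decomposed-freezing run (16.45 t/s). The throughput figures can be cross-checked directly from the token count and inference time; a small sketch:

# cross-check: throughput (t/s) = tokens / inference time (s)
def throughput_tps(tokens: int, inference_ms: int) -> float:
    return tokens / (inference_ms / 1000.0)

# the two slow triton-v2-freeze runs above:
print(round(throughput_tps(128, 445), 2))  # 287.64, matches run 29
print(round(throughput_tps(128, 522), 2))  # 245.21, matches run 41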
+Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 + + +[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 969ms, Inference: 347ms, First token: 13ms + +[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 922ms, Inference: 338ms, First token: 12ms + +[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 915ms, Inference: 337ms, First token: 13ms + +[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1004ms, Inference: 339ms, First token: 12ms + +[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1030ms, Inference: 348ms, First token: 13ms + +[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 988ms, Inference: 348ms, First token: 13ms + +[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 355.56 t/s, Model load: 1017ms, Inference: 360ms, First token: 14ms + +[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 992ms, Inference: 337ms, First token: 12ms + +[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 995ms, Inference: 350ms, First token: 13ms + +[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 
1014ms, Inference: 341ms, First token: 13ms + +[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 944ms, Inference: 346ms, First token: 12ms + +[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 976ms, Inference: 345ms, First token: 16ms + +[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 385.54 t/s, Model load: 1046ms, Inference: 332ms, First token: 14ms + +[Run 14/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 352.62 t/s, Model load: 1014ms, Inference: 363ms, First token: 13ms + +[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 343.16 t/s, Model load: 1084ms, Inference: 373ms, First token: 14ms + +[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 990ms, Inference: 341ms, First token: 13ms + +[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 958ms, Inference: 350ms, First token: 13ms + +[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 997ms, Inference: 349ms, First token: 15ms + +[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 361.58 t/s, Model load: 1038ms, Inference: 354ms, First token: 13ms + +[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1044ms, Inference: 350ms, First token: 12ms + +[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 358.54 t/s, Model load: 1027ms, Inference: 357ms, First token: 13ms + +[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 943ms, Inference: 346ms, First token: 13ms + +[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1005ms, Inference: 312ms, First token: 14ms + +[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1016ms, Inference: 350ms, First token: 13ms + +[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 405.06 t/s, Model load: 981ms, Inference: 316ms, First token: 14ms + +[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1007ms, Inference: 352ms, First token: 13ms + +[Run 27/50] Executing: 
cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1067ms, Inference: 342ms, First token: 13ms + +[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1100ms, Inference: 352ms, First token: 15ms + +[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 957ms, Inference: 340ms, First token: 13ms + +[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1006ms, Inference: 352ms, First token: 13ms + +[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 354.57 t/s, Model load: 1040ms, Inference: 361ms, First token: 13ms + +[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 973ms, Inference: 334ms, First token: 12ms + +[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 982ms, Inference: 349ms, First token: 12ms + +[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 958ms, Inference: 347ms, First token: 13ms + +[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1031ms, Inference: 309ms, First token: 14ms + +[Run 36/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 962ms, Inference: 349ms, First token: 13ms + +[Run 37/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name 
whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 967ms, Inference: 341ms, First token: 18ms + +[Run 38/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 977ms, Inference: 346ms, First token: 16ms + +[Run 39/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 1016ms, Inference: 353ms, First token: 18ms + +[Run 40/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1175ms, Inference: 339ms, First token: 13ms + +[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 373.18 t/s, Model load: 964ms, Inference: 343ms, First token: 18ms + +[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1074ms, Inference: 362ms, First token: 14ms + +[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 981ms, Inference: 353ms, First token: 13ms + +[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1010ms, Inference: 348ms, First token: 13ms + +[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1023ms, Inference: 339ms, First token: 12ms + +[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 275.86 t/s, Model load: 1225ms, Inference: 464ms, First token: 19ms + +[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 964ms, Inference: 340ms, First token: 12ms + +[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 979ms, Inference: 334ms, First token: 13ms + +[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1003ms, Inference: 339ms, First token: 12ms + +[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1206ms, Inference: 362ms, First token: 14ms + +====================================================================== +BENCHMARK SUMMARY +====================================================================== +Total runs: 50 +Generated tokens per run: 128 + +THROUGHPUT (tokens/sec): + Min: 275.86 t/s + Max: 414.24 t/s + Mean: 369.34 t/s + Stdev: 18.90 t/s + +MODEL LOAD TIME (ms): + Min: 915 ms + Max: 1225 ms + Mean: 1011 ms + Stdev: 62 ms + +INFERENCE TIME (ms, prompt_eval_end to inference_end): + Min: 309 ms + Max: 464 ms + Mean: 348 ms + Stdev: 21 ms + +FIRST TOKEN LATENCY (ms): + Min: 12 ms + Max: 19 ms + Mean: 14 ms + Stdev: 2 ms +====================================================================== From d56601f98aa28e4ba00b1ac04ff4e1c28ee2c77f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 17 Nov 2025 01:15:38 -0800 Subject: [PATCH 5/7] add triton kernel replacement pass --- .github/workflows/cuda.yml | 2 +- backends/aoti/common_shims.cpp | 1 - backends/cuda/TARGETS | 31 ++ backends/cuda/cuda_backend.py | 14 +- backends/cuda/tests/test_cuda_export.py | 43 +++ backends/cuda/triton/__init__.py | 17 + backends/cuda/triton/kernels/__init__.py | 11 + .../cuda/triton/kernels/sdpa.py | 244 +++++++++----- backends/cuda/triton/replacement_pass.py | 134 ++++++++ benchmarking.py | 272 --------------- custom_triton_playground/sdpa_triton.py | 317 ------------------ .../test-sdpa-with-custom-kernel.py | 151 --------- eval.sh | 52 --- examples/models/__init__.py | 2 + examples/models/toy_model/__init__.py | 2 + examples/models/toy_model/model.py | 30 ++ extension/runner_util/inputs.h | 2 +- 17 files changed, 446 insertions(+), 879 deletions(-) create mode 100644 backends/cuda/triton/__init__.py create mode 100644 backends/cuda/triton/kernels/__init__.py rename custom_triton_playground/optimized_sdpa_triton.py => backends/cuda/triton/kernels/sdpa.py (50%) create mode 100644 backends/cuda/triton/replacement_pass.py delete mode 100644
benchmarking.py delete mode 100644 custom_triton_playground/sdpa_triton.py delete mode 100644 custom_triton_playground/test-sdpa-with-custom-kernel.py delete mode 100644 eval.sh diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 7cc937fe6ca..1d237f5d8ef 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -71,7 +71,7 @@ jobs: strategy: fail-fast: false matrix: - model: [linear, add, add_mul, resnet18, conv1d] + model: [linear, add, add_mul, resnet18, conv1d, sdpa] with: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index 5fa5c5beef5..82c54617a0a 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -242,7 +242,6 @@ aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { return Error::Internal; } - AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( void* data_ptr, int64_t ndim, diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS index 94af87bbaed..55519060dc0 100644 --- a/backends/cuda/TARGETS +++ b/backends/cuda/TARGETS @@ -11,6 +11,7 @@ runtime.python_library( "//executorch/...", ], deps = [ + ":triton_replacement_pass", "//caffe2:torch", "//executorch/backends/aoti/passes:passes", "//executorch/exir/_serialize:lib", @@ -32,3 +33,33 @@ runtime.python_library( "//executorch/backends/aoti:aoti_partitioner", ], ) + +runtime.python_library( + name = "triton_kernels", + srcs = [ + "triton/kernels/__init__.py", + "triton/kernels/sdpa.py", + ], + visibility = [ + "//executorch/backends/cuda/...", + ], + deps = [ + "//caffe2:torch", + ], +) + +runtime.python_library( + name = "triton_replacement_pass", + srcs = [ + "triton/__init__.py", + "triton/replacement_pass.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + ":triton_kernels", + "//caffe2:torch", + "//executorch/exir/dialects:lib", + ], +) diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py index 487b0d64c1d..3506cb685fe 100644 --- a/backends/cuda/cuda_backend.py +++ b/backends/cuda/cuda_backend.py @@ -16,6 +16,10 @@ from executorch.backends.aoti.passes.replace_view_copy_with_view import ( ReplaceViewCopyWithViewPass, ) + +from executorch.backends.cuda.triton.replacement_pass import ( + ReplaceEdgeOpWithTritonOpPass, +) from executorch.exir._serialize._named_data_store import NamedDataStore from executorch.exir._warnings import experimental from executorch.exir.backend.backend_details import ( @@ -27,7 +31,7 @@ from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu from torch._inductor.decomposition import conv1d_to_conv2d from torch.export.passes import move_to_device_pass -from torch.nn.attention import SDPBackend + cuda_decomposition_table = { torch.ops.aten.conv1d.default: conv1d_to_conv2d, @@ -127,6 +131,9 @@ def preprocess( # noqa: C901 # replace slice_copy.Tensor with slice.Tensor, select_copy.int with select.int ReplaceViewCopyWithViewPass()(cuda_edge_program.graph_module) + # Replace ATen ops with Triton ops + ReplaceEdgeOpWithTritonOpPass()(cuda_edge_program.graph_module) + cuda_edge_program = cuda_edge_program.run_decompositions( cuda_decomposition_table ) @@ -141,8 +148,9 @@ def preprocess( # noqa: C901 user_input_placeholders.append(node.meta["val"]) options: dict[str, typing.Any] = { - # Frozen weight during inference for better performance and more optimization like kernel fusion - "freezing": True, + # Disable this to support SDPA decomposition + # TODO(gasoonjia): remove it after pin
bump to latest pytorch + "loop_ordering_after_fusion": False, # Better model precision "emulate_precision_casts": True, # Embed CUDA kernel binaries directly into the compiled shared object diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py index ef43a3ab3cb..8dcdbe09083 100644 --- a/backends/cuda/tests/test_cuda_export.py +++ b/backends/cuda/tests/test_cuda_export.py @@ -270,3 +270,46 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Test export edge_program_manager = self._export_to_cuda_with_lower(module, inputs) self.assertIsNotNone(edge_program_manager, "Conv1d operation export failed") + + def test_sdpa_single_kernel(self): + """ + Test CUDA export for model containing single SDPA kernel. + + SDPA: Scaled Dot Product Attention + """ + + class SDPAModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, query, key, value): + out = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + ) + return out + + module = SDPAModule() + module.eval() + + # Create input tensors (batch, num_heads, seq_len, head_dim) + batch_size = 2 + num_heads = 8 + seq_len = 128 + head_dim = 64 + + query = torch.randn(batch_size, num_heads, seq_len, head_dim) + key = torch.randn(batch_size, num_heads, seq_len, head_dim) + value = torch.randn(batch_size, num_heads, seq_len, head_dim) + inputs = (query, key, value) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone( + edge_program_manager, + "SDPA single kernel operation export failed", + ) diff --git a/backends/cuda/triton/__init__.py b/backends/cuda/triton/__init__.py new file mode 100644 index 00000000000..4b9c36249ac --- /dev/null +++ b/backends/cuda/triton/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Import all kernels to ensure @triton_op decorators are executed +# and ops are registered to torch.ops.triton namespace +from executorch.backends.cuda.triton import kernels # noqa: F401 + +from executorch.backends.cuda.triton.replacement_pass import ( + ReplaceEdgeOpWithTritonOpPass, +) + +__all__ = [ + "ReplaceEdgeOpWithTritonOpPass", +] diff --git a/backends/cuda/triton/kernels/__init__.py b/backends/cuda/triton/kernels/__init__.py new file mode 100644 index 00000000000..5bd582679c4 --- /dev/null +++ b/backends/cuda/triton/kernels/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.cuda.triton.kernels.sdpa import sdpa + +__all__ = [ + "sdpa", +] diff --git a/custom_triton_playground/optimized_sdpa_triton.py b/backends/cuda/triton/kernels/sdpa.py similarity index 50% rename from custom_triton_playground/optimized_sdpa_triton.py rename to backends/cuda/triton/kernels/sdpa.py index df267202d29..e0996ce5292 100644 --- a/custom_triton_playground/optimized_sdpa_triton.py +++ b/backends/cuda/triton/kernels/sdpa.py @@ -1,5 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Optimized Triton SDPA Kernel for ExecuTorch CUDA Backend. + +This module provides a Triton-optimized implementation of scaled dot-product attention +that can replace the default ATen SDPA operator during graph transformation. +""" + import math -from typing import Any, Optional +from typing import Optional import torch import triton @@ -7,9 +20,50 @@ from torch.library import triton_op, wrap_triton +def _validate_qkv_shapes( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, +) -> tuple[int, int, int, int, int, int]: + """ + Validate Q/K/V dimensions and return shape info. + Args: + query: Query tensor [B, H, L_q, D] + key: Key tensor [B, H, L_kv, D] + value: Value tensor [B, H, L_kv, D] + Returns: + Tuple of (B, H, L_q, L_kv, D_q, D_kv) + Raises: + RuntimeError: If dimensions are incompatible + """ + B_q, H_q, L_q, D_q = query.shape + B_k, H_k, L_kv_k, D_k = key.shape + B_v, H_v, L_kv_v, D_v = value.shape + # Validate batch and head-count dimensions + if not (B_q == B_k == B_v): + raise RuntimeError( + f"Batch dimension must match; got B_q={B_q}, B_k={B_k}, B_v={B_v}." + ) + + if not (H_q == H_k == H_v): + raise RuntimeError( + f"Number of heads must match; got H_q={H_q}, H_k={H_k}, H_v={H_v}." + ) + # Head dimension must match + if not (D_q == D_k == D_v): + raise RuntimeError( + f"Head dimension must match across Q, K, V; got D_q={D_q}, D_k={D_k}, D_v={D_v}." + ) + # Key and Value sequence lengths must match + if L_kv_k != L_kv_v: + raise RuntimeError( + f"Key and Value must have the same sequence length; got L_k={L_kv_k}, L_v={L_kv_v}." + ) + return B_q, H_q, L_q, L_kv_k, D_q, D_k + + @triton.autotune( configs=[ - # Favor configs tuned for HEAD_DIM=64 and L up to ~1500 triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_stages=4, num_warps=8), triton.Config({"BLOCK_M": 128, "BLOCK_N": 256}, num_stages=4, num_warps=8), triton.Config({"BLOCK_M": 64, "BLOCK_N": 256}, num_stages=4, num_warps=4), @@ -17,17 +71,19 @@ triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_stages=3, num_warps=4), triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=3, num_warps=4), ], - key=["L", "HEAD_DIM"], + key=["L_Q", "L_KV", "HEAD_DIM"], ) @triton.jit def _sdpa_fwd_kernel( q_ptr, k_ptr, v_ptr, + mask_ptr, o_ptr, B, H, - L, + L_Q, # Query sequence length + L_KV, # Key/Value sequence length HEAD_DIM, stride_qb, stride_qh, @@ -41,104 +97,118 @@ def _sdpa_fwd_kernel( stride_vh, stride_vl, stride_vd, + stride_mb, + stride_mh, + stride_ml, + stride_mn, stride_ob, stride_oh, stride_ol, stride_od, sm_scale, + IS_CAUSAL: tl.constexpr, + HAS_MASK: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, HEAD_DIM_CE: tl.constexpr, ): + """ + Fused SDPA kernel that handles different sequence lengths for Q and K/V.
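+
+    Uses an online (streaming) softmax: a running row max (m_i) and running
+    normalizer (l_i) are rescaled after each BLOCK_N tile of K/V, so the full
+    [L_Q, L_KV] score matrix is never materialized. Scores are pre-scaled by
+    log2(e) so tl.math.exp2 can be used instead of exp.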
+ + Q shape: [B, H, L_Q, D] + K/V shape: [B, H, L_KV, D] + Output shape: [B, H, L_Q, D] + """ # Program IDs pid_m = tl.program_id(axis=0) # along query length pid_hz = tl.program_id(axis=1) # flattened batch*head - off_b = pid_hz // H off_h = pid_hz % H - - # Compute ranges + # Compute ranges for queries start_m = pid_m * BLOCK_M offs_m = start_m + tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_DIM_CE) - mask_m = offs_m < L - + mask_m = offs_m < L_Q # Mask based on query length # Base pointers for this (b, h) q_base = q_ptr + off_b * stride_qb + off_h * stride_qh k_base = k_ptr + off_b * stride_kb + off_h * stride_kh v_base = v_ptr + off_b * stride_vb + off_h * stride_vh o_base = o_ptr + off_b * stride_ob + off_h * stride_oh - + # Mask base pointer (if provided) + if HAS_MASK: + mask_base = mask_ptr + off_b * stride_mb + off_h * stride_mh # Make head-dim addresses compiler-friendly offs_d_ctg = tl.max_contiguous(tl.multiple_of(offs_d, 16), HEAD_DIM_CE) - # Load Q tile [BLOCK_M, HEAD_DIM] - coalesced along HEAD_DIM q_ptrs = q_base + (offs_m[:, None] * stride_ql + offs_d_ctg[None, :] * stride_qd) q = tl.load(q_ptrs, mask=mask_m[:, None], other=0.0) q = q.to(tl.bfloat16) - # Initialize accumulators and softmax stats acc = tl.zeros((BLOCK_M, HEAD_DIM_CE), dtype=tl.float32) m_i = tl.full((BLOCK_M,), -float("inf"), dtype=tl.float32) l_i = tl.zeros((BLOCK_M,), dtype=tl.float32) - # Convert to base-2 scale for exp2 qk_scale = sm_scale * 1.4426950408889634 - - # Loop over keys/values along sequence length in tiles of BLOCK_N - # Load K as [BLOCK_N, HEAD_DIM] for coalesced reads, then use tl.trans(K) in dot - for start_n in tl.range(0, L, BLOCK_N): + # Loop over keys/values along L_KV dimension (not L_Q!) + for start_n in tl.range(0, L_KV, BLOCK_N): offs_n = start_n + tl.arange(0, BLOCK_N) - mask_n = offs_n < L - + mask_n = offs_n < L_KV # Mask based on key/value length # Load K tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM) k_ptrs = k_base + ( offs_n[:, None] * stride_kl + offs_d_ctg[None, :] * stride_kd ) k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0) k = k.to(tl.bfloat16) - # Compute attention logits [BLOCK_M, BLOCK_N] = Q[BM,D] @ K[BN,D]^T - qk = tl.dot(q, tl.trans(k)).to(tl.float32) # accumulator in fp32 + qk = tl.dot(q, tl.trans(k)).to(tl.float32) qk = qk * qk_scale - - # Apply OOB masks for both rows and cols to keep stability + # Apply causal mask if needed + # For causal masking with different lengths: position i can attend to position j if i >= j + if IS_CAUSAL: + causal_mask = offs_m[:, None] >= offs_n[None, :] + qk = tl.where(causal_mask, qk, -float("inf")) + # Apply attention mask if provided + if HAS_MASK: + # Load mask tile [BLOCK_M, BLOCK_N] + # Mask shape should be [B, H, L_Q, L_KV] + mask_ptrs = mask_base + ( + offs_m[:, None] * stride_ml + offs_n[None, :] * stride_mn + ) + attn_mask = tl.load( + mask_ptrs, mask=mask_m[:, None] & mask_n[None, :], other=0.0 + ) + # Convert boolean mask to additive mask (-inf for False, 0 for True) + qk = tl.where(attn_mask, qk, -float("inf")) + # Apply OOB masks for both rows and cols qk = tl.where(mask_n[None, :], qk, -float("inf")) qk = tl.where(mask_m[:, None], qk, -float("inf")) - # Online softmax m_ij = tl.maximum(m_i, tl.max(qk, 1)) p = tl.math.exp2(qk - m_ij[:, None]) l_ij = tl.sum(p, 1) alpha = tl.math.exp2(m_i - m_ij) - # Load V tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM) v_ptrs = v_base + ( offs_n[:, None] * stride_vl + offs_d_ctg[None, :] * stride_vd ) v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0) v = 
v.to(tl.bfloat16) - # Update accumulator acc = acc * alpha[:, None] - # Cast p to bf16 to use tensor-cores in tl.dot; accumulate in fp32 p_bf16 = p.to(tl.bfloat16) acc = tl.dot(p_bf16, v, acc) - # Update softmax stats l_i = l_i * alpha + l_ij m_i = m_ij - # Normalize accumulator by softmax denominator acc = acc / l_i[:, None] - - # Store output [BLOCK_M, HEAD_DIM] + # Store output [BLOCK_M, HEAD_DIM] - shape matches query o_ptrs = o_base + (offs_m[:, None] * stride_ol + offs_d_ctg[None, :] * stride_od) tl.store(o_ptrs, acc.to(tl.bfloat16), mask=mask_m[:, None]) -@triton_op("custom::optimized_triton_scaled_dot_product_attention", mutates_args={}) -def optimized_triton_scaled_dot_product_attention( +@triton_op("triton::sdpa", mutates_args={}) +def sdpa( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -149,21 +219,24 @@ def optimized_triton_scaled_dot_product_attention( enable_gqa: bool = False, ) -> torch.Tensor: """ - Triton fused Scaled Dot-Product Attention (forward, no causal, no dropout). - Expected shapes (tested): [B=1, H=20, L<=1500, D=64], dtype bfloat16. + Triton fused Scaled Dot-Product Attention with support for different sequence lengths. + Supports different sequence lengths for query and key/value: + - Query: [B, H, L_q, D] + - Key: [B, H, L_kv, D] + - Value: [B, H, L_kv, D] + - Output: [B, H, L_q, D] (matches query shape) Args: - query: Query tensor [B, H, L, D] - key: Key tensor [B, H, L, D] - value: Value tensor [B, H, L, D] - attn_mask: must be None (not supported) + query: Query tensor [B, H, L_q, D] + key: Key tensor [B, H, L_kv, D] + value: Value tensor [B, H, L_kv, D] + attn_mask: Optional attention mask [B, H, L_q, L_kv] or broadcastable shape dropout_p: must be 0.0 (not supported) - is_causal: must be False (not supported) - scale: must be 0.0 (not supported) + is_causal: whether to apply causal masking + scale: attention scale (default: 1/sqrt(d)) enable_gqa: must be False (not supported) - Returns: - Output tensor [B, H, L, D] + Output tensor [B, H, L_q, D] """ # Validate inputs if not (query.is_cuda and key.is_cuda and value.is_cuda): @@ -174,40 +247,25 @@ def optimized_triton_scaled_dot_product_attention( or value.dtype != torch.bfloat16 ): raise RuntimeError("Expected bfloat16 inputs") - if query.shape != key.shape or query.shape != value.shape: - raise RuntimeError( - f"Q, K, V must have identical shapes; got query={query.shape}, key={key.shape}, value={value.shape}." - ) - if query.dim() != 4: - raise RuntimeError( - f"Expected 4D tensors shaped [B, H, L, D]; got {query.dim()}D." - ) - - # Enforce that only default values are accepted for these arguments - if attn_mask is not None: + if query.dim() != 4 or key.dim() != 4 or value.dim() != 4: raise RuntimeError( - "attn_mask must be None (not supported in this implementation)." + f"Expected 4D tensors shaped [B, H, L, D]; got query.dim()={query.dim()}, key.dim()={key.dim()}, value.dim()={value.dim()}." ) + # Enforce unsupported features if dropout_p != 0.0: raise RuntimeError( "dropout_p must be 0.0 (not supported in this implementation)." ) - if is_causal is not False: - raise RuntimeError( - "is_causal must be False (not supported in this implementation)." - ) - if scale != 0.0: - raise RuntimeError("scale must be 0.0 (not supported in this implementation).") if enable_gqa is not False: raise RuntimeError( "enable_gqa must be False (not supported in this implementation)." 
) - - B, H, L, D = query.shape - # Allocate output + # Validate and get dimensions + B, H, L_q, L_kv, D_q, D_kv = _validate_qkv_shapes(query, key, value) + D = D_q # Head dimension + # Allocate output with query shape out = torch.empty_like(query) - - # Element-wise strides (in elements) + # Element-wise strides sqb, sqh, sql, sqd = query.stride() skb, skh, skl, skd = key.stride() svb, svh, svl, svd = value.stride() @@ -216,23 +274,44 @@ def grid(META): return ( - triton.cdiv(L, META["BLOCK_M"]), + triton.cdiv(L_q, META["BLOCK_M"]), # Based on query length B * H, ) # Scale factor for SDPA - sm_scale = 1.0 / math.sqrt(D) - - # Launch kernel using wrap_triton to avoid tracing issues during export/compile - # Note: wrap_triton returns a callable that can be indexed with grid + sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale + # Handle attention mask + has_mask = attn_mask is not None + if has_mask: + # Expand mask to [B, H, L_q, L_kv] if needed + if attn_mask.dim() == 2: + # [L_q, L_kv] -> [B, H, L_q, L_kv] + attn_mask = attn_mask.unsqueeze(0).unsqueeze(0).expand(B, H, -1, -1) + elif attn_mask.dim() == 3: + # [B, L_q, L_kv] -> [B, H, L_q, L_kv] + attn_mask = attn_mask.unsqueeze(1).expand(-1, H, -1, -1) + + # Validate mask shape + if attn_mask.shape != (B, H, L_q, L_kv): + # Try to expand if broadcastable + attn_mask = attn_mask.expand(B, H, L_q, L_kv) + + smb, smh, sml, smn = attn_mask.stride() + else: + # Dummy strides and mask + smb, smh, sml, smn = 0, 0, 0, 0 + attn_mask = torch.empty(0, dtype=torch.bool, device=query.device) + # Launch kernel wrap_triton(_sdpa_fwd_kernel)[grid]( query, key, value, + attn_mask, out, B, H, - L, + L_q, # Query sequence length + L_kv, # Key/Value sequence length D, sqb, sqh, @@ -246,21 +325,26 @@ def grid(META): svh, svl, svd, + smb, + smh, + sml, + smn, sob, soh, sol, sod, sm_scale, + IS_CAUSAL=is_causal, + HAS_MASK=has_mask, HEAD_DIM_CE=D, ) - return out # Register the abstract/fake implementation for torch.export # This is critical to avoid accessing real tensor data during export -@optimized_triton_scaled_dot_product_attention.register_fake -def _optimized_triton_sdpa_abstract( +@sdpa.register_fake +def _sdpa_abstract( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -273,12 +357,10 @@ def _optimized_triton_sdpa_abstract( """ Abstract/fake implementation for torch.export. This just returns an empty tensor with the correct shape/dtype/device. - No actual computation happens here - this is only for shape inference during export. """ - # Validate shapes match - assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" + # Validate dtypes match assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" + # Validate QKV shapes and get the output shape + B, H, L_q, _, D_q, _ = _validate_qkv_shapes(query, key, value) - # Output has the same shape and dtype as query - # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation - return torch.empty_like(query, dtype=query.dtype, device=query.device) + return torch.empty(B, H, L_q, D_q, dtype=query.dtype, device=query.device) diff --git a/backends/cuda/triton/replacement_pass.py b/backends/cuda/triton/replacement_pass.py new file mode 100644 index 00000000000..08e1fc7dc86 --- /dev/null +++ b/backends/cuda/triton/replacement_pass.py @@ -0,0 +1,134 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved.
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Graph Transformation Pass for Triton Kernel Replacement. + +This pass replaces edge dialect operators with optimized Triton kernels in the graph. +""" + +import logging + +import torch +from executorch.exir.dialects._ops import ops as exir_ops + +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassBase, PassResult + +logger = logging.getLogger(__name__) +triton = torch.ops.triton + +# Global mapping from edge dialect operators to Triton kernel functions +EDGE_TO_TRITON_KERNELS = { + exir_ops.edge.aten.scaled_dot_product_attention.default: triton.sdpa, +} + + +class ReplaceEdgeOpWithTritonOpPass(PassBase): + """ + Pass to replace edge dialect operators with Triton kernels. + + This pass scans the graph for edge dialect operators that have registered + Triton replacements and replaces them with the optimized Triton implementations. + + Replacements are looked up in the module-level EDGE_TO_TRITON_KERNELS mapping. + """ + + def __init__(self): + """Initialize the pass.""" + super().__init__() + self._replacement_count = 0 + + def call(self, graph_module: GraphModule) -> PassResult: + """ + Execute the pass on the graph module. + + Args: + graph_module: The graph module to transform + + Returns: + PassResult indicating success/failure and the modified graph module + """ + self._replacement_count = 0 + modified = False + + if not EDGE_TO_TRITON_KERNELS: + return PassResult(graph_module, False) + + # Iterate through all nodes in the graph + for node in graph_module.graph.nodes: + if self._should_replace_node(node): + try: + self._replace_node_with_triton(graph_module, node) + modified = True + self._replacement_count += 1 + except Exception as e: + logger.warning(f"Failed to replace node {node.name}: {e}") + # Continue with other replacements even if one fails + + if modified: + # Recompile the graph module after modifications + graph_module.recompile() + + logger.info(f"Replaced {self._replacement_count} nodes with Triton kernels") + + return PassResult(graph_module, modified) + + def _should_replace_node(self, node: Node) -> bool: + """ + Check if a node should be replaced with a Triton kernel. + + Args: + node: The node to check + + Returns: + True if the node should be replaced + """ + # Only consider call_function nodes + if node.op != "call_function": + return False + + logger.debug(f"Checking node: {node.target}") + + return node.target in EDGE_TO_TRITON_KERNELS + + def _replace_node_with_triton(self, graph_module: GraphModule, node: Node) -> None: + """ + Replace an edge dialect node with a Triton kernel call.
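+
+        The replacement node is created via graph.inserting_before(node) and
+        calls the Triton custom op registered with @triton_op; it copies the
+        original node's meta (including its shape/dtype metadata) and takes
+        over all uses of the old node before the old node is erased.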
+    def _should_replace_node(self, node: Node) -> bool:
+        """
+        Check if a node should be replaced with a Triton kernel.
+
+        Args:
+            node: The node to check
+
+        Returns:
+            True if the node should be replaced
+        """
+        # Only consider call_function nodes
+        if node.op != "call_function":
+            return False
+
+        logger.debug("Checking node target: %s", node.target)
+
+        return node.target in EDGE_TO_TRITON_KERNELS
+
+    def _replace_node_with_triton(self, graph_module: GraphModule, node: Node) -> None:
+        """
+        Replace an edge dialect node with a Triton kernel call.
+
+        Args:
+            graph_module: The graph module containing the node
+            node: The node to replace
+        """
+        # Get the target operator (should be an exir_ops edge dialect op)
+        target = node.target
+
+        # Get the replacement kernel
+        if target not in EDGE_TO_TRITON_KERNELS:
+            raise ValueError(f"No replacement kernel found for {target}")
+
+        triton_kernel_fn = EDGE_TO_TRITON_KERNELS[target]
+
+        # Create a new node with the Triton kernel
+        with graph_module.graph.inserting_before(node):
+            # The triton_kernel_fn is already registered as a custom op via @triton_op
+            # We can call it directly
+            new_node = graph_module.graph.call_function(
+                triton_kernel_fn,
+                args=node.args,
+                kwargs=node.kwargs,
+            )
+
+        # Copy metadata from original node
+        new_node.meta = node.meta.copy()
+
+        # Replace all uses of the old node with the new node
+        node.replace_all_uses_with(new_node)
+
+        # Remove the old node
+        graph_module.graph.erase_node(node)
diff --git a/benchmarking.py b/benchmarking.py
deleted file mode 100644
index d309c25bf9b..00000000000
--- a/benchmarking.py
+++ /dev/null
@@ -1,272 +0,0 @@
-#!/usr/bin/env python3
-"""
-Benchmark script for Whisper ASR runner.
-Runs the whisper_runner command multiple times and collects throughput metrics.
-"""
-import argparse
-import json
-import os
-import statistics
-import subprocess
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional
-
-
-@dataclass
-class RunMetrics:
-    """Metrics from a single run."""
-
-    generated_tokens: int
-    tokens_per_sec: float
-    model_load_time_ms: float
-    inference_time_ms: float
-    prompt_eval_to_end_ms: float
-    first_token_latency_ms: float
-
-    def __repr__(self):
-        return (
-            f"Tokens: {self.generated_tokens}, "
-            f"Throughput: {self.tokens_per_sec:.2f} t/s, "
-            f"Model load: {self.model_load_time_ms:.0f}ms, "
-            f"Inference: {self.inference_time_ms:.0f}ms, "
-            f"First token: {self.first_token_latency_ms:.0f}ms"
-        )
-
-
-def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
-    """Parse PyTorchObserver JSON output and compute metrics."""
-    try:
-        # Find the JSON part in the log line
-        if "PyTorchObserver" not in log_line:
-            return None
-
-        json_str = log_line.split("PyTorchObserver")[1].strip()
-        data = json.loads(json_str)
-
-        # Extract values
-        generated_tokens = data.get("generated_tokens", 0)
-        inference_end_ms = data.get("inference_end_ms", 0)
-        prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
-        first_token_ms = data.get("first_token_ms", 0)
-        model_load_start_ms = data.get("model_load_start_ms", 0)
-        model_load_end_ms = data.get("model_load_end_ms", 0)
-
-        # Compute metrics
-        prompt_eval_to_end_ms = inference_end_ms - prompt_eval_end_ms
-        tokens_per_sec = (
-            (generated_tokens / prompt_eval_to_end_ms * 1000)
-            if prompt_eval_to_end_ms > 0
-            else 0
-        )
-        model_load_time_ms = model_load_end_ms - model_load_start_ms
-        inference_time_ms = inference_end_ms - prompt_eval_end_ms
-        first_token_latency_ms = first_token_ms - prompt_eval_end_ms
-
-        return RunMetrics(
-            generated_tokens=generated_tokens,
-            tokens_per_sec=tokens_per_sec,
-            model_load_time_ms=model_load_time_ms,
-            inference_time_ms=inference_time_ms,
-            prompt_eval_to_end_ms=prompt_eval_to_end_ms,
-            first_token_latency_ms=first_token_latency_ms,
-        )
-    except (json.JSONDecodeError, KeyError, ValueError) as e:
-        print(f"Error parsing PyTorchObserver log: {e}", file=sys.stderr)
-        return None
-
-
-def run_whisper_benchmark(
-    command: str, num_runs:
int = 5, verbose: bool = False -) -> List[RunMetrics]: - """ - Run the whisper_runner command multiple times and collect metrics. - - Args: - command: Full command to run - num_runs: Number of times to run the command - verbose: Print detailed output - - Returns: - List of RunMetrics from each run - """ - results = [] - - for run_num in range(1, num_runs + 1): - print(f"\n[Run {run_num}/{num_runs}] Executing: {command}") - - try: - # Run command and capture output - result = subprocess.run( - command, - shell=True, - capture_output=True, - text=True, - timeout=300, # 5 minute timeout - ) - - if result.returncode != 0: - print( - f"Error: Command failed with return code {result.returncode}", - file=sys.stderr, - ) - if result.stderr: - print(f"stderr: {result.stderr}", file=sys.stderr) - continue - - # Search for PyTorchObserver line in output - observer_line = None - for line in result.stdout.split("\n"): - if "PyTorchObserver" in line: - observer_line = line - break - - if observer_line is None: - print( - f"Warning: No PyTorchObserver output found in run {run_num}", - file=sys.stderr, - ) - if verbose: - print(f"stdout:\n{result.stdout}", file=sys.stderr) - continue - - # Parse metrics - metrics = parse_pytorch_observer_log(observer_line) - if metrics is None: - print( - f"Warning: Failed to parse metrics from run {run_num}", - file=sys.stderr, - ) - continue - - results.append(metrics) - print(f"✓ {metrics}") - - except subprocess.TimeoutExpired: - print(f"Error: Command timed out on run {run_num}", file=sys.stderr) - except Exception as e: - print(f"Error on run {run_num}: {e}", file=sys.stderr) - - return results - - -def print_summary(results: List[RunMetrics]) -> None: - """Print summary statistics.""" - if not results: - print("No valid results to summarize.") - return - - tokens_per_sec_list = [r.tokens_per_sec for r in results] - model_load_times = [r.model_load_time_ms for r in results] - inference_times = [r.inference_time_ms for r in results] - first_token_latencies = [r.first_token_latency_ms for r in results] - - print("\n" + "=" * 70) - print("BENCHMARK SUMMARY") - print("=" * 70) - print(f"Total runs: {len(results)}") - print(f"Generated tokens per run: {results[0].generated_tokens}") - print() - - print("THROUGHPUT (tokens/sec):") - print(f" Min: {min(tokens_per_sec_list):.2f} t/s") - print(f" Max: {max(tokens_per_sec_list):.2f} t/s") - print(f" Mean: {statistics.mean(tokens_per_sec_list):.2f} t/s") - if len(tokens_per_sec_list) > 1: - print(f" Stdev: {statistics.stdev(tokens_per_sec_list):.2f} t/s") - print() - - print("MODEL LOAD TIME (ms):") - print(f" Min: {min(model_load_times):.0f} ms") - print(f" Max: {max(model_load_times):.0f} ms") - print(f" Mean: {statistics.mean(model_load_times):.0f} ms") - if len(model_load_times) > 1: - print(f" Stdev: {statistics.stdev(model_load_times):.0f} ms") - print() - - print("INFERENCE TIME (ms, prompt_eval_end to inference_end):") - print(f" Min: {min(inference_times):.0f} ms") - print(f" Max: {max(inference_times):.0f} ms") - print(f" Mean: {statistics.mean(inference_times):.0f} ms") - if len(inference_times) > 1: - print(f" Stdev: {statistics.stdev(inference_times):.0f} ms") - print() - - print("FIRST TOKEN LATENCY (ms):") - print(f" Min: {min(first_token_latencies):.0f} ms") - print(f" Max: {max(first_token_latencies):.0f} ms") - print(f" Mean: {statistics.mean(first_token_latencies):.0f} ms") - if len(first_token_latencies) > 1: - print(f" Stdev: {statistics.stdev(first_token_latencies):.0f} ms") - print("=" * 70) - - -def 
main():
-    # Parse command-line arguments
-    parser = argparse.ArgumentParser(
-        description="Benchmark Whisper ASR runner and collect throughput metrics"
-    )
-    parser.add_argument(
-        "num_runs",
-        type=int,
-        nargs="?",
-        default=50,
-        help="Number of benchmark runs (default: 50)",
-    )
-    parser.add_argument(
-        "--model_dir_name",
-        type=str,
-        default="decomposed",
-        help="Path to the directory that has model .pte and .ptd files",
-    )
-    parser.add_argument(
-        "--processor_path",
-        type=str,
-        default="~/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte",
-        help="Path to the preprocessor/processor .pte file",
-    )
-    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
-
-    args = parser.parse_args()
-
-    base_path = "~/kernel-gen/whisper-large-v3-turbo/"
-    model_dir_path = os.path.join(base_path, args.model_dir_name)
-
-    # Expand user paths
-    model_path = os.path.expanduser(model_dir_path + "/model.pte")
-    data_path = os.path.expanduser(model_dir_path + "/aoti_cuda_blob.ptd")
-    tokenizer_path = os.path.expanduser(
-        "~/kernel-gen/whisper-large-v3-turbo/decomposed"
-    )
-    audio_path = os.path.expanduser(
-        "~/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav"
-    )
-    processor_path = os.path.expanduser(args.processor_path)
-
-    # Build command
-    command = (
-        "cmake-out/examples/models/whisper/whisper_runner "
-        f"--model_path {model_path} "
-        f"--data_path {data_path} "
-        f"--tokenizer_path {tokenizer_path} "
-        f"--audio_path {audio_path} "
-        f"--processor_path {processor_path} "
-        "--model_name whisper_large_v3 "
-        "--temperature 0 "
-    )
-
-    print(f"Running Whisper benchmark {args.num_runs} times...")
-    print(f"Command: {command}\n")
-
-    # Run benchmark
-    results = run_whisper_benchmark(
-        command, num_runs=args.num_runs, verbose=args.verbose
-    )
-
-    # Print summary
-    print_summary(results)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/custom_triton_playground/sdpa_triton.py b/custom_triton_playground/sdpa_triton.py
deleted file mode 100644
index 09233e557bd..00000000000
--- a/custom_triton_playground/sdpa_triton.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# kernel.py
-import math
-from typing import Any, Optional
-
-import torch
-import triton
-import triton.language as tl
-from torch.library import triton_op, wrap_triton
-
-
-"""
-Fused Scaled Dot-Product Attention (SDPA) implemented in a single Triton kernel.
-
-This module provides a transparent replacement for torch.nn.functional.scaled_dot_product_attention
-using a custom Triton kernel. The replacement is automatic - no model code changes needed!
-
-How it works:
-1. We register a custom implementation using torch.library
-2. When torch.nn.functional.scaled_dot_product_attention is called,
-   PyTorch's dispatch mechanism routes it to our implementation during AOTI compilation
-3. The model code remains unchanged
-
-What is fused:
-- We fuse QK^T matmul, numerically-stable online softmax, and the final
-  multiplication by V into one streaming kernel. No intermediate attention
-  matrix is materialized in memory.
-
-Design notes:
-- We tile along the query (sequence) dimension with BLOCK_M rows and iterate
-  over the key/value sequence dimension in BLOCK_N columns.
-- For each (batch, head) pair and query tile, we:
-  * Load a tile of Q once and keep it in registers.
-  * Stream over K/V in blocks: compute qk = Q @ K^T, update running row-wise
-    softmax statistics (m_i, l_i) and the output accumulator acc = sum(p * V)
-    using the "online softmax" algorithm:
-      m_new = max(m_old, max(qk))
-      p = exp(qk - m_new)
-      acc = acc * exp(m_old - m_new) + p @ V
-      l_new = l_old * exp(m_old - m_new) + sum(p)
-      m_old = m_new
-  * Finally, write O = acc / l_i.
-- All accumulation is done in fp32 for numerical stability; inputs/outputs are fp16.
-- Boundary conditions are handled with masks.
-- The Python wrapper only validates inputs, allocates outputs, configures the grid,
-  and launches the Triton kernel. All math is inside the Triton kernel.
-
-Runtime constraints respected:
-- No torch.nn or torch.nn.functional is used in the execution path.
-- No PyTorch compute ops are used to implement the algorithm; all math happens
-  in Triton via tl.load/tl.store/tl.dot/tl.exp/tl.max/tl.sum.
-"""
-
-
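A minimal plain-PyTorch sketch of the online-softmax recurrence described in the docstring above; the helper name online_softmax_attention and the toy block size are assumptions, and this is illustrative only, not part of the patch. It checks that streaming over K/V blocks while rescaling the running statistics reproduces ordinary softmax attention:

import math
import torch

def online_softmax_attention(q, k, v, block_n=4):
    # q: [M, D]; k, v: [N, D]. fp32 reference of the blockwise (m_i, l_i, acc) recurrence.
    M, D = q.shape
    scale = 1.0 / math.sqrt(D)
    m_i = torch.full((M,), float("-inf"))   # running row-wise max
    l_i = torch.zeros(M)                    # running row-wise sum of exp
    acc = torch.zeros(M, D)                 # running output accumulator
    for start in range(0, k.shape[0], block_n):
        kb, vb = k[start:start + block_n], v[start:start + block_n]
        qk = (q @ kb.T) * scale                          # [M, block_n]
        m_new = torch.maximum(m_i, qk.max(dim=1).values)
        p = torch.exp(qk - m_new[:, None])
        alpha = torch.exp(m_i - m_new)                   # rescale old stats to new max
        l_i = l_i * alpha + p.sum(dim=1)
        acc = acc * alpha[:, None] + p @ vb
        m_i = m_new
    return acc / l_i[:, None]

q, k, v = (torch.randn(8, 16) for _ in range(3))
ref = torch.softmax((q @ k.T) / math.sqrt(16.0), dim=-1) @ v
assert torch.allclose(online_softmax_attention(q, k, v), ref, atol=1e-5)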
-@triton.jit
-def _sdpa_fwd_kernel(
-    q_ptr,
-    k_ptr,
-    v_ptr,
-    o_ptr,
-    B,
-    H,
-    S,
-    D,  # shapes
-    stride_qb,
-    stride_qh,
-    stride_qs,
-    stride_qd,
-    stride_kb,
-    stride_kh,
-    stride_ks,
-    stride_kd,
-    stride_vb,
-    stride_vh,
-    stride_vs,
-    stride_vd,
-    stride_ob,
-    stride_oh,
-    stride_os,
-    stride_od,
-    scale,  # 1/sqrt(D)
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    HEAD_DIM: tl.constexpr,
-):
-    # Program IDs
-    pid_m = tl.program_id(0)  # along sequence dimension (queries)
-    pid_bh = tl.program_id(1)  # across batch*heads
-
-    b = pid_bh // H
-    h = pid_bh % H
-
-    # Offsets for this block of queries
-    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_d = tl.arange(0, HEAD_DIM)
-
-    # Base pointers for this (b, h)
-    q_bh = q_ptr + b * stride_qb + h * stride_qh
-    k_bh = k_ptr + b * stride_kb + h * stride_kh
-    v_bh = v_ptr + b * stride_vb + h * stride_vh
-    o_bh = o_ptr + b * stride_ob + h * stride_oh
-
-    # Load Q tile: [BLOCK_M, HEAD_DIM]
-    q_ptrs = q_bh + (offs_m[:, None] * stride_qs + offs_d[None, :] * stride_qd)
-    q_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D)
-    q = tl.load(q_ptrs, mask=q_mask, other=0.0)
-
-    # Initialize online-softmax stats and output accumulator
-    m_i = tl.full([BLOCK_M], -float("inf"), dtype=tl.float32)
-    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-    acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
-
-    # Iterate over keys/values in blocks of BLOCK_N
-    for start_n in tl.range(0, S, BLOCK_N):
-        offs_n = start_n + tl.arange(0, BLOCK_N)
-        kv_mask_cols = offs_n < S
-
-        # Load K in a layout suitable for qk = q @ kT:
-        # k_ptrs produces a tensor of shape [HEAD_DIM, BLOCK_N]
-        k_ptrs = k_bh + (offs_n[None, :] * stride_ks + offs_d[:, None] * stride_kd)
-        k = tl.load(
-            k_ptrs, mask=(offs_d[:, None] < D) & (kv_mask_cols[None, :]), other=0.0
-        )
-
-        # qk = [BLOCK_M, BLOCK_N] in fp32
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk = tl.dot(q, k, qk)
-        qk = qk * scale  # scale by 1/sqrt(D)
-
-        # Mask out-of-bounds columns so they don't affect max/sum
-        qk = tl.where(kv_mask_cols[None, :], qk, -float("inf"))
-
-        # Online softmax update
-        m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
-        p = tl.exp(qk - m_ij[:, None])  # fp32
-        alpha = tl.exp(m_i - m_ij)
-        l_i = l_i * alpha + tl.sum(p, axis=1)
-        m_i = m_ij
-
-        # Load V tile: [BLOCK_N, HEAD_DIM]
-        v_ptrs = v_bh + (offs_n[:, None] * stride_vs + offs_d[None, :] * stride_vd)
-        v = tl.load(
-            v_ptrs, mask=(kv_mask_cols[:, None]) & (offs_d[None, :] < D), other=0.0
-        )
-
-        # Update output accumulator: acc = acc * alpha + p @ v
-        acc = acc * alpha[:, None]
-        # Use fp16 inputs for tl.dot with fp32
accumulation - acc = tl.dot(p.to(tl.float16), v.to(tl.float16), acc) - - # Normalize: O = acc / l_i[:, None] - o = acc / l_i[:, None] - # Store O in fp16 - o_ptrs = o_bh + (offs_m[:, None] * stride_os + offs_d[None, :] * stride_od) - o_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D) - tl.store(o_ptrs, o.to(tl.float16), mask=o_mask) - - -@triton_op("custom::scaled_dot_product_attention", mutates_args={}) -def triton_scaled_dot_product_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - dropout_p: float = 0.0, - is_causal: bool = False, - scale: float = 0.0, - enable_gqa: bool = False, -) -> torch.Tensor: - """ - Fused Scaled Dot-Product Attention registered as a custom op: - O = softmax(Q @ K^T / sqrt(D)) @ V - where Q, K, V are shaped [batch, heads, seq_len, head_dim]. - - This function is registered with @triton_op so AOTI can discover and use it - during compilation as a replacement for torch.nn.functional.scaled_dot_product_attention. - - Wrapper responsibilities: - - Validate input tensors (dtype/device/shapes) - - Allocate output tensor - - Configure grid and launch the Triton kernel - - No math is done here beyond basic scalar setup; all heavy compute runs in the Triton kernel. - - Fusion details: - - This launches a single kernel that computes QK^T, performs online softmax, - and multiplies by V to produce O, all in one pass over K/V blocks. - - No intermediate attention matrix is written to global memory. - - Args: - query: Query tensor [B, H, S, D] - key: Key tensor [B, H, S, D] - value: Value tensor [B, H, S, D] - attn_mask: has to be None - is_causal: has to be False - scale: has to be None - enable_gqa: has to be False - - Returns: - Output tensor [B, H, S, D] - """ - # Basic validation - if not (query.is_cuda and key.is_cuda and value.is_cuda): - raise RuntimeError("Q, K, V must be CUDA tensors.") - if ( - query.dtype != torch.float16 - or key.dtype != torch.float16 - or value.dtype != torch.float16 - ): - raise RuntimeError("This reference implementation expects float16 tensors.") - if query.shape != key.shape or query.shape != value.shape: - raise RuntimeError( - f"Q, K, V must have identical shapes; got Q={query.shape}, K={key.shape}, V={value.shape}." - ) - if query.dim() != 4: - raise RuntimeError( - f"Expected 4D tensors shaped [B, H, S, D]; got {query.dim()}D." - ) - - # Enforce that only default values are accepted for these arguments - if attn_mask is not None: - raise RuntimeError( - "attn_mask must be None (not supported in this implementation)." - ) - - if dropout_p != 0.0: - raise RuntimeError( - "dropout_p must be 0.0 (not supported in this implementation)." - ) - if is_causal is not False: - raise RuntimeError( - "is_causal must be False (not supported in this implementation)." - ) - if scale != 0: - raise RuntimeError("scale must be None (not supported in this implementation).") - if enable_gqa is not False: - raise RuntimeError( - "enable_gqa must be False (not supported in this implementation)." 
- ) - - B, H, S, D = query.shape - - # Allocate output - O = torch.empty_like(query) - - # Choose tiling parameters (powers of two, coalesced-friendly) - # Conservative sizes to keep register/SMEM pressure reasonable for D=1024 - BLOCK_M = 16 - BLOCK_N = 32 - - # Compute softmax scale on host (scalar) - this is setup, not heavy math - scale = 1.0 / math.sqrt(float(D)) - - # Grid: one program per (query block, batch*head) - grid = (triton.cdiv(S, BLOCK_M), B * H) - - # Launch kernel using wrap_triton to avoid tracing issues during export/compile - # Note: wrap_triton returns a callable that can be indexed with grid - wrap_triton(_sdpa_fwd_kernel)[grid]( - query, - key, - value, - O, - B, - H, - S, - D, - query.stride(0), - query.stride(1), - query.stride(2), - query.stride(3), - key.stride(0), - key.stride(1), - key.stride(2), - key.stride(3), - value.stride(0), - value.stride(1), - value.stride(2), - value.stride(3), - O.stride(0), - O.stride(1), - O.stride(2), - O.stride(3), - scale, - BLOCK_M=BLOCK_M, - BLOCK_N=BLOCK_N, - HEAD_DIM=D, - num_warps=4, - num_stages=2, - ) - - return O - - -# Register the abstract/fake implementation for torch.export -# This is critical to avoid accessing real tensor data during export -@triton_scaled_dot_product_attention.register_fake -def _triton_sdpa_abstract( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - dropout_p: float = 0.0, - is_causal: bool = False, - scale=None, - enable_gqa=False, -) -> torch.Tensor: - """ - Abstract/fake implementation for torch.export. - This just returns an empty tensor with the correct shape/dtype/device. - No actual computation happens here - this is only for shape inference during export. - """ - # Validate shapes match - assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" - assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" - - # Output has the same shape and dtype as query - # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation - return torch.empty_like(query, dtype=query.dtype, device=query.device) diff --git a/custom_triton_playground/test-sdpa-with-custom-kernel.py b/custom_triton_playground/test-sdpa-with-custom-kernel.py deleted file mode 100644 index 16883bc83d9..00000000000 --- a/custom_triton_playground/test-sdpa-with-custom-kernel.py +++ /dev/null @@ -1,151 +0,0 @@ -# ============================================================================ -# IMPORTANT: Import sdpa_triton BEFORE defining the model -# This automatically enables the custom Triton kernel via monkey-patching -# ============================================================================ -import argparse -import os -from contextlib import nullcontext - -import torch -from executorch.backends.cuda.cuda_backend import CudaBackend -from executorch.backends.cuda.cuda_partitioner import CudaPartitioner -from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower -from optimized_sdpa_triton import optimized_triton_scaled_dot_product_attention -from sdpa_triton import triton_scaled_dot_product_attention -from torch.export import Dim, export -from torch.nn.attention import SDPBackend - - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, query, key, value): - # This is the ORIGINAL code - we're NOT changing it! 
-        # But it will automatically be routed to our custom Triton kernel,
-        # because init_sdpa_kernel() below rebinds
-        # torch.nn.functional.scaled_dot_product_attention before export
-        out = torch.nn.functional.scaled_dot_product_attention(
-            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
-        )
-        return out
-
-
-sdpa_ctx = nullcontext()
-
-
-# hacky method to replace system sdpa with my triton
-def init_sdpa_kernel(custom_triton):
-    global sdpa_ctx
-    if custom_triton == "decomposed_kernel":
-        sdpa_ctx = torch.nn.attention.sdpa_kernel([SDPBackend.MATH])
-    elif custom_triton == "unoptimized_triton":
-        torch.nn.functional.scaled_dot_product_attention = (
-            triton_scaled_dot_product_attention
-        )
-    elif custom_triton == "optimized_triton":
-        torch.nn.functional.scaled_dot_product_attention = (
-            optimized_triton_scaled_dot_product_attention
-        )
-    else:
-        assert False, f"{custom_triton} is not supported yet"
-
-
-def main(kernel_type, output_dir, dtype):
-    print(f"Using kernel type: {kernel_type}")
-    print(f"Using dtype: {dtype}")
-    init_sdpa_kernel(kernel_type)
-
-    model = Model()
-    batch_size, num_heads, seq_len, head_dim = 1, 20, 1500, 64
-
-    # Map dtype string to torch dtype
-    dtype_map = {
-        "fp16": torch.float16,
-        "bf16": torch.bfloat16,
-    }
-    torch_dtype = dtype_map[dtype]
-
-    # Create inputs with specified dtype
-    inputs = (
-        torch.randn(
-            batch_size,
-            num_heads,
-            seq_len,
-            head_dim,
-            dtype=torch_dtype,
-            device="cuda",
-        ),
-        torch.randn(
-            batch_size,
-            num_heads,
-            seq_len,
-            head_dim,
-            dtype=torch_dtype,
-            device="cuda",
-        ),
-        torch.randn(
-            batch_size,
-            num_heads,
-            seq_len,
-            head_dim,
-            dtype=torch_dtype,
-            device="cuda",
-        ),
-    )
-
-    print("Testing model execution with custom kernel...")
-    with torch.no_grad():
-        output = model(*inputs)
-    print(f"✓ Model executed successfully. Output shape: {output.shape}\n")
-
-    print("Exporting model...")
-    exported_program = export(model, inputs)
-    print("✓ Model exported successfully\n")
-
-    print("Lowering to ExecuTorch CUDA backend (using AOTI)...")
-    with sdpa_ctx, torch.no_grad():
-        executorch_program = to_edge_transform_and_lower(
-            exported_program,
-            partitioner=[
-                CudaPartitioner(
-                    [CudaBackend.generate_method_name_compile_spec("forward")]
-                )
-            ],
-            compile_config=EdgeCompileConfig(_check_ir_validity=False),
-        ).to_executorch()
-    print("✓ Model lowered successfully with AOTI\n")
-
-    print("Saving model...")
-    os.makedirs(output_dir, exist_ok=True)
-    with open(os.path.join(output_dir, "model.pte"), "wb") as file:
-        file.write(executorch_program.buffer)
-
-    executorch_program.write_tensor_data_to_file(output_dir)
-    print(f"✓ PTE and PTD files have been successfully dumped to {output_dir}\n")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Test SDPA with custom kernel")
-    parser.add_argument(
-        "--kernel_type",
-        type=str,
-        choices=["unoptimized_triton", "optimized_triton", "decomposed_kernel"],
-        help="Type of kernel to use",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default=".",
-        help="Directory to save model.pte and tensor data (default: current directory)",
-    )
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        choices=["fp16", "bf16"],
-        default="bf16",
-        help="Data type for model inputs (default: bf16)",
-    )
-
-    args = parser.parse_args()
-
-    main(args.kernel_type, args.output_dir, args.dtype)
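A safer variant of the global monkey-patching used above, sketched for illustration; the helper name override_sdpa is an assumption and this is not part of the patch. Scoping the override in a context manager guarantees the original binding is restored after export, even on error:

import contextlib
import torch

@contextlib.contextmanager
def override_sdpa(replacement):
    # Temporarily rebind F.scaled_dot_product_attention, restoring it on exit
    original = torch.nn.functional.scaled_dot_product_attention
    torch.nn.functional.scaled_dot_product_attention = replacement
    try:
        yield
    finally:
        torch.nn.functional.scaled_dot_product_attention = original

# e.g.  with override_sdpa(triton_scaled_dot_product_attention):
#           exported_program = export(model, inputs)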
diff --git a/eval.sh b/eval.sh
deleted file mode 100644
index 57f5740d446..00000000000
--- a/eval.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-# Usage: ./evaluate_kernel.sh <kernel_name> <n_eval>
-KERNEL_NAME=$1
-N_EVAL=$2
-
-# Path prefix
-BASE_PATH=~/kernel-gen/whisper-large-v3-turbo/${KERNEL_NAME}/
-
-MODEL_PATH=${BASE_PATH}model.pte
-DATA_PATH=${BASE_PATH}aoti_cuda_blob.ptd
-TOKENIZER_PATH=${BASE_PATH}
-AUDIO_PATH=${BASE_PATH}output.wav
-PROCESSOR_PATH=${BASE_PATH}whisper_preprocessor.pte
-
-CMD="cmake-out/examples/models/whisper/whisper_runner \
-    --model_path ${MODEL_PATH} \
-    --data_path ${DATA_PATH} \
-    --temperature 0 \
-    --tokenizer_path ${TOKENIZER_PATH} \
-    --audio_path ${AUDIO_PATH} \
-    --processor_path ${PROCESSOR_PATH}"
-
-rates=()
-for ((i=1; i<=N_EVAL; i++)); do
-    echo "Running evaluation $i/$N_EVAL..."
-    output=$($CMD 2>&1)
-    # awk is the recommended way to pull the rate field
-    rate=$(echo "$output" | grep "Generated 128 tokens:" | awk '{print $(NF-1)}')
-    echo "Generated token rate for run $i: $rate"
-    if [[ ! -z "$rate" ]]; then
-        rates+=($rate)
-    fi
-done
-
-# Compute the average
-sum=0
-count=0
-for r in "${rates[@]}"; do
-    # Only count non-empty numeric values
-    if [[ ! -z "$r" ]]; then
-        sum=$(echo "$sum + $r" | bc)
-        count=$((count+1))
-    fi
-done
-
-if [[ $count -gt 0 ]]; then
-    avg=$(echo "scale=2; $sum / $count" | bc)
-    echo "Average Generated token rate over $count runs: $avg tokens/second"
-else
-    echo "No valid token rates found."
-fi
diff --git a/examples/models/__init__.py b/examples/models/__init__.py
index 45abfd8f89d..d08bbfe59ee 100644
--- a/examples/models/__init__.py
+++ b/examples/models/__init__.py
@@ -15,6 +15,7 @@ class Model(str, Enum):
     AddMul = "add_mul"
     Softmax = "softmax"
     Conv1d = "conv1d"
+    Sdpa = "sdpa"
     Dl3 = "dl3"
     Edsr = "edsr"
     EmformerTranscribe = "emformer_transcribe"
@@ -62,6 +63,7 @@ def __str__(self) -> str:
     str(Model.AddMul): ("toy_model", "AddMulModule"),
     str(Model.Softmax): ("toy_model", "SoftmaxModule"),
     str(Model.Conv1d): ("toy_model", "Conv1dModule"),
+    str(Model.Sdpa): ("toy_model", "SdpaModule"),
     str(Model.Dl3): ("deeplab_v3", "DeepLabV3ResNet50Model"),
     str(Model.Edsr): ("edsr", "EdsrModel"),
     str(Model.EmformerTranscribe): ("emformer_rnnt", "EmformerRnntTranscriberModel"),
diff --git a/examples/models/toy_model/__init__.py b/examples/models/toy_model/__init__.py
index 333a625af1b..87456e3fd4c 100644
--- a/examples/models/toy_model/__init__.py
+++ b/examples/models/toy_model/__init__.py
@@ -10,6 +10,7 @@
     Conv1dModule,
     LinearModule,
     MulModule,
+    SdpaModule,
     SoftmaxModule,
 )
 
@@ -19,5 +20,6 @@
     Conv1dModule,
     LinearModule,
     MulModule,
+    SdpaModule,
     SoftmaxModule,
 ]
diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py
index e1dd290b829..ed6a9faf314 100644
--- a/examples/models/toy_model/model.py
+++ b/examples/models/toy_model/model.py
@@ -105,3 +105,33 @@ def get_eager_model(self) -> torch.nn.Module:
 
     def get_example_inputs(self):
         return (torch.randn(1, 3, 10),)
+
+
+class SdpaModule(torch.nn.Module, EagerModelBase):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, query, key, value):
+        out = torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=None,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        return out
+
+    def get_eager_model(self) -> torch.nn.Module:
+        return self
+
+    def get_example_inputs(self):
+        # Input shape: (batch, num_heads, seq_len, head_dim)
+        batch_size = 2
+        num_heads = 8
+        seq_len = 128
+        head_dim = 64
+        query = torch.randn(batch_size, num_heads, seq_len, head_dim)
+        key = torch.randn(batch_size, num_heads, seq_len, head_dim)
+        value = torch.randn(batch_size, num_heads, seq_len, head_dim)
+        return (query, key, value)
diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h
index b587628fd1d..1a30e2cc4df 100644
---
a/extension/runner_util/inputs.h +++ b/extension/runner_util/inputs.h @@ -64,7 +64,7 @@ struct PrepareInputTensorsOptions { * all inputs exceeds this, an error is returned. This prevents allocating too * much memory if the PTE file is malformed. */ - size_t max_total_allocation_size = 1024 * 1024 * 1024 * 10; + size_t max_total_allocation_size = 1024 * 1024 * 1024; /** * The maximum number of inputs to allocate. If the number of inputs exceeds From 9c43c113de5f96e4bc838ae36e2698ed66d89ed7 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 17 Nov 2025 01:17:42 -0800 Subject: [PATCH 6/7] remove test result file --- decomposed-freezing-result.txt | 184 --------------------------------- triton-v2-freeze-result.txt | 184 --------------------------------- triton-v3-result.txt | 184 --------------------------------- 3 files changed, 552 deletions(-) delete mode 100644 decomposed-freezing-result.txt delete mode 100644 triton-v2-freeze-result.txt delete mode 100644 triton-v3-result.txt diff --git a/decomposed-freezing-result.txt b/decomposed-freezing-result.txt deleted file mode 100644 index 375ee7c034b..00000000000 --- a/decomposed-freezing-result.txt +++ /dev/null @@ -1,184 +0,0 @@ -Running Whisper benchmark 50 times... -Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 - - -[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1138ms, Inference: 292ms, First token: 14ms - -[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1162ms, Inference: 297ms, First token: 14ms - -[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 407.64 t/s, Model load: 1096ms, Inference: 314ms, First token: 14ms - -[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1076ms, Inference: 301ms, First token: 14ms - -[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1113ms, Inference: 304ms, First token: 14ms - -[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 400.00 t/s, Model load: 1121ms, Inference: 320ms, First token: 21ms - -[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1149ms, Inference: 308ms, First token: 16ms - -[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte 
--model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1093ms, Inference: 294ms, First token: 14ms - -[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1093ms, Inference: 313ms, First token: 15ms - -[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1105ms, Inference: 298ms, First token: 14ms - -[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1117ms, Inference: 315ms, First token: 14ms - -[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1074ms, Inference: 327ms, First token: 14ms - -[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1086ms, Inference: 294ms, First token: 14ms - -[Run 14/50] Executing: 
cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1133ms, Inference: 331ms, First token: 14ms - -[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1120ms, Inference: 311ms, First token: 15ms - -[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 393.85 t/s, Model load: 1118ms, Inference: 325ms, First token: 18ms - -[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1008ms, Inference: 309ms, First token: 19ms - -[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1074ms, Inference: 296ms, First token: 14ms - -[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1045ms, Inference: 297ms, First token: 14ms - -[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1077ms, Inference: 296ms, First token: 14ms - -[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1095ms, Inference: 298ms, First token: 14ms - -[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 423.84 t/s, Model load: 1019ms, Inference: 302ms, First token: 14ms - -[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1269ms, Inference: 313ms, First token: 14ms - -[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed 
--audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1039ms, Inference: 313ms, First token: 18ms - -[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1080ms, Inference: 297ms, First token: 14ms - -[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1099ms, Inference: 292ms, First token: 14ms - -[Run 27/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1200ms, Inference: 296ms, First token: 14ms - -[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 984ms, Inference: 305ms, First token: 14ms - -[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1110ms, Inference: 305ms, First token: 15ms - -[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1063ms, Inference: 299ms, First token: 14ms - -[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 382.09 t/s, Model load: 1117ms, Inference: 335ms, First token: 14ms - -[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1128ms, Inference: 310ms, First token: 14ms - -[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1173ms, Inference: 299ms, First token: 14ms - -[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 426.67 
t/s, Model load: 1077ms, Inference: 300ms, First token: 14ms
-
-[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0
-✓ Tokens: 128, Throughput: 436.86 t/s, Model load: 1078ms, Inference: 293ms, First token: 14ms
-
-[Run 36/50] ✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1003ms, Inference: 297ms, First token: 16ms
-[Run 37/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1103ms, Inference: 307ms, First token: 15ms
-[Run 38/50] ✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1053ms, Inference: 313ms, First token: 15ms
-[Run 39/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1091ms, Inference: 307ms, First token: 14ms
-[Run 40/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1850ms, Inference: 331ms, First token: 14ms
-[Run 41/50] ✓ Tokens: 128, Throughput: 384.38 t/s, Model load: 1017ms, Inference: 333ms, First token: 14ms
-[Run 42/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1090ms, Inference: 303ms, First token: 14ms
-[Run 43/50] ✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1154ms, Inference: 329ms, First token: 16ms
-[Run 44/50] ✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1085ms, Inference: 342ms, First token: 13ms
-[Run 45/50] ✓ Tokens: 128, Throughput: 426.67 t/s, Model load: 1098ms, Inference: 300ms, First token: 14ms
-[Run 46/50] ✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1093ms, Inference: 312ms, First token: 14ms
-[Run 47/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1099ms, Inference: 304ms, First token: 14ms
-[Run 48/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1113ms, Inference: 303ms, First token: 15ms
-[Run 49/50] ✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1089ms, Inference: 311ms, First token: 19ms
-[Run 50/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1151ms, Inference: 303ms, First token: 15ms
-
-======================================================================
-BENCHMARK SUMMARY
-======================================================================
-Total runs: 50
-Generated tokens per run: 128
-
-THROUGHPUT (tokens/sec):
-  Min:   374.27 t/s
-  Max:   438.36 t/s
-  Mean:  416.41 t/s
-  Stdev: 16.45 t/s
-
-MODEL LOAD TIME (ms):
-  Min:   984 ms
-  Max:   1850 ms
-  Mean:  1112 ms
-  Stdev: 117 ms
-
-INFERENCE TIME (ms, prompt_eval_end to inference_end):
-  Min:   292 ms
-  Max:   342 ms
-  Mean:  308 ms
-  Stdev: 13 ms
-
-FIRST TOKEN LATENCY (ms):
-  Min:   13 ms
-  Max:   21 ms
-  Mean:  15 ms
-  Stdev: 2 ms
-======================================================================
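The summary block above is plain descriptive statistics over the 50 per-run lines, and each per-run throughput is just Tokens divided by inference time (e.g. 128 tokens / 0.303 s ≈ 422.44 t/s for run 50). A minimal sketch of how such a summary could be recomputed from one of these result files, assuming the exact "✓ Tokens: ..." line format shown above (this is not the script that produced the logs):

# sketch: recompute the BENCHMARK SUMMARY from a saved result file
# (assumed log format; not the original benchmarking script)
import re
import statistics

RUN_LINE = re.compile(
    r"Throughput: (?P<tps>[\d.]+) t/s, Model load: (?P<load>\d+)ms, "
    r"Inference: (?P<inf>\d+)ms, First token: (?P<first>\d+)ms"
)

def summarize(path: str) -> None:
    with open(path) as f:
        runs = [m.groupdict() for m in RUN_LINE.finditer(f.read())]
    for name, values in [
        ("THROUGHPUT (tokens/sec)", [float(r["tps"]) for r in runs]),
        ("MODEL LOAD TIME (ms)", [int(r["load"]) for r in runs]),
        ("INFERENCE TIME (ms)", [int(r["inf"]) for r in runs]),
        ("FIRST TOKEN LATENCY (ms)", [int(r["first"]) for r in runs]),
    ]:
        print(name)
        print(f"  Min: {min(values):.2f}  Max: {max(values):.2f}")
        # statistics.stdev is the sample standard deviation (n - 1)
        print(f"  Mean: {statistics.mean(values):.2f}  Stdev: {statistics.stdev(values):.2f}")

summarize("triton-v2-freeze-result.txt")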
diff --git a/triton-v2-freeze-result.txt b/triton-v2-freeze-result.txt
deleted file mode 100644
index 16dfd1da218..00000000000
--- a/triton-v2-freeze-result.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-Running Whisper benchmark 50 times...
-Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0
-
-[Run 1/50] ✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 1070ms, Inference: 326ms, First token: 13ms
-[Run 2/50] ✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1036ms, Inference: 315ms, First token: 14ms
-[Run 3/50] ✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1043ms, Inference: 327ms, First token: 13ms
-[Run 4/50] ✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1027ms, Inference: 330ms, First token: 13ms
-[Run 5/50] ✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 1078ms, Inference: 345ms, First token: 14ms
-[Run 6/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 987ms, Inference: 331ms, First token: 13ms
-[Run 7/50] ✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1112ms, Inference: 329ms, First token: 13ms
-[Run 8/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 982ms, Inference: 337ms, First token: 15ms
-[Run 9/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1040ms, Inference: 331ms, First token: 13ms
-[Run 10/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1048ms, Inference: 334ms, First token: 13ms
-[Run 11/50] ✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 974ms, Inference: 336ms, First token: 13ms
-[Run 12/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 1059ms, Inference: 341ms, First token: 13ms
-[Run 13/50] ✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1010ms, Inference: 330ms, First token: 12ms
-[Run 14/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1001ms, Inference: 331ms, First token: 13ms
-[Run 15/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 1001ms, Inference: 337ms, First token: 13ms
-[Run 16/50] ✓ Tokens: 128, Throughput: 398.75 t/s, Model load: 1093ms, Inference: 321ms, First token: 13ms
-[Run 17/50] ✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 966ms, Inference: 326ms, First token: 12ms
-[Run 18/50] ✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1022ms, Inference: 327ms, First token: 13ms
-[Run 19/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1020ms, Inference: 334ms, First token: 12ms
-[Run 20/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 987ms, Inference: 339ms, First token: 13ms
-[Run 21/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1115ms, Inference: 352ms, First token: 16ms
-[Run 22/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1012ms, Inference: 339ms, First token: 13ms
-[Run 23/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 1079ms, Inference: 346ms, First token: 13ms
-[Run 24/50] ✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1067ms, Inference: 301ms, First token: 14ms
-[Run 25/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1147ms, Inference: 307ms, First token: 14ms
-[Run 26/50] ✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1106ms, Inference: 315ms, First token: 15ms
-[Run 27/50] ✓ Tokens: 128, Throughput: 396.28 t/s, Model load: 1048ms, Inference: 323ms, First token: 12ms
-[Run 28/50] ✓ Tokens: 128, Throughput: 433.90 t/s, Model load: 1098ms, Inference: 295ms, First token: 14ms
-[Run 29/50] ✓ Tokens: 128, Throughput: 287.64 t/s, Model load: 1091ms, Inference: 445ms, First token: 14ms
-[Run 30/50] ✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 18ms
-[Run 31/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1075ms, Inference: 304ms, First token: 21ms
-[Run 32/50] ✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1029ms, Inference: 310ms, First token: 14ms
-[Run 33/50] ✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1177ms, Inference: 313ms, First token: 14ms
-[Run 34/50] ✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1188ms, Inference: 305ms, First token: 15ms
-[Run 35/50] ✓ Tokens: 128, Throughput: 390.24 t/s, Model load: 999ms, Inference: 328ms, First token: 13ms
-[Run 36/50] ✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 1042ms, Inference: 338ms, First token: 13ms
-[Run 37/50] ✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 15ms
-[Run 38/50] ✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1065ms, Inference: 308ms, First token: 18ms
-[Run 39/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1173ms, Inference: 304ms, First token: 14ms
-[Run 40/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1034ms, Inference: 303ms, First token: 16ms
-[Run 41/50] ✓ Tokens: 128, Throughput: 245.21 t/s, Model load: 1118ms, Inference: 522ms, First token: 15ms
-[Run 42/50] ✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 1043ms, Inference: 336ms, First token: 16ms
-[Run 43/50] ✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 999ms, Inference: 330ms, First token: 13ms
-[Run 44/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1262ms, Inference: 334ms, First token: 14ms
-[Run 45/50] ✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1059ms, Inference: 308ms, First token: 14ms
-[Run 46/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1019ms, Inference: 304ms, First token: 14ms
-[Run 47/50] ✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1015ms, Inference: 296ms, First token: 14ms
-[Run 48/50] ✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1074ms, Inference: 312ms, First token: 15ms
-[Run 49/50] ✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1059ms, Inference: 311ms, First token: 15ms
-[Run 50/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1064ms, Inference: 307ms, First token: 14ms
-
-======================================================================
-BENCHMARK SUMMARY
-======================================================================
-Total runs: 50
-Generated tokens per run: 128
-
-THROUGHPUT (tokens/sec):
-  Min:   245.21 t/s
-  Max:   433.90 t/s
-  Mean:  392.57 t/s
-  Stdev: 31.88 t/s
-
-MODEL LOAD TIME (ms):
-  Min:   966 ms
-  Max:   1262 ms
-  Mean:  1059 ms
-  Stdev: 59 ms
-
-INFERENCE TIME (ms, prompt_eval_end to inference_end):
-  Min:   295 ms
-  Max:   522 ms
-  Mean:  329 ms
-  Stdev: 36 ms
-
-FIRST TOKEN LATENCY (ms):
-  Min:   12 ms
-  Max:   21 ms
-  Mean:  14 ms
-  Stdev: 2 ms
-======================================================================
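The "Running Whisper benchmark 50 times..." header implies a small driver loop around whisper_runner. That script is not part of this patch; a hypothetical equivalent, with the command taken verbatim from the log below, might look like:

# hypothetical driver for the 50-run benchmark; the actual script that
# produced these logs is not included in this patch
import subprocess

CMD = [
    "cmake-out/examples/models/whisper/whisper_runner",
    "--model_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte",
    "--data_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd",
    "--tokenizer_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed",
    "--audio_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav",
    "--processor_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte",
    "--model_name", "whisper_large_v3",
    "--temperature", "0",
]

for i in range(50):
    print(f"[Run {i + 1}/50] Executing: {' '.join(CMD)}")
    # whisper_runner itself prints the per-run Tokens/Throughput/latency line
    subprocess.run(CMD, check=True)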
diff --git a/triton-v3-result.txt b/triton-v3-result.txt
deleted file mode 100644
index 2c426a28eee..00000000000
--- a/triton-v3-result.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-Running Whisper benchmark 50 times...
-Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0
-
-[Run 1/50] ✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 969ms, Inference: 347ms, First token: 13ms
-[Run 2/50] ✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 922ms, Inference: 338ms, First token: 12ms
-[Run 3/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 915ms, Inference: 337ms, First token: 13ms
-[Run 4/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1004ms, Inference: 339ms, First token: 12ms
-[Run 5/50] ✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1030ms, Inference: 348ms, First token: 13ms
-[Run 6/50] ✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 988ms, Inference: 348ms, First token: 13ms
-[Run 7/50] ✓ Tokens: 128, Throughput: 355.56 t/s, Model load: 1017ms, Inference: 360ms, First token: 14ms
-[Run 8/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 992ms, Inference: 337ms, First token: 12ms
-[Run 9/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 995ms, Inference: 350ms, First token: 13ms
-[Run 10/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 1014ms, Inference: 341ms, First token: 13ms
-[Run 11/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 944ms, Inference: 346ms, First token: 12ms
-[Run 12/50] ✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 976ms, Inference: 345ms, First token: 16ms
-[Run 13/50] ✓ Tokens: 128, Throughput: 385.54 t/s, Model load: 1046ms, Inference: 332ms, First token: 14ms
-[Run 14/50] ✓ Tokens: 128, Throughput: 352.62 t/s, Model load: 1014ms, Inference: 363ms, First token: 13ms
-[Run 15/50] ✓ Tokens: 128, Throughput: 343.16 t/s, Model load: 1084ms, Inference: 373ms, First token: 14ms
-[Run 16/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 990ms, Inference: 341ms, First token: 13ms
-[Run 17/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 958ms, Inference: 350ms, First token: 13ms
-[Run 18/50] ✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 997ms, Inference: 349ms, First token: 15ms
-[Run 19/50] ✓ Tokens: 128, Throughput: 361.58 t/s, Model load: 1038ms, Inference: 354ms, First token: 13ms
-[Run 20/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1044ms, Inference: 350ms, First token: 12ms
-[Run 21/50] ✓ Tokens: 128, Throughput: 358.54 t/s, Model load: 1027ms, Inference: 357ms, First token: 13ms
-[Run 22/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 943ms, Inference: 346ms, First token: 13ms
-[Run 23/50] ✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1005ms, Inference: 312ms, First token: 14ms
-[Run 24/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1016ms, Inference: 350ms, First token: 13ms
-[Run 25/50] ✓ Tokens: 128, Throughput: 405.06 t/s, Model load: 981ms, Inference: 316ms, First token: 14ms
-[Run 26/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1007ms, Inference: 352ms, First token: 13ms
-[Run 27/50] ✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1067ms, Inference: 342ms, First token: 13ms
-[Run 28/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1100ms, Inference: 352ms, First token: 15ms
-[Run 29/50] ✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 957ms, Inference: 340ms, First token: 13ms
-[Run 30/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1006ms, Inference: 352ms, First token: 13ms
-[Run 31/50] ✓ Tokens: 128, Throughput: 354.57 t/s, Model load: 1040ms, Inference: 361ms, First token: 13ms
-[Run 32/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 973ms, Inference: 334ms, First token: 12ms
-[Run 33/50] ✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 982ms, Inference: 349ms, First token: 12ms
-[Run 34/50] ✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 958ms, Inference: 347ms, First token: 13ms
-[Run 35/50] ✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1031ms, Inference: 309ms, First token: 14ms
-[Run 36/50] ✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 962ms, Inference: 349ms, First token: 13ms
-[Run 37/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 967ms, Inference: 341ms, First token: 18ms
-[Run 38/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 977ms, Inference: 346ms, First token: 16ms
-[Run 39/50] ✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 1016ms, Inference: 353ms, First token: 18ms
-[Run 40/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1175ms, Inference: 339ms, First token: 13ms
-[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name
whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 373.18 t/s, Model load: 964ms, Inference: 343ms, First token: 18ms - -[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1074ms, Inference: 362ms, First token: 14ms - -[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 981ms, Inference: 353ms, First token: 13ms - -[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1010ms, Inference: 348ms, First token: 13ms - -[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1023ms, Inference: 339ms, First token: 12ms - -[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 275.86 t/s, Model load: 1225ms, Inference: 464ms, First token: 19ms - -[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 964ms, Inference: 340ms, First token: 12ms - -[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 979ms, Inference: 334ms, First token: 13ms - -[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1003ms, Inference: 339ms, First token: 12ms - -[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1206ms, Inference: 362ms, First token: 14ms - -====================================================================== -BENCHMARK SUMMARY -====================================================================== -Total runs: 50 -Generated tokens per run: 128 - -THROUGHPUT (tokens/sec): - Min: 275.86 t/s - Max: 414.24 t/s - Mean: 369.34 t/s - Stdev: 18.90 t/s - -MODEL LOAD TIME (ms): - Min: 915 ms - Max: 1225 ms - Mean: 1011 ms - Stdev: 62 ms - -INFERENCE TIME (ms, prompt_eval_end to inference_end): - Min: 309 ms - Max: 464 ms - Mean: 348 ms - Stdev: 21 ms - -FIRST TOKEN LATENCY (ms): - Min: 12 ms - Max: 19 ms - Mean: 14 ms - Stdev: 2 ms -====================================================================== From fcff1c7deb462d08d8e4609d8eddea5cd92ab498 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 17 Nov 2025 01:20:42 -0800 Subject: [PATCH 7/7] remove extra cuda backend changes --- backends/cuda/cuda_backend.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py 
index 3506cb685fe..772e24c75b3 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -178,7 +178,7 @@ def preprocess( # noqa: C901
             if spec.key == "shim_library_path":
                 shim_library_path = spec.value.decode("utf-8")
 
-        assert platform == "linux"
+        assert platform == "linux" or platform == "windows"
         if platform == "windows" and shim_library_path is None:
             lib_dir = resources.files("executorch").joinpath("data/lib")
             shim_library_path = str(lib_dir)
@@ -220,8 +220,6 @@ def preprocess( # noqa: C901
                 f"Could not find required files in compiled paths, got {paths}"
             )
 
-        print("--- Generate .so lives at", so_path)
-
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
            so_data = f.read()
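
Note (illustrative, not part of the patch series): the BENCHMARK SUMMARY above reduces the 50 per-run result lines to min/max/mean/stdev for each metric. Below is a minimal Python sketch of how those statistics could be recomputed from the raw log; the log-line format is taken from the runs above, while the script itself and its names (RESULT_RE, summarize) are hypothetical.

    import re
    import statistics

    # Matches result lines such as:
    # "✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1044ms, Inference: 350ms, First token: 12ms"
    RESULT_RE = re.compile(
        r"Throughput: (?P<tps>[\d.]+) t/s, "
        r"Model load: (?P<load>\d+)ms, "
        r"Inference: (?P<infer>\d+)ms, "
        r"First token: (?P<first>\d+)ms"
    )

    def summarize(log_text: str) -> dict:
        """Collect per-run metrics from a benchmark log and reduce each to min/max/mean/stdev."""
        runs = [m.groupdict() for m in RESULT_RE.finditer(log_text)]
        summary = {}
        for key, cast in (("tps", float), ("load", int), ("infer", int), ("first", int)):
            values = [cast(r[key]) for r in runs]
            summary[key] = {
                "min": min(values),
                "max": max(values),
                "mean": statistics.mean(values),
                # Sample standard deviation, consistent with a summary computed over n runs.
                "stdev": statistics.stdev(values),
            }
        return summary

    # Example usage (hypothetical file name):
    # print(summarize(open("benchmark.log").read()))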