From c225f961e7a281ce6d1fc72fb6b78787964dc3a5 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 11 Nov 2025 11:07:25 -0800
Subject: [PATCH 1/7] init

---
 backends/aoti/common_shims.cpp                    |   4 +
 backends/aoti/common_shims.h                      |   1 +
 backends/aoti/utils.h                             |   2 +
 backends/cuda/cuda_backend.py                     |   6 +-
 backends/cuda/runtime/shims/memory.cpp            |  90 +++++
 backends/cuda/runtime/shims/memory.h              |  25 ++
 backends/cuda/runtime/utils.h                     |   5 +-
 .../optimized_sdpa_triton.py                      | 284 ++++++++++++++++
 custom_triton_playground/sdpa_triton.py           | 317 ++++++++++++++++++
 .../test-sdpa-with-custom-kernel.py               | 151 +++++++++
 extension/runner_util/inputs.h                    |   2 +-
 11 files changed, 880 insertions(+), 7 deletions(-)
 create mode 100644 custom_triton_playground/optimized_sdpa_triton.py
 create mode 100644 custom_triton_playground/sdpa_triton.py
 create mode 100644 custom_triton_playground/test-sdpa-with-custom-kernel.py

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index deb10478778..f1a6cc031b8 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -164,6 +164,10 @@ int32_t aoti_torch_layout_strided() {
 }
 
 // Dtype constants - these return the PyTorch dtype codes
+int32_t aoti_torch_dtype_float16() {
+  return 5; // PyTorch's float16 dtype code
+}
+
 int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 91bb785b684..6600d05b1f5 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -57,6 +57,7 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 // Utility functions for device and layout information
 int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
+int32_t aoti_torch_dtype_float16();
 int32_t aoti_torch_dtype_float32();
 int32_t aoti_torch_dtype_bfloat16();
 int32_t aoti_torch_dtype_int8();
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 8f64bdbe7da..f457b3229c5 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -43,6 +43,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
       return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
+    case 5: // PyTorch's float16 (half) dtype code
+      return executorch::aten::ScalarType::Half;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 11: // PyTorch's bool dtype code
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index f8482835ea5..86571baa47a 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -162,11 +162,7 @@ def preprocess(
             "max_autotune_conv_backends": "TRITON",
         }
 
-        with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
-            [
-                SDPBackend.MATH  # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
-            ]
-        ), torch.no_grad():
+        with collect_unsupported_fallback_kernels(), torch.no_grad():
             # torch._logging.set_logs(post_grad_graphs=True)
             # Here we should expect 1 so file and 1 weight blob in the same directory.
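             # A sketch of the expected artifacts (illustrative filenames only):
             # aot_compile returns a list of output paths, from which the later
             # packaging code picks the compiled kernel library (.so) and the
             # on-disk weight blob, e.g.:
             #   paths -> [".../model.so", ".../aoti_cuda_blob.ptd"]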
             paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 46b8d448a3a..ccaf2538dca 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -582,6 +582,96 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) {
   return Error::Ok;
 }
 
+AOTITorchError aoti_torch_new_tensor_handle(
+    Tensor* orig_handle,
+    Tensor** new_handle) {
+  // Validate input parameters
+  ET_CHECK_OR_RETURN_ERROR(
+      orig_handle != nullptr,
+      InvalidArgument,
+      "aoti_torch_new_tensor_handle failed: orig_handle is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      new_handle != nullptr,
+      InvalidArgument,
+      "aoti_torch_new_tensor_handle failed: new_handle is null");
+
+  // Get metadata from the original tensor
+  int64_t* sizes_ptr;
+  int64_t* strides_ptr;
+  int32_t dtype;
+  int32_t device_type;
+  int32_t device_index;
+
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(orig_handle, &sizes_ptr));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_strides(orig_handle, &strides_ptr));
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(orig_handle, &dtype));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_device_type(orig_handle, &device_type));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_device_index(orig_handle, &device_index));
+
+  int64_t ndim = orig_handle->dim();
+
+  // Validate dtype
+  ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
+
+  // Ensure device_index is always 0
+  ET_CHECK_OR_RETURN_ERROR(
+      device_index == 0,
+      InvalidArgument,
+      "device_index must be 0, got: %d",
+      device_index);
+
+  // Get the original data pointer from the source tensor
+  void* data_ptr = orig_handle->mutable_data_ptr();
+  ET_CHECK_OR_RETURN_ERROR(
+      data_ptr != nullptr,
+      InvalidArgument,
+      "Source tensor has null data pointer");
+
+  // Check if the given memory is in the map
+  auto memory_it = memory_to_n_tensor.find(data_ptr);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it != memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is not being tracked by reference counting system",
+      data_ptr);
+
+  // Convert sizes and strides to vectors
+  std::vector<int64_t> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
+  std::vector<int64_t> strides =
+      convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
+
+  // Create new tensor that shares the same memory as the original
+  // This is similar to PyTorch's Tensor copy constructor - creates a new
+  // tensor object that shares the same underlying storage
+  std::shared_ptr<Tensor> tensor = make_tensor(
+      sizes, // Same sizes as original
+      data_ptr, // Share the same memory from source tensor
+      {}, // dim_order (empty, will be auto-generated)
+      strides, // Same strides as original
+      dtype_to_scalar_type(dtype) // Same dtype as original
+  );
+
+  ET_CHECK_OR_RETURN_ERROR(
+      tensor != nullptr, InvalidArgument, "Failed to create new tensor handle");
+
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
+
+  *new_handle = tensor.get();
+
+  // Increment the reference count for this memory address only if it is owned
+  // by tensor
+  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
+      ? NOT_OWN
+      : memory_to_n_tensor[data_ptr] + 1;
+
+  return Error::Ok;
+}
+
 AOTITorchError aoti_torch__reinterpret_tensor(
     Tensor* self,
     int64_t ndim,
diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
index 7a8d4c3609b..3411f47cda5 100644
--- a/backends/cuda/runtime/shims/memory.h
+++ b/backends/cuda/runtime/shims/memory.h
@@ -114,6 +114,31 @@ AOTITorchError aoti_torch__reinterpret_tensor(
     int64_t storage_offset,
     Tensor** ret_new_tensor);
 
+/**
+ * Creates a new tensor handle from an existing one.
+ *
+ * This function creates a new tensor object that shares the same underlying
+ * memory as the original tensor. Similar to PyTorch's Tensor copy constructor,
+ * it creates a new handle/reference to the same data without performing a deep
+ * copy.
+ *
+ * The new tensor will:
+ * - Share the same memory/storage as the original tensor
+ * - Have the same shape, strides, and dtype as the original
+ * - Increment the reference count for the underlying memory (if owned)
+ *
+ * @param orig_handle Original tensor to create a new handle from (must not be
+ * null)
+ * @param new_handle Output pointer to store the new tensor handle (must not be
+ * null)
+ *
+ * @return Error::Ok on success, appropriate error code on failure:
+ *         - Error::InvalidArgument: null pointers or invalid parameters
+ */
+AOTITorchError aoti_torch_new_tensor_handle(
+    Tensor* orig_handle,
+    Tensor** new_handle);
+
 /**
  * Copies data from source tensor to destination tensor.
  *
diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
index 4474f8cf57e..544322a7fee 100644
--- a/backends/cuda/runtime/utils.h
+++ b/backends/cuda/runtime/utils.h
@@ -61,6 +61,7 @@ enum class SupportedDTypes : int32_t {
   INT16 = 2, // PyTorch's int16 dtype code
   INT32 = 3, // PyTorch's int32 dtype code
   INT64 = 4, // PyTorch's int64 dtype code
+  FLOAT16 = 5, // PyTorch's float16 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   BOOL = 11, // PyTorch's bool dtype code
   BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
@@ -84,6 +85,7 @@ inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
     case static_cast<int32_t>(SupportedDTypes::INT16):
     case static_cast<int32_t>(SupportedDTypes::INT32):
     case static_cast<int32_t>(SupportedDTypes::INT64):
+    case static_cast<int32_t>(SupportedDTypes::FLOAT16):
    case static_cast<int32_t>(SupportedDTypes::FLOAT32):
     case static_cast<int32_t>(SupportedDTypes::BOOL):
     case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
@@ -98,12 +100,13 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
   ET_CHECK_OR_RETURN_ERROR(
       is_dtype_supported_in_et_cuda(dtype),
       InvalidArgument,
-      "Unsupported dtype: %d. Supported dtypes: %d (int8), %d (int16), %d (int32), %d (int64), %d (float32), %d (bool), %d (bfloat16)",
+      "Unsupported dtype: %d. Supported dtypes: %d (int8), %d (int16), %d (int32), %d (int64), %d (float16), %d (float32), %d (bool), %d (bfloat16)",
       dtype,
       static_cast<int32_t>(SupportedDTypes::INT8),
       static_cast<int32_t>(SupportedDTypes::INT16),
       static_cast<int32_t>(SupportedDTypes::INT32),
       static_cast<int32_t>(SupportedDTypes::INT64),
+      static_cast<int32_t>(SupportedDTypes::FLOAT16),
       static_cast<int32_t>(SupportedDTypes::FLOAT32),
       static_cast<int32_t>(SupportedDTypes::BOOL),
       static_cast<int32_t>(SupportedDTypes::BFLOAT16));
diff --git a/custom_triton_playground/optimized_sdpa_triton.py b/custom_triton_playground/optimized_sdpa_triton.py
new file mode 100644
index 00000000000..df267202d29
--- /dev/null
+++ b/custom_triton_playground/optimized_sdpa_triton.py
@@ -0,0 +1,284 @@
+import math
+from typing import Any, Optional
+
+import torch
+import triton
+import triton.language as tl
+from torch.library import triton_op, wrap_triton
+
+
+@triton.autotune(
+    configs=[
+        # Favor configs tuned for HEAD_DIM=64 and L up to ~1500
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_stages=4, num_warps=8),
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 256}, num_stages=4, num_warps=8),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 256}, num_stages=4, num_warps=4),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_stages=4, num_warps=4),
+        triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_stages=3, num_warps=4),
+        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=3, num_warps=4),
+    ],
+    key=["L", "HEAD_DIM"],
+)
+@triton.jit
+def _sdpa_fwd_kernel(
+    q_ptr,
+    k_ptr,
+    v_ptr,
+    o_ptr,
+    B,
+    H,
+    L,
+    HEAD_DIM,
+    stride_qb,
+    stride_qh,
+    stride_ql,
+    stride_qd,
+    stride_kb,
+    stride_kh,
+    stride_kl,
+    stride_kd,
+    stride_vb,
+    stride_vh,
+    stride_vl,
+    stride_vd,
+    stride_ob,
+    stride_oh,
+    stride_ol,
+    stride_od,
+    sm_scale,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    HEAD_DIM_CE: tl.constexpr,
+):
+    # Program IDs
+    pid_m = tl.program_id(axis=0)  # along query length
+    pid_hz = tl.program_id(axis=1)  # flattened batch*head
+
+    off_b = pid_hz // H
+    off_h = pid_hz % H
+
+    # Compute ranges
+    start_m = pid_m * BLOCK_M
+    offs_m = start_m + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, HEAD_DIM_CE)
+    mask_m = offs_m < L
+
+    # Base pointers for this (b, h)
+    q_base = q_ptr + off_b * stride_qb + off_h * stride_qh
+    k_base = k_ptr + off_b * stride_kb + off_h * stride_kh
+    v_base = v_ptr + off_b * stride_vb + off_h * stride_vh
+    o_base = o_ptr + off_b * stride_ob + off_h * stride_oh
+
+    # Make head-dim addresses compiler-friendly
+    offs_d_ctg = tl.max_contiguous(tl.multiple_of(offs_d, 16), HEAD_DIM_CE)
+
+    # Load Q tile [BLOCK_M, HEAD_DIM] - coalesced along HEAD_DIM
+    q_ptrs = q_base + (offs_m[:, None] * stride_ql + offs_d_ctg[None, :] * stride_qd)
+    q = tl.load(q_ptrs, mask=mask_m[:, None], other=0.0)
+    q = q.to(tl.bfloat16)
+
+    # Initialize accumulators and softmax stats
+    acc = tl.zeros((BLOCK_M, HEAD_DIM_CE), dtype=tl.float32)
+    m_i = tl.full((BLOCK_M,), -float("inf"), dtype=tl.float32)
+    l_i = tl.zeros((BLOCK_M,), dtype=tl.float32)
+
+    # Convert to base-2 scale for exp2
+    qk_scale = sm_scale * 1.4426950408889634
+
+    # Loop over keys/values along sequence length in tiles of BLOCK_N
+    # Load K as [BLOCK_N, HEAD_DIM] for coalesced reads, then use tl.trans(K) in dot
+    for start_n in tl.range(0, L, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
+        mask_n = offs_n < L
+
+        # Load K tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM)
+        k_ptrs = k_base + (
+            offs_n[:, None] * stride_kl + offs_d_ctg[None, :] * stride_kd
+        )
+        k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0)
+        k = 
k.to(tl.bfloat16) + + # Compute attention logits [BLOCK_M, BLOCK_N] = Q[BM,D] @ K[BN,D]^T + qk = tl.dot(q, tl.trans(k)).to(tl.float32) # accumulator in fp32 + qk = qk * qk_scale + + # Apply OOB masks for both rows and cols to keep stability + qk = tl.where(mask_n[None, :], qk, -float("inf")) + qk = tl.where(mask_m[:, None], qk, -float("inf")) + + # Online softmax + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + p = tl.math.exp2(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + alpha = tl.math.exp2(m_i - m_ij) + + # Load V tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM) + v_ptrs = v_base + ( + offs_n[:, None] * stride_vl + offs_d_ctg[None, :] * stride_vd + ) + v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0) + v = v.to(tl.bfloat16) + + # Update accumulator + acc = acc * alpha[:, None] + # Cast p to bf16 to use tensor-cores in tl.dot; accumulate in fp32 + p_bf16 = p.to(tl.bfloat16) + acc = tl.dot(p_bf16, v, acc) + + # Update softmax stats + l_i = l_i * alpha + l_ij + m_i = m_ij + + # Normalize accumulator by softmax denominator + acc = acc / l_i[:, None] + + # Store output [BLOCK_M, HEAD_DIM] + o_ptrs = o_base + (offs_m[:, None] * stride_ol + offs_d_ctg[None, :] * stride_od) + tl.store(o_ptrs, acc.to(tl.bfloat16), mask=mask_m[:, None]) + + +@triton_op("custom::optimized_triton_scaled_dot_product_attention", mutates_args={}) +def optimized_triton_scaled_dot_product_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: float = 0.0, + enable_gqa: bool = False, +) -> torch.Tensor: + """ + Triton fused Scaled Dot-Product Attention (forward, no causal, no dropout). + Expected shapes (tested): [B=1, H=20, L<=1500, D=64], dtype bfloat16. + + Args: + query: Query tensor [B, H, L, D] + key: Key tensor [B, H, L, D] + value: Value tensor [B, H, L, D] + attn_mask: must be None (not supported) + dropout_p: must be 0.0 (not supported) + is_causal: must be False (not supported) + scale: must be 0.0 (not supported) + enable_gqa: must be False (not supported) + + Returns: + Output tensor [B, H, L, D] + """ + # Validate inputs + if not (query.is_cuda and key.is_cuda and value.is_cuda): + raise RuntimeError("Q, K, V must be CUDA tensors.") + if ( + query.dtype != torch.bfloat16 + or key.dtype != torch.bfloat16 + or value.dtype != torch.bfloat16 + ): + raise RuntimeError("Expected bfloat16 inputs") + if query.shape != key.shape or query.shape != value.shape: + raise RuntimeError( + f"Q, K, V must have identical shapes; got query={query.shape}, key={key.shape}, value={value.shape}." + ) + if query.dim() != 4: + raise RuntimeError( + f"Expected 4D tensors shaped [B, H, L, D]; got {query.dim()}D." + ) + + # Enforce that only default values are accepted for these arguments + if attn_mask is not None: + raise RuntimeError( + "attn_mask must be None (not supported in this implementation)." + ) + if dropout_p != 0.0: + raise RuntimeError( + "dropout_p must be 0.0 (not supported in this implementation)." + ) + if is_causal is not False: + raise RuntimeError( + "is_causal must be False (not supported in this implementation)." + ) + if scale != 0.0: + raise RuntimeError("scale must be 0.0 (not supported in this implementation).") + if enable_gqa is not False: + raise RuntimeError( + "enable_gqa must be False (not supported in this implementation)." 
+ ) + + B, H, L, D = query.shape + # Allocate output + out = torch.empty_like(query) + + # Element-wise strides (in elements) + sqb, sqh, sql, sqd = query.stride() + skb, skh, skl, skd = key.stride() + svb, svh, svl, svd = value.stride() + sob, soh, sol, sod = out.stride() + + # Grid: tile queries (M) and batch*heads axis + def grid(META): + return ( + triton.cdiv(L, META["BLOCK_M"]), + B * H, + ) + + # Scale factor for SDPA + sm_scale = 1.0 / math.sqrt(D) + + # Launch kernel using wrap_triton to avoid tracing issues during export/compile + # Note: wrap_triton returns a callable that can be indexed with grid + wrap_triton(_sdpa_fwd_kernel)[grid]( + query, + key, + value, + out, + B, + H, + L, + D, + sqb, + sqh, + sql, + sqd, + skb, + skh, + skl, + skd, + svb, + svh, + svl, + svd, + sob, + soh, + sol, + sod, + sm_scale, + HEAD_DIM_CE=D, + ) + + return out + + +# Register the abstract/fake implementation for torch.export +# This is critical to avoid accessing real tensor data during export +@optimized_triton_scaled_dot_product_attention.register_fake +def _optimized_triton_sdpa_abstract( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale=None, + enable_gqa=False, +) -> torch.Tensor: + """ + Abstract/fake implementation for torch.export. + This just returns an empty tensor with the correct shape/dtype/device. + No actual computation happens here - this is only for shape inference during export. + """ + # Validate shapes match + assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" + assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" + + # Output has the same shape and dtype as query + # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation + return torch.empty_like(query, dtype=query.dtype, device=query.device) diff --git a/custom_triton_playground/sdpa_triton.py b/custom_triton_playground/sdpa_triton.py new file mode 100644 index 00000000000..09233e557bd --- /dev/null +++ b/custom_triton_playground/sdpa_triton.py @@ -0,0 +1,317 @@ +# kernel.py +import math +from typing import Any, Optional + +import torch +import triton +import triton.language as tl +from torch.library import triton_op, wrap_triton + + +""" +Fused Scaled Dot-Product Attention (SDPA) implemented in a single Triton kernel. + +This module provides a transparent replacement for torch.nn.functional.scaled_dot_product_attention +using a custom Triton kernel. The replacement is automatic - no model code changes needed! + +How it works: +1. We register a custom implementation using torch.library +2. When torch.nn.functional.scaled_dot_product_attention is called, + PyTorch's dispatch mechanism routes it to our implementation during AOTI compilation +3. The model code remains unchanged + +What is fused: +- We fuse QK^T matmul, numerically-stable online softmax, and the final + multiplication by V into one streaming kernel. No intermediate attention + matrix is materialized in memory. + +Design notes: +- We tile along the query (sequence) dimension with BLOCK_M rows and iterate + over the key/value sequence dimension in BLOCK_N columns. +- For each (batch, head) pair and query tile, we: + * Load a tile of Q once and keep it in registers. 
+ * Stream over K/V in blocks: compute qk = Q @ K^T, update running row-wise + softmax statistics (m_i, l_i) and the output accumulator acc = sum(p * V) + using the "online softmax" algorithm: + m_new = max(m_old, max(qk)) + p = exp(qk - m_new) + acc = acc * exp(m_old - m_new) + p @ V + l_new = l_old * exp(m_old - m_new) + sum(p) + m_old = m_new + * Finally, write O = acc / l_i. +- All accumulation is done in fp32 for numerical stability; inputs/outputs are fp16. +- Boundary conditions are handled with masks. +- The Python wrapper only validates inputs, allocates outputs, configures the grid, + and launches the Triton kernel. All math is inside the Triton kernel. + +Runtime constraints respected: +- No torch.nn or torch.nn.functional is used in the execution path. +- No PyTorch compute ops are used to implement the algorithm; all math happens + in Triton via tl.load/tl.store/tl.dot/tl.exp/tl.max/tl.sum. +""" + + +@triton.jit +def _sdpa_fwd_kernel( + q_ptr, + k_ptr, + v_ptr, + o_ptr, + B, + H, + S, + D, # shapes + stride_qb, + stride_qh, + stride_qs, + stride_qd, + stride_kb, + stride_kh, + stride_ks, + stride_kd, + stride_vb, + stride_vh, + stride_vs, + stride_vd, + stride_ob, + stride_oh, + stride_os, + stride_od, + scale, # 1/sqrt(D) + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + HEAD_DIM: tl.constexpr, +): + # Program IDs + pid_m = tl.program_id(0) # along sequence dimension (queries) + pid_bh = tl.program_id(1) # across batch*heads + + b = pid_bh // H + h = pid_bh % H + + # Offsets for this block of queries + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM) + + # Base pointers for this (b, h) + q_bh = q_ptr + b * stride_qb + h * stride_qh + k_bh = k_ptr + b * stride_kb + h * stride_kh + v_bh = v_ptr + b * stride_vb + h * stride_vh + o_bh = o_ptr + b * stride_ob + h * stride_oh + + # Load Q tile: [BLOCK_M, HEAD_DIM] + q_ptrs = q_bh + (offs_m[:, None] * stride_qs + offs_d[None, :] * stride_qd) + q_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D) + q = tl.load(q_ptrs, mask=q_mask, other=0.0) + + # Initialize online-softmax stats and output accumulator + m_i = tl.full([BLOCK_M], -float("inf"), dtype=tl.float32) + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32) + + # Iterate over keys/values in blocks of BLOCK_N + for start_n in tl.range(0, S, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + kv_mask_cols = offs_n < S + + # Load K in a layout suitable for qk = q @ kT: + # k_ptrs produces a tensor of shape [HEAD_DIM, BLOCK_N] + k_ptrs = k_bh + (offs_n[None, :] * stride_ks + offs_d[:, None] * stride_kd) + k = tl.load( + k_ptrs, mask=(offs_d[:, None] < D) & (kv_mask_cols[None, :]), other=0.0 + ) + + # qk = [BLOCK_M, BLOCK_N] in fp32 + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, qk) + qk = qk * scale # scale by 1/sqrt(D) + + # Mask out-of-bounds columns so they don't affect max/sum + qk = tl.where(kv_mask_cols[None, :], qk, -float("inf")) + + # Online softmax update + m_ij = tl.maximum(m_i, tl.max(qk, axis=1)) + p = tl.exp(qk - m_ij[:, None]) # fp32 + alpha = tl.exp(m_i - m_ij) + l_i = l_i * alpha + tl.sum(p, axis=1) + m_i = m_ij + + # Load V tile: [BLOCK_N, HEAD_DIM] + v_ptrs = v_bh + (offs_n[:, None] * stride_vs + offs_d[None, :] * stride_vd) + v = tl.load( + v_ptrs, mask=(kv_mask_cols[:, None]) & (offs_d[None, :] < D), other=0.0 + ) + + # Update output accumulator: acc = acc * alpha + p @ v + acc = acc * alpha[:, None] + # Use fp16 inputs for tl.dot with fp32 
accumulation
+        acc = tl.dot(p.to(tl.float16), v.to(tl.float16), acc)
+
+    # Normalize: O = acc / l_i[:, None]
+    o = acc / l_i[:, None]
+    # Store O in fp16
+    o_ptrs = o_bh + (offs_m[:, None] * stride_os + offs_d[None, :] * stride_od)
+    o_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D)
+    tl.store(o_ptrs, o.to(tl.float16), mask=o_mask)
+
+
+@triton_op("custom::scaled_dot_product_attention", mutates_args={})
+def triton_scaled_dot_product_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale: float = 0.0,
+    enable_gqa: bool = False,
+) -> torch.Tensor:
+    """
+    Fused Scaled Dot-Product Attention registered as a custom op:
+        O = softmax(Q @ K^T / sqrt(D)) @ V
+    where Q, K, V are shaped [batch, heads, seq_len, head_dim].
+
+    This function is registered with @triton_op so AOTI can discover and use it
+    during compilation as a replacement for torch.nn.functional.scaled_dot_product_attention.
+
+    Wrapper responsibilities:
+    - Validate input tensors (dtype/device/shapes)
+    - Allocate output tensor
+    - Configure grid and launch the Triton kernel
+    - No math is done here beyond basic scalar setup; all heavy compute runs in the Triton kernel.
+
+    Fusion details:
+    - This launches a single kernel that computes QK^T, performs online softmax,
+      and multiplies by V to produce O, all in one pass over K/V blocks.
+    - No intermediate attention matrix is written to global memory.
+
+    Args:
+        query: Query tensor [B, H, S, D]
+        key: Key tensor [B, H, S, D]
+        value: Value tensor [B, H, S, D]
+        attn_mask: has to be None
+        dropout_p: has to be 0.0
+        is_causal: has to be False
+        scale: has to be 0.0
+        enable_gqa: has to be False
+
+    Returns:
+        Output tensor [B, H, S, D]
+    """
+    # Basic validation
+    if not (query.is_cuda and key.is_cuda and value.is_cuda):
+        raise RuntimeError("Q, K, V must be CUDA tensors.")
+    if (
+        query.dtype != torch.float16
+        or key.dtype != torch.float16
+        or value.dtype != torch.float16
+    ):
+        raise RuntimeError("This reference implementation expects float16 tensors.")
+    if query.shape != key.shape or query.shape != value.shape:
+        raise RuntimeError(
+            f"Q, K, V must have identical shapes; got Q={query.shape}, K={key.shape}, V={value.shape}."
+        )
+    if query.dim() != 4:
+        raise RuntimeError(
+            f"Expected 4D tensors shaped [B, H, S, D]; got {query.dim()}D."
+        )
+
+    # Enforce that only default values are accepted for these arguments
+    if attn_mask is not None:
+        raise RuntimeError(
+            "attn_mask must be None (not supported in this implementation)."
+        )
+
+    if dropout_p != 0.0:
+        raise RuntimeError(
+            "dropout_p must be 0.0 (not supported in this implementation)."
+        )
+    if is_causal is not False:
+        raise RuntimeError(
+            "is_causal must be False (not supported in this implementation)."
+        )
+    if scale != 0:
+        raise RuntimeError("scale must be 0.0 (not supported in this implementation).")
+    if enable_gqa is not False:
+        raise RuntimeError(
+            "enable_gqa must be False (not supported in this implementation)."
+ ) + + B, H, S, D = query.shape + + # Allocate output + O = torch.empty_like(query) + + # Choose tiling parameters (powers of two, coalesced-friendly) + # Conservative sizes to keep register/SMEM pressure reasonable for D=1024 + BLOCK_M = 16 + BLOCK_N = 32 + + # Compute softmax scale on host (scalar) - this is setup, not heavy math + scale = 1.0 / math.sqrt(float(D)) + + # Grid: one program per (query block, batch*head) + grid = (triton.cdiv(S, BLOCK_M), B * H) + + # Launch kernel using wrap_triton to avoid tracing issues during export/compile + # Note: wrap_triton returns a callable that can be indexed with grid + wrap_triton(_sdpa_fwd_kernel)[grid]( + query, + key, + value, + O, + B, + H, + S, + D, + query.stride(0), + query.stride(1), + query.stride(2), + query.stride(3), + key.stride(0), + key.stride(1), + key.stride(2), + key.stride(3), + value.stride(0), + value.stride(1), + value.stride(2), + value.stride(3), + O.stride(0), + O.stride(1), + O.stride(2), + O.stride(3), + scale, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + HEAD_DIM=D, + num_warps=4, + num_stages=2, + ) + + return O + + +# Register the abstract/fake implementation for torch.export +# This is critical to avoid accessing real tensor data during export +@triton_scaled_dot_product_attention.register_fake +def _triton_sdpa_abstract( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale=None, + enable_gqa=False, +) -> torch.Tensor: + """ + Abstract/fake implementation for torch.export. + This just returns an empty tensor with the correct shape/dtype/device. + No actual computation happens here - this is only for shape inference during export. + """ + # Validate shapes match + assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" + assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" + + # Output has the same shape and dtype as query + # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation + return torch.empty_like(query, dtype=query.dtype, device=query.device) diff --git a/custom_triton_playground/test-sdpa-with-custom-kernel.py b/custom_triton_playground/test-sdpa-with-custom-kernel.py new file mode 100644 index 00000000000..16883bc83d9 --- /dev/null +++ b/custom_triton_playground/test-sdpa-with-custom-kernel.py @@ -0,0 +1,151 @@ +# ============================================================================ +# IMPORTANT: Import sdpa_triton BEFORE defining the model +# This automatically enables the custom Triton kernel via monkey-patching +# ============================================================================ +import argparse +import os +from contextlib import nullcontext + +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower +from optimized_sdpa_triton import optimized_triton_scaled_dot_product_attention +from sdpa_triton import triton_scaled_dot_product_attention +from torch.export import Dim, export +from torch.nn.attention import SDPBackend + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, query, key, value): + # This is the ORIGINAL code - we're NOT changing it! 
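+        # (A minimal sketch of the swap performed by init_sdpa_kernel() below,
+        # assuming the "optimized_triton" mode was selected:
+        #   torch.nn.functional.scaled_dot_product_attention = (
+        #       optimized_triton_scaled_dot_product_attention
+        #   )
+        # so this unchanged call resolves to the custom Triton op at trace time.)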
+        # But it will automatically use our custom Triton kernel
+        # because we imported sdpa_triton above
+        out = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+        return out
+
+
+sdpa_ctx = nullcontext()
+
+
+# Hacky helper to replace the system SDPA with one of our Triton implementations
+def init_sdpa_kernel(custom_triton):
+    global sdpa_ctx
+    if custom_triton == "decomposed_kernel":
+        sdpa_ctx = torch.nn.attention.sdpa_kernel([SDPBackend.MATH])
+    elif custom_triton == "unoptimized_triton":
+        torch.nn.functional.scaled_dot_product_attention = (
+            triton_scaled_dot_product_attention
+        )
+    elif custom_triton == "optimized_triton":
+        torch.nn.functional.scaled_dot_product_attention = (
+            optimized_triton_scaled_dot_product_attention
+        )
+    else:
+        assert False, f"{custom_triton} is not supported yet"
+
+
+def main(kernel_type, output_dir, dtype):
+    print(f"Using kernel type: {kernel_type}")
+    print(f"Using dtype: {dtype}")
+    init_sdpa_kernel(kernel_type)
+
+    model = Model()
+    batch_size, num_heads, seq_len, head_dim = 1, 20, 1500, 64
+
+    # Map dtype string to torch dtype
+    dtype_map = {
+        "fp16": torch.float16,
+        "bf16": torch.bfloat16,
+    }
+    torch_dtype = dtype_map[dtype]
+
+    # Create inputs with specified dtype
+    inputs = (
+        torch.randn(
+            batch_size,
+            num_heads,
+            seq_len,
+            head_dim,
+            dtype=torch_dtype,
+            device="cuda",
+        ),
+        torch.randn(
+            batch_size,
+            num_heads,
+            seq_len,
+            head_dim,
+            dtype=torch_dtype,
+            device="cuda",
+        ),
+        torch.randn(
+            batch_size,
+            num_heads,
+            seq_len,
+            head_dim,
+            dtype=torch_dtype,
+            device="cuda",
+        ),
+    )
+
+    print("Testing model execution with custom kernel...")
+    with torch.no_grad():
+        output = model(*inputs)
+    print(f"✓ Model executed successfully. Output shape: {output.shape}\n")
+
+    print("Exporting model...")
+    exported_program = export(model, inputs)
+    print("✓ Model exported successfully\n")
+
+    print("Lowering to ExecuTorch CUDA backend (using AOTI)...")
+    with sdpa_ctx, torch.no_grad():
+        executorch_program = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[
+                CudaPartitioner(
+                    [CudaBackend.generate_method_name_compile_spec("forward")]
+                )
+            ],
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        ).to_executorch()
+    print("✓ Model lowered successfully with AOTI\n")
+
+    print("Saving model...")
+    os.makedirs(output_dir, exist_ok=True)
+    with open(os.path.join(output_dir, "model.pte"), "wb") as file:
+        file.write(executorch_program.buffer)
+
+    executorch_program.write_tensor_data_to_file(output_dir)
+    print(f"✓ PTE and PTD files have been successfully dumped to {output_dir}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Test SDPA with custom kernel")
+    parser.add_argument(
+        "--kernel_type",
+        type=str,
+        choices=["unoptimized_triton", "optimized_triton", "decomposed_kernel"],
+        help="Type of kernel to use",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=".",
+        help="Directory to save model.pte and tensor data (default: current directory)",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["fp16", "bf16"],
+        default="bf16",
+        help="Data type for model inputs (default: bf16)",
+    )
+
+    args = parser.parse_args()
+
+    main(args.kernel_type, args.output_dir, args.dtype)
diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h
index 1a30e2cc4df..b587628fd1d 100644
--- a/extension/runner_util/inputs.h
+++ b/extension/runner_util/inputs.h
@@ -64,7 +64,7 @@ struct PrepareInputTensorsOptions {
 * all inputs exceeds this, an error is returned. This prevents allocating too
 * much memory if the PTE file is malformed.
 */
-  size_t max_total_allocation_size = 1024 * 1024 * 1024;
+  size_t max_total_allocation_size = 10ULL * 1024 * 1024 * 1024;
 
  /**
   * The maximum number of inputs to allocate. If the number of inputs exceeds

From 3cd8bbf264ae39a368ced9840c8d85c22c269c21 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 12 Nov 2025 13:53:37 -0800
Subject: [PATCH 2/7] temp save

---
 backends/cuda/cuda_backend.py |  3 ++
 eval.sh                       | 52 +++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 eval.sh

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 86571baa47a..bfafe75559c 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -152,6 +152,7 @@ def preprocess(
         # Separate weight constants from the .so file
         "aot_inductor.package": True,
         "aot_inductor.package_constants_in_so": False,
+        "aot_inductor.freezing": True,
         # Store weight constants on disk in a binary blob
         "aot_inductor.package_constants_on_disk_format": "binary_blob",
         # Enable maximum automatic tuning for optimal performance
@@ -187,6 +188,8 @@ def preprocess(
             f"Could not find required files in compiled paths, got {paths}"
         )
 
+        print("--- Generated .so lives at", so_path)
+
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
             so_data = f.read()
diff --git a/eval.sh b/eval.sh
new file mode 100644
index 00000000000..57f5740d446
--- /dev/null
+++ b/eval.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Usage: ./evaluate_kernel.sh <kernel_name> <n_eval>
+KERNEL_NAME=$1
+N_EVAL=$2
+
+# Path prefix
+BASE_PATH=~/kernel-gen/whisper-large-v3-turbo/${KERNEL_NAME}/
+
+MODEL_PATH=${BASE_PATH}model.pte
+DATA_PATH=${BASE_PATH}aoti_cuda_blob.ptd
+TOKENIZER_PATH=${BASE_PATH}
+AUDIO_PATH=${BASE_PATH}output.wav
+PROCESSOR_PATH=${BASE_PATH}whisper_preprocessor.pte
+
+CMD="cmake-out/examples/models/whisper/whisper_runner \
+  --model_path ${MODEL_PATH} \
+  --data_path ${DATA_PATH} \
+  --temperature 0 \
+  --tokenizer_path ${TOKENIZER_PATH} \
+  --audio_path ${AUDIO_PATH} \
+  --processor_path ${PROCESSOR_PATH}"
+
+rates=()
+for ((i=1; i<=N_EVAL; i++)); do
+  echo "Running evaluation $i/$N_EVAL..."
+  output=$($CMD 2>&1)
+  # awk is the recommended way to pull the rate out of the log line
+  rate=$(echo "$output" | grep "Generated 128 tokens:" | awk '{print $(NF-1)}')
+  echo "Generated token rate for run $i: $rate"
+  if [[ ! -z "$rate" ]]; then
+    rates+=($rate)
+  fi
+done
+
+# Compute the average
+sum=0
+count=0
+for r in "${rates[@]}"; do
+  # Only count non-empty values
+  if [[ ! -z "$r" ]]; then
+    sum=$(echo "$sum + $r" | bc)
+    count=$((count+1))
+  fi
+done
+
+if [[ $count -gt 0 ]]; then
+  avg=$(echo "scale=2; $sum / $count" | bc)
+  echo "Average Generated token rate over $count runs: $avg tokens/second"
+else
+  echo "No valid token rates found."
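+  # (illustrative) bc is what does the fixed-point averaging above, e.g.:
+  #   echo "scale=2; (438.36 + 430.98) / 2" | bc   # -> 434.67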
+fi

From ec8b0e8a4263c8222ce7805b9f64043f2d529bfb Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 12 Nov 2025 15:36:36 -0800
Subject: [PATCH 3/7] remove wrong freezing attribute

---
 backends/cuda/cuda_backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index bfafe75559c..b871eb75c1a 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -152,7 +152,6 @@ def preprocess(
         # Separate weight constants from the .so file
         "aot_inductor.package": True,
         "aot_inductor.package_constants_in_so": False,
-        "aot_inductor.freezing": True,
         # Store weight constants on disk in a binary blob
         "aot_inductor.package_constants_on_disk_format": "binary_blob",
         # Enable maximum automatic tuning for optimal performance

From 638ad62c49e747b3395b43c5fd82b92ccfaea26f Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 13 Nov 2025 11:19:28 -0800
Subject: [PATCH 4/7] temp save

---
 backends/aoti/common_shims.cpp |   7 -
 backends/aoti/common_shims.h   |   3 -
 backends/cuda/cuda_backend.py  |   8 +-
 benchmarking.py                | 272 +++++++++++++++++++++++++++
 decomposed-freezing-result.txt | 184 ++++++++++++++++++++
 triton-v2-freeze-result.txt    | 184 ++++++++++++++++++++
 triton-v3-result.txt           | 184 ++++++++++++++++++++
 7 files changed, 827 insertions(+), 15 deletions(-)
 create mode 100644 benchmarking.py
 create mode 100644 decomposed-freezing-result.txt
 create mode 100644 triton-v2-freeze-result.txt
 create mode 100644 triton-v3-result.txt

diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index dcb4c7b37d4..5fa5c5beef5 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -242,13 +242,6 @@ aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) {
   return Error::Internal;
 }
 
-AOTI_SHIM_EXPORT AOTITorchError
-aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle) {
-  (void)orig_handle;
-  (void)new_handle;
-  throw std::runtime_error("Not implemented");
-  return Error::Internal;
-}
 
 AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
     void* data_ptr,
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 5ae583b81a8..7b4b3202731 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -97,9 +97,6 @@ aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor);
 AOTI_SHIM_EXPORT AOTITorchError
 aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor);
 
-AOTI_SHIM_EXPORT AOTITorchError
-aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle);
-
 AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
     void* data_ptr,
     int64_t ndim,
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 8d3c7b3db50..487b0d64c1d 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -141,9 +141,8 @@ def preprocess(  # noqa: C901
                 user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
-            # Disable this to support sdpa decomposition
-            # TODO(gasoonjia): remove it after pin bump to latest pytorch
-            "loop_ordering_after_fusion": False,
+            # Freeze weights during inference for better performance and more optimizations such as kernel fusion
+            "freezing": True,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
@@ -163,7 +162,6 @@ def preprocess(  # noqa: C901
             "max_autotune_conv_backends": "TRITON",
         }
 
-        platform = "linux"
         shim_library_path = None
         for spec in
compile_specs: @@ -172,7 +170,7 @@ def preprocess( # noqa: C901 if spec.key == "shim_library_path": shim_library_path = spec.value.decode("utf-8") - assert platform == "linux" or platform == "windows" + assert platform == "linux" if platform == "windows" and shim_library_path is None: lib_dir = resources.files("executorch").joinpath("data/lib") shim_library_path = str(lib_dir) diff --git a/benchmarking.py b/benchmarking.py new file mode 100644 index 00000000000..d309c25bf9b --- /dev/null +++ b/benchmarking.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Benchmark script for Whisper ASR runner. +Runs the whisper_runner command multiple times and collects throughput metrics. +""" +import argparse +import json +import os +import statistics +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + + +@dataclass +class RunMetrics: + """Metrics from a single run.""" + + generated_tokens: int + tokens_per_sec: float + model_load_time_ms: float + inference_time_ms: float + prompt_eval_to_end_ms: float + first_token_latency_ms: float + + def __repr__(self): + return ( + f"Tokens: {self.generated_tokens}, " + f"Throughput: {self.tokens_per_sec:.2f} t/s, " + f"Model load: {self.model_load_time_ms:.0f}ms, " + f"Inference: {self.inference_time_ms:.0f}ms, " + f"First token: {self.first_token_latency_ms:.0f}ms" + ) + + +def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]: + """Parse PyTorchObserver JSON output and compute metrics.""" + try: + # Find the JSON part in the log line + if "PyTorchObserver" not in log_line: + return None + + json_str = log_line.split("PyTorchObserver")[1].strip() + data = json.loads(json_str) + + # Extract values + generated_tokens = data.get("generated_tokens", 0) + inference_end_ms = data.get("inference_end_ms", 0) + prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0) + first_token_ms = data.get("first_token_ms", 0) + model_load_start_ms = data.get("model_load_start_ms", 0) + model_load_end_ms = data.get("model_load_end_ms", 0) + + # Compute metrics + prompt_eval_to_end_ms = inference_end_ms - prompt_eval_end_ms + tokens_per_sec = ( + (generated_tokens / prompt_eval_to_end_ms * 1000) + if prompt_eval_to_end_ms > 0 + else 0 + ) + model_load_time_ms = model_load_end_ms - model_load_start_ms + inference_time_ms = inference_end_ms - prompt_eval_end_ms + first_token_latency_ms = first_token_ms - prompt_eval_end_ms + + return RunMetrics( + generated_tokens=generated_tokens, + tokens_per_sec=tokens_per_sec, + model_load_time_ms=model_load_time_ms, + inference_time_ms=inference_time_ms, + prompt_eval_to_end_ms=prompt_eval_to_end_ms, + first_token_latency_ms=first_token_latency_ms, + ) + except (json.JSONDecodeError, KeyError, ValueError) as e: + print(f"Error parsing PyTorchObserver log: {e}", file=sys.stderr) + return None + + +def run_whisper_benchmark( + command: str, num_runs: int = 5, verbose: bool = False +) -> List[RunMetrics]: + """ + Run the whisper_runner command multiple times and collect metrics. 
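+
+    Each run is expected to print one "PyTorchObserver {...}" JSON line; an
+    illustrative example of the line parse_pytorch_observer_log consumes
+    (field values are made up, keys match the parser above):
+        PyTorchObserver {"generated_tokens": 128, "model_load_start_ms": 0,
+        "model_load_end_ms": 1100, "prompt_eval_end_ms": 100,
+        "first_token_ms": 114, "inference_end_ms": 398}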
+ + Args: + command: Full command to run + num_runs: Number of times to run the command + verbose: Print detailed output + + Returns: + List of RunMetrics from each run + """ + results = [] + + for run_num in range(1, num_runs + 1): + print(f"\n[Run {run_num}/{num_runs}] Executing: {command}") + + try: + # Run command and capture output + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode != 0: + print( + f"Error: Command failed with return code {result.returncode}", + file=sys.stderr, + ) + if result.stderr: + print(f"stderr: {result.stderr}", file=sys.stderr) + continue + + # Search for PyTorchObserver line in output + observer_line = None + for line in result.stdout.split("\n"): + if "PyTorchObserver" in line: + observer_line = line + break + + if observer_line is None: + print( + f"Warning: No PyTorchObserver output found in run {run_num}", + file=sys.stderr, + ) + if verbose: + print(f"stdout:\n{result.stdout}", file=sys.stderr) + continue + + # Parse metrics + metrics = parse_pytorch_observer_log(observer_line) + if metrics is None: + print( + f"Warning: Failed to parse metrics from run {run_num}", + file=sys.stderr, + ) + continue + + results.append(metrics) + print(f"✓ {metrics}") + + except subprocess.TimeoutExpired: + print(f"Error: Command timed out on run {run_num}", file=sys.stderr) + except Exception as e: + print(f"Error on run {run_num}: {e}", file=sys.stderr) + + return results + + +def print_summary(results: List[RunMetrics]) -> None: + """Print summary statistics.""" + if not results: + print("No valid results to summarize.") + return + + tokens_per_sec_list = [r.tokens_per_sec for r in results] + model_load_times = [r.model_load_time_ms for r in results] + inference_times = [r.inference_time_ms for r in results] + first_token_latencies = [r.first_token_latency_ms for r in results] + + print("\n" + "=" * 70) + print("BENCHMARK SUMMARY") + print("=" * 70) + print(f"Total runs: {len(results)}") + print(f"Generated tokens per run: {results[0].generated_tokens}") + print() + + print("THROUGHPUT (tokens/sec):") + print(f" Min: {min(tokens_per_sec_list):.2f} t/s") + print(f" Max: {max(tokens_per_sec_list):.2f} t/s") + print(f" Mean: {statistics.mean(tokens_per_sec_list):.2f} t/s") + if len(tokens_per_sec_list) > 1: + print(f" Stdev: {statistics.stdev(tokens_per_sec_list):.2f} t/s") + print() + + print("MODEL LOAD TIME (ms):") + print(f" Min: {min(model_load_times):.0f} ms") + print(f" Max: {max(model_load_times):.0f} ms") + print(f" Mean: {statistics.mean(model_load_times):.0f} ms") + if len(model_load_times) > 1: + print(f" Stdev: {statistics.stdev(model_load_times):.0f} ms") + print() + + print("INFERENCE TIME (ms, prompt_eval_end to inference_end):") + print(f" Min: {min(inference_times):.0f} ms") + print(f" Max: {max(inference_times):.0f} ms") + print(f" Mean: {statistics.mean(inference_times):.0f} ms") + if len(inference_times) > 1: + print(f" Stdev: {statistics.stdev(inference_times):.0f} ms") + print() + + print("FIRST TOKEN LATENCY (ms):") + print(f" Min: {min(first_token_latencies):.0f} ms") + print(f" Max: {max(first_token_latencies):.0f} ms") + print(f" Mean: {statistics.mean(first_token_latencies):.0f} ms") + if len(first_token_latencies) > 1: + print(f" Stdev: {statistics.stdev(first_token_latencies):.0f} ms") + print("=" * 70) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser( + description="Benchmark Whisper ASR runner and collect 
throughput metrics"
+    )
+    parser.add_argument(
+        "num_runs",
+        type=int,
+        nargs="?",
+        default=50,
+        help="Number of benchmark runs (default: 50)",
+    )
+    parser.add_argument(
+        "--model_dir_name",
+        type=str,
+        default="decomposed",
+        help="Name of the directory that has the model .pte and .ptd files",
+    )
+    parser.add_argument(
+        "--processor_path",
+        type=str,
+        default="~/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte",
+        help="Path to the preprocessor/processor .pte file",
+    )
+    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
+
+    args = parser.parse_args()
+
+    base_path = "~/kernel-gen/whisper-large-v3-turbo/"
+    model_dir_path = os.path.join(base_path, args.model_dir_name)
+
+    # Expand user paths
+    model_path = os.path.expanduser(model_dir_path + "/model.pte")
+    data_path = os.path.expanduser(model_dir_path + "/aoti_cuda_blob.ptd")
+    tokenizer_path = os.path.expanduser(
+        "~/kernel-gen/whisper-large-v3-turbo/decomposed"
+    )
+    audio_path = os.path.expanduser(
+        "~/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav"
+    )
+    processor_path = os.path.expanduser(args.processor_path)
+
+    # Build command
+    command = (
+        "cmake-out/examples/models/whisper/whisper_runner "
+        f"--model_path {model_path} "
+        f"--data_path {data_path} "
+        f"--tokenizer_path {tokenizer_path} "
+        f"--audio_path {audio_path} "
+        f"--processor_path {processor_path} "
+        "--model_name whisper_large_v3 "
+        "--temperature 0 "
+    )
+
+    print(f"Running Whisper benchmark {args.num_runs} times...")
+    print(f"Command: {command}\n")
+
+    # Run benchmark
+    results = run_whisper_benchmark(
+        command, num_runs=args.num_runs, verbose=args.verbose
+    )
+
+    # Print summary
+    print_summary(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/decomposed-freezing-result.txt b/decomposed-freezing-result.txt
new file mode 100644
index 00000000000..375ee7c034b
--- /dev/null
+++ b/decomposed-freezing-result.txt
@@ -0,0 +1,184 @@
+Running Whisper benchmark 50 times...
+Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 + + +[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1138ms, Inference: 292ms, First token: 14ms + +[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1162ms, Inference: 297ms, First token: 14ms + +[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 407.64 t/s, Model load: 1096ms, Inference: 314ms, First token: 14ms + +[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1076ms, Inference: 301ms, First token: 14ms + +[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd 
--tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1113ms, Inference: 304ms, First token: 14ms + +[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 400.00 t/s, Model load: 1121ms, Inference: 320ms, First token: 21ms + +[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1149ms, Inference: 308ms, First token: 16ms + +[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1093ms, Inference: 294ms, First token: 14ms + +[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1093ms, Inference: 313ms, First token: 15ms + +[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav 
--processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1105ms, Inference: 298ms, First token: 14ms + +[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1117ms, Inference: 315ms, First token: 14ms + +[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1074ms, Inference: 327ms, First token: 14ms + +[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1086ms, Inference: 294ms, First token: 14ms + +[Run 14/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1133ms, Inference: 331ms, First token: 14ms + +[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, 
Throughput: 411.58 t/s, Model load: 1120ms, Inference: 311ms, First token: 15ms + +[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 393.85 t/s, Model load: 1118ms, Inference: 325ms, First token: 18ms + +[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1008ms, Inference: 309ms, First token: 19ms + +[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1074ms, Inference: 296ms, First token: 14ms + +[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1045ms, Inference: 297ms, First token: 14ms + +[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1077ms, Inference: 296ms, First token: 14ms + +[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1095ms, Inference: 298ms, First token: 14ms + +[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 423.84 t/s, Model load: 1019ms, Inference: 302ms, First token: 14ms + +[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1269ms, Inference: 313ms, First token: 14ms + +[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1039ms, Inference: 313ms, First token: 18ms + +[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1080ms, Inference: 297ms, First token: 14ms + +[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1099ms, Inference: 292ms, First token: 14ms + +[Run 27/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1200ms, Inference: 296ms, First token: 14ms + +[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 984ms, Inference: 305ms, First token: 14ms + +[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1110ms, Inference: 305ms, First token: 15ms + +[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1063ms, Inference: 299ms, First token: 14ms + +[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed 
--audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 382.09 t/s, Model load: 1117ms, Inference: 335ms, First token: 14ms + +[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1128ms, Inference: 310ms, First token: 14ms + +[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1173ms, Inference: 299ms, First token: 14ms + +[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 426.67 t/s, Model load: 1077ms, Inference: 300ms, First token: 14ms + +[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 436.86 t/s, Model load: 1078ms, Inference: 293ms, First token: 14ms + +[Run 36/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1003ms, Inference: 297ms, First token: 16ms + +[Run 37/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1103ms, Inference: 307ms, First token: 15ms + +[Run 38/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1053ms, Inference: 313ms, First token: 15ms + +[Run 39/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1091ms, Inference: 307ms, First token: 14ms + +[Run 40/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1850ms, Inference: 331ms, First token: 14ms + +[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 384.38 
t/s, Model load: 1017ms, Inference: 333ms, First token: 14ms + +[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1090ms, Inference: 303ms, First token: 14ms + +[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1154ms, Inference: 329ms, First token: 16ms + +[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1085ms, Inference: 342ms, First token: 13ms + +[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 426.67 t/s, Model load: 1098ms, Inference: 300ms, First token: 14ms + +[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1093ms, Inference: 312ms, First token: 14ms + +[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1099ms, Inference: 304ms, First token: 14ms + +[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1113ms, Inference: 303ms, First token: 15ms + +[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1089ms, Inference: 311ms, First token: 19ms + +[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1151ms, Inference: 303ms, First token: 15ms + +====================================================================== +BENCHMARK SUMMARY +====================================================================== +Total runs: 50 +Generated tokens per run: 128 + +THROUGHPUT (tokens/sec): + Min: 374.27 t/s + Max: 438.36 t/s + Mean: 416.41 t/s + Stdev: 16.45 t/s + +MODEL LOAD TIME (ms): + Min: 984 ms + Max: 1850 ms + Mean: 1112 ms + Stdev: 117 ms + +INFERENCE TIME (ms, prompt_eval_end to inference_end): + Min: 292 ms + Max: 342 ms + Mean: 308 ms + Stdev: 13 ms + +FIRST TOKEN LATENCY (ms): + Min: 13 ms + Max: 21 ms + Mean: 15 ms + Stdev: 2 ms +====================================================================== diff --git a/triton-v2-freeze-result.txt b/triton-v2-freeze-result.txt new file mode 100644 index 00000000000..16dfd1da218 --- /dev/null +++ b/triton-v2-freeze-result.txt @@ -0,0 +1,184 @@ +Running Whisper benchmark 50 times... 
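The per-run lines above follow a fixed format, and each run's throughput is simply tokens divided by inference time (e.g. 128 tokens / 304 ms ≈ 421.05 t/s, matching the log), so the BENCHMARK SUMMARY figures can be recomputed from the raw log. A minimal sketch, assuming the log is saved under the hypothetical name results.txt and that Stdev is the sample standard deviation:

# reproduce_summary.py -- parse the "Tokens: ..." result lines and recompute the stats
import re
import statistics

PATTERN = re.compile(
    r"Tokens: (\d+), Throughput: ([\d.]+) t/s, Model load: (\d+)ms, "
    r"Inference: (\d+)ms, First token: (\d+)ms"
)

metrics = {"throughput_tps": [], "load_ms": [], "inference_ms": [], "first_token_ms": []}
with open("results.txt") as f:  # hypothetical: one of the logs above, saved to disk
    for line in f:
        m = PATTERN.search(line)
        if m:
            metrics["throughput_tps"].append(float(m.group(2)))
            metrics["load_ms"].append(int(m.group(3)))
            metrics["inference_ms"].append(int(m.group(4)))
            metrics["first_token_ms"].append(int(m.group(5)))

for name, xs in metrics.items():
    print(f"{name}: min={min(xs):.2f} max={max(xs):.2f} "
          f"mean={statistics.mean(xs):.2f} stdev={statistics.stdev(xs):.2f}")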
+Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 + + +[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 1070ms, Inference: 326ms, First token: 13ms + +[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1036ms, Inference: 315ms, First token: 14ms + +[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1043ms, Inference: 327ms, First token: 13ms + +[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1027ms, Inference: 330ms, First token: 13ms + +[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 1078ms, Inference: 345ms, First token: 14ms + +[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 987ms, Inference: 331ms, First token: 13ms + +[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1112ms, Inference: 329ms, First token: 13ms + +[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 982ms, Inference: 337ms, First token: 15ms + +[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1040ms, Inference: 331ms, First token: 13ms + +[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1048ms, Inference: 334ms, First token: 13ms + +[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 974ms, Inference: 336ms, First token: 13ms + +[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 1059ms, Inference: 341ms, First token: 13ms + +[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1010ms, Inference: 330ms, First token: 12ms + +[Run 14/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1001ms, Inference: 331ms, First token: 13ms + +[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 1001ms, 
Inference: 337ms, First token: 13ms + +[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 398.75 t/s, Model load: 1093ms, Inference: 321ms, First token: 13ms + +[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 966ms, Inference: 326ms, First token: 12ms + +[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1022ms, Inference: 327ms, First token: 13ms + +[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1020ms, Inference: 334ms, First token: 12ms + +[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 987ms, Inference: 339ms, First token: 13ms + +[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte 
--data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1115ms, Inference: 352ms, First token: 16ms + +[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1012ms, Inference: 339ms, First token: 13ms + +[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 1079ms, Inference: 346ms, First token: 13ms + +[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1067ms, Inference: 301ms, First token: 14ms + +[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1147ms, Inference: 307ms, First token: 14ms + +[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1106ms, Inference: 315ms, First token: 15ms + +[Run 27/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 396.28 t/s, Model load: 1048ms, Inference: 323ms, First token: 12ms + +[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 433.90 t/s, Model load: 1098ms, Inference: 295ms, First token: 14ms + +[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 287.64 t/s, Model load: 1091ms, Inference: 445ms, First token: 14ms + +[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 18ms + +[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name 
whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1075ms, Inference: 304ms, First token: 21ms + +[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1029ms, Inference: 310ms, First token: 14ms + +[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1177ms, Inference: 313ms, First token: 14ms + +[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1188ms, Inference: 305ms, First token: 15ms + +[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 390.24 t/s, Model load: 999ms, Inference: 328ms, First token: 13ms + +[Run 36/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 1042ms, Inference: 338ms, First token: 13ms + +[Run 37/50] Executing: 
cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 15ms + +[Run 38/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1065ms, Inference: 308ms, First token: 18ms + +[Run 39/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1173ms, Inference: 304ms, First token: 14ms + +[Run 40/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1034ms, Inference: 303ms, First token: 16ms + +[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 245.21 t/s, Model load: 1118ms, Inference: 522ms, First token: 15ms + +[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 1043ms, Inference: 336ms, First token: 16ms + +[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 999ms, Inference: 330ms, First token: 13ms + +[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1262ms, Inference: 334ms, First token: 14ms + +[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1059ms, Inference: 308ms, First token: 14ms + +[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1019ms, Inference: 304ms, First token: 14ms + +[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1015ms, Inference: 296ms, First token: 14ms + +[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1074ms, Inference: 312ms, First token: 15ms + +[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1059ms, Inference: 311ms, First token: 15ms + +[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1064ms, Inference: 307ms, First token: 14ms + +====================================================================== +BENCHMARK SUMMARY +====================================================================== +Total runs: 50 +Generated tokens per run: 128 + +THROUGHPUT (tokens/sec): + Min: 245.21 t/s + Max: 433.90 t/s + Mean: 392.57 t/s + Stdev: 31.88 t/s + +MODEL LOAD TIME (ms): + Min: 966 ms + Max: 1262 ms + Mean: 1059 ms + Stdev: 59 ms + +INFERENCE TIME (ms, prompt_eval_end to inference_end): + Min: 295 ms + Max: 522 ms + Mean: 329 ms + Stdev: 36 ms + +FIRST TOKEN LATENCY (ms): + Min: 12 ms + Max: 21 ms + Mean: 14 ms + Stdev: 2 ms +====================================================================== diff --git a/triton-v3-result.txt b/triton-v3-result.txt new file mode 100644 index 00000000000..2c426a28eee --- /dev/null +++ b/triton-v3-result.txt @@ -0,0 +1,184 @@ +Running Whisper benchmark 50 times... 
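Note that the triton-v2-freeze mean above is pulled down by two slow outliers (runs 29 and 41, at 445 ms and 522 ms inference), which is why its throughput Stdev (31.88 t/s) is roughly double that of the decomposed-freezing run (16.45 t/s). The throughput figures can be cross-checked directly from the token count and inference time; a small sketch:

# cross-check: throughput (t/s) = tokens / inference time (s)
def throughput_tps(tokens: int, inference_ms: int) -> float:
    return tokens / (inference_ms / 1000.0)

# the two slow triton-v2-freeze runs above:
print(round(throughput_tps(128, 445), 2))  # 287.64, matches run 29
print(round(throughput_tps(128, 522), 2))  # 245.21, matches run 41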
+Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 + + +[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 969ms, Inference: 347ms, First token: 13ms + +[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 922ms, Inference: 338ms, First token: 12ms + +[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 915ms, Inference: 337ms, First token: 13ms + +[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1004ms, Inference: 339ms, First token: 12ms + +[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1030ms, Inference: 348ms, First token: 13ms + +[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 988ms, Inference: 348ms, First token: 13ms + +[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 355.56 t/s, Model load: 1017ms, Inference: 360ms, First token: 14ms + +[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 992ms, Inference: 337ms, First token: 12ms + +[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 995ms, Inference: 350ms, First token: 13ms + +[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 
1014ms, Inference: 341ms, First token: 13ms + +[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 944ms, Inference: 346ms, First token: 12ms + +[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 976ms, Inference: 345ms, First token: 16ms + +[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 385.54 t/s, Model load: 1046ms, Inference: 332ms, First token: 14ms + +[Run 14/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 352.62 t/s, Model load: 1014ms, Inference: 363ms, First token: 13ms + +[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 343.16 t/s, Model load: 1084ms, Inference: 373ms, First token: 14ms + +[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 990ms, Inference: 341ms, First token: 13ms + +[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 958ms, Inference: 350ms, First token: 13ms + +[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 997ms, Inference: 349ms, First token: 15ms + +[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 361.58 t/s, Model load: 1038ms, Inference: 354ms, First token: 13ms + +[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1044ms, Inference: 350ms, First token: 12ms + +[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 358.54 t/s, Model load: 1027ms, Inference: 357ms, First token: 13ms + +[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 943ms, Inference: 346ms, First token: 13ms + +[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1005ms, Inference: 312ms, First token: 14ms + +[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1016ms, Inference: 350ms, First token: 13ms + +[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 405.06 t/s, Model load: 981ms, Inference: 316ms, First token: 14ms + +[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1007ms, Inference: 352ms, First token: 13ms + +[Run 27/50] Executing: 
cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1067ms, Inference: 342ms, First token: 13ms + +[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1100ms, Inference: 352ms, First token: 15ms + +[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 957ms, Inference: 340ms, First token: 13ms + +[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1006ms, Inference: 352ms, First token: 13ms + +[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 354.57 t/s, Model load: 1040ms, Inference: 361ms, First token: 13ms + +[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 973ms, Inference: 334ms, First token: 12ms + +[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 982ms, Inference: 349ms, First token: 12ms + +[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 958ms, Inference: 347ms, First token: 13ms + +[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1031ms, Inference: 309ms, First token: 14ms + +[Run 36/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 962ms, Inference: 349ms, First token: 13ms + +[Run 37/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name 
whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 967ms, Inference: 341ms, First token: 18ms + +[Run 38/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 977ms, Inference: 346ms, First token: 16ms + +[Run 39/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 1016ms, Inference: 353ms, First token: 18ms + +[Run 40/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1175ms, Inference: 339ms, First token: 13ms + +[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 373.18 t/s, Model load: 964ms, Inference: 343ms, First token: 18ms + +[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1074ms, Inference: 362ms, First token: 14ms + +[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 981ms, Inference: 353ms, First token: 13ms + +[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1010ms, Inference: 348ms, First token: 13ms + +[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1023ms, Inference: 339ms, First token: 12ms + +[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 275.86 t/s, Model load: 1225ms, Inference: 464ms, First token: 19ms + +[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 964ms, Inference: 340ms, First token: 12ms + +[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 979ms, Inference: 334ms, First token: 13ms + +[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1003ms, Inference: 339ms, First token: 12ms + +[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 +✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1206ms, Inference: 362ms, First token: 14ms + +====================================================================== +BENCHMARK SUMMARY +====================================================================== +Total runs: 50 +Generated tokens per run: 128 + +THROUGHPUT (tokens/sec): + Min: 275.86 t/s + Max: 414.24 t/s + Mean: 369.34 t/s + Stdev: 18.90 t/s + +MODEL LOAD TIME (ms): + Min: 915 ms + Max: 1225 ms + Mean: 1011 ms + Stdev: 62 ms + +INFERENCE TIME (ms, prompt_eval_end to inference_end): + Min: 309 ms + Max: 464 ms + Mean: 348 ms + Stdev: 21 ms + +FIRST TOKEN LATENCY (ms): + Min: 12 ms + Max: 19 ms + Mean: 14 ms + Stdev: 2 ms +====================================================================== From d56601f98aa28e4ba00b1ac04ff4e1c28ee2c77f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 17 Nov 2025 01:15:38 -0800 Subject: [PATCH 5/7] add triton kernel replacement pass --- .github/workflows/cuda.yml | 2 +- backends/aoti/common_shims.cpp | 1 - backends/cuda/TARGETS | 31 ++ backends/cuda/cuda_backend.py | 14 +- backends/cuda/tests/test_cuda_export.py | 43 +++ backends/cuda/triton/__init__.py | 17 + backends/cuda/triton/kernels/__init__.py | 11 + .../cuda/triton/kernels/sdpa.py | 244 +++++++++----- backends/cuda/triton/replacement_pass.py | 134 ++++++++ benchmarking.py | 272 --------------- custom_triton_playground/sdpa_triton.py | 317 ------------------ .../test-sdpa-with-custom-kernel.py | 151 --------- eval.sh | 52 --- examples/models/__init__.py | 2 + examples/models/toy_model/__init__.py | 2 + examples/models/toy_model/model.py | 30 ++ extension/runner_util/inputs.h | 2 +- 17 files changed, 446 insertions(+), 879 deletions(-) create mode 100644 backends/cuda/triton/__init__.py create mode 100644 backends/cuda/triton/kernels/__init__.py rename custom_triton_playground/optimized_sdpa_triton.py => backends/cuda/triton/kernels/sdpa.py (50%) create mode 100644 backends/cuda/triton/replacement_pass.py delete mode 100644
benchmarking.py delete mode 100644 custom_triton_playground/sdpa_triton.py delete mode 100644 custom_triton_playground/test-sdpa-with-custom-kernel.py delete mode 100644 eval.sh diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 7cc937fe6ca..1d237f5d8ef 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -71,7 +71,7 @@ jobs: strategy: fail-fast: false matrix: - model: [linear, add, add_mul, resnet18, conv1d] + model: [linear, add, add_mul, resnet18, conv1d, sdpa] with: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index 5fa5c5beef5..82c54617a0a 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -242,7 +242,6 @@ aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor) { return Error::Internal; } - AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( void* data_ptr, int64_t ndim, diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS index 94af87bbaed..55519060dc0 100644 --- a/backends/cuda/TARGETS +++ b/backends/cuda/TARGETS @@ -11,6 +11,7 @@ runtime.python_library( "//executorch/...", ], deps = [ + ":triton_replacement_pass", "//caffe2:torch", "//executorch/backends/aoti/passes:passes", "//executorch/exir/_serialize:lib", @@ -32,3 +33,33 @@ runtime.python_library( "//executorch/backends/aoti:aoti_partitioner", ], ) + +runtime.python_library( + name = "triton_kernels", + srcs = [ + "triton/kernels/__init__.py", + "triton/kernels/sdpa.py", + ], + visibility = [ + "//executorch/backends/cuda/...", + ], + deps = [ + "//caffe2:torch", + ], +) + +runtime.python_library( + name = "triton_replacement_pass", + srcs = [ + "triton/__init__.py", + "triton/replacement_pass.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + ":triton_kernels", + "//caffe2:torch", + "//executorch/exir/dialects:lib", + ], +) diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py index 487b0d64c1d..3506cb685fe 100644 --- a/backends/cuda/cuda_backend.py +++ b/backends/cuda/cuda_backend.py @@ -16,6 +16,10 @@ from executorch.backends.aoti.passes.replace_view_copy_with_view import ( ReplaceViewCopyWithViewPass, ) + +from executorch.backends.cuda.triton.replacement_pass import ( + ReplaceEdgeOpWithTritonOpPass, +) from executorch.exir._serialize._named_data_store import NamedDataStore from executorch.exir._warnings import experimental from executorch.exir.backend.backend_details import ( @@ -27,7 +31,7 @@ from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu from torch._inductor.decomposition import conv1d_to_conv2d from torch.export.passes import move_to_device_pass -from torch.nn.attention import SDPBackend + cuda_decomposition_table = { torch.ops.aten.conv1d.default: conv1d_to_conv2d, @@ -127,6 +131,9 @@ def preprocess( # noqa: C901 # replace slice_copy.Tensor with slice.Tensor, select_copy.int with select.int ReplaceViewCopyWithViewPass()(cuda_edge_program.graph_module) + # Replace ATen ops with Triton ops + ReplaceEdgeOpWithTritonOpPass()(cuda_edge_program.graph_module) + cuda_edge_program = cuda_edge_program.run_decompositions( cuda_decomposition_table ) @@ -141,8 +148,9 @@ def preprocess( # noqa: C901 user_input_placeholders.append(node.meta["val"]) options: dict[str, typing.Any] = { - # Frozen weight during inference for better performance and more optimization like kernel fusion - "freezing": True, + # Disable this to support SDPA decomposition + # TODO(gasoonjia): remove it after pin
bump to latest pytorch + "loop_ordering_after_fusion": False, # Better model precision "emulate_precision_casts": True, # Embed CUDA kernel binaries directly into the compiled shared object diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py index ef43a3ab3cb..8dcdbe09083 100644 --- a/backends/cuda/tests/test_cuda_export.py +++ b/backends/cuda/tests/test_cuda_export.py @@ -270,3 +270,46 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Test export edge_program_manager = self._export_to_cuda_with_lower(module, inputs) self.assertIsNotNone(edge_program_manager, "Conv1d operation export failed") + + def test_sdpa_single_kernel(self): + """ + Test CUDA export for model containing single SDPA kernel. + + SDPA: Scaled Dot Product Attention + """ + + class SDPAModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, query, key, value): + out = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + ) + return out + + module = SDPAModule() + module.eval() + + # Create input tensors (batch, num_heads, seq_len, head_dim) + batch_size = 2 + num_heads = 8 + seq_len = 128 + head_dim = 64 + + query = torch.randn(batch_size, num_heads, seq_len, head_dim) + key = torch.randn(batch_size, num_heads, seq_len, head_dim) + value = torch.randn(batch_size, num_heads, seq_len, head_dim) + inputs = (query, key, value) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone( + edge_program_manager, + "SDPA single kernel operation export failed", + ) diff --git a/backends/cuda/triton/__init__.py b/backends/cuda/triton/__init__.py new file mode 100644 index 00000000000..4b9c36249ac --- /dev/null +++ b/backends/cuda/triton/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Import all kernels to ensure @triton_op decorators are executed +# and ops are registered to torch.ops.triton namespace +from executorch.backends.cuda.triton import kernels # noqa: F401 + +from executorch.backends.cuda.triton.replacement_pass import ( + ReplaceEdgeOpWithTritonOpPass, +) + +__all__ = [ + "ReplaceEdgeOpWithTritonOpPass", +] diff --git a/backends/cuda/triton/kernels/__init__.py b/backends/cuda/triton/kernels/__init__.py new file mode 100644 index 00000000000..5bd582679c4 --- /dev/null +++ b/backends/cuda/triton/kernels/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.cuda.triton.kernels.sdpa import sdpa + +__all__ = [ + "sdpa", +] diff --git a/custom_triton_playground/optimized_sdpa_triton.py b/backends/cuda/triton/kernels/sdpa.py similarity index 50% rename from custom_triton_playground/optimized_sdpa_triton.py rename to backends/cuda/triton/kernels/sdpa.py index df267202d29..e0996ce5292 100644 --- a/custom_triton_playground/optimized_sdpa_triton.py +++ b/backends/cuda/triton/kernels/sdpa.py @@ -1,5 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Optimized Triton SDPA Kernel for ExecuTorch CUDA Backend. + +This module provides a Triton-optimized implementation of scaled dot-product attention +that can replace the default ATen SDPA operator during graph transformation. +""" + import math -from typing import Any, Optional +from typing import Optional import torch import triton @@ -7,9 +20,50 @@ from torch.library import triton_op, wrap_triton +def _validate_qkv_shapes( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, +) -> tuple[int, int, int, int, int, int]: + """ + Validate Q/K/V dimensions and return shape info. + Args: + query: Query tensor [B, H, L_q, D] + key: Key tensor [B, H, L_kv, D] + value: Value tensor [B, H, L_kv, D] + Returns: + Tuple of (B, H, L_q, L_kv, D_q, D_kv) + Raises: + RuntimeError: If dimensions are incompatible + """ + B_q, H_q, L_q, D_q = query.shape + B_k, H_k, L_kv_k, D_k = key.shape + B_v, H_v, L_kv_v, D_v = value.shape + # Validate batch and head-count dimensions + if not (B_q == B_k == B_v): + raise RuntimeError( + f"Batch dimension must match; got B_q={B_q}, B_k={B_k}, B_v={B_v}." + ) + + if not (H_q == H_k == H_v): + raise RuntimeError( + f"Number of heads must match; got H_q={H_q}, H_k={H_k}, H_v={H_v}." + ) + # Head dimension must match + if not (D_q == D_k == D_v): + raise RuntimeError( + f"Head dimension must match across Q, K, V; got D_q={D_q}, D_k={D_k}, D_v={D_v}." + ) + # Key and Value sequence lengths must match + if L_kv_k != L_kv_v: + raise RuntimeError( + f"Key and Value must have the same sequence length; got L_k={L_kv_k}, L_v={L_kv_v}." + ) + return B_q, H_q, L_q, L_kv_k, D_q, D_k + + @triton.autotune( configs=[ - # Favor configs tuned for HEAD_DIM=64 and L up to ~1500 triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_stages=4, num_warps=8), triton.Config({"BLOCK_M": 128, "BLOCK_N": 256}, num_stages=4, num_warps=8), triton.Config({"BLOCK_M": 64, "BLOCK_N": 256}, num_stages=4, num_warps=4), @@ -17,17 +71,19 @@ triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_stages=3, num_warps=4), triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=3, num_warps=4), ], - key=["L", "HEAD_DIM"], + key=["L_Q", "L_KV", "HEAD_DIM"], ) @triton.jit def _sdpa_fwd_kernel( q_ptr, k_ptr, v_ptr, + mask_ptr, o_ptr, B, H, - L, + L_Q, # Query sequence length + L_KV, # Key/Value sequence length HEAD_DIM, stride_qb, stride_qh, @@ -41,104 +97,118 @@ def _sdpa_fwd_kernel( stride_vh, stride_vl, stride_vd, + stride_mb, + stride_mh, + stride_ml, + stride_mn, stride_ob, stride_oh, stride_ol, stride_od, sm_scale, + IS_CAUSAL: tl.constexpr, + HAS_MASK: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, HEAD_DIM_CE: tl.constexpr, ): + """ + Fused SDPA kernel that handles different sequence lengths for Q and K/V.
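+
+    Uses an online (streaming) softmax: a running row max (m_i) and running
+    normalizer (l_i) are rescaled after each BLOCK_N tile of K/V, so the full
+    [L_Q, L_KV] score matrix is never materialized. Scores are pre-scaled by
+    log2(e) so tl.math.exp2 can be used instead of exp.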
+ + Q shape: [B, H, L_Q, D] + K/V shape: [B, H, L_KV, D] + Output shape: [B, H, L_Q, D] + """ # Program IDs pid_m = tl.program_id(axis=0) # along query length pid_hz = tl.program_id(axis=1) # flattened batch*head - off_b = pid_hz // H off_h = pid_hz % H - - # Compute ranges + # Compute ranges for queries start_m = pid_m * BLOCK_M offs_m = start_m + tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_DIM_CE) - mask_m = offs_m < L - + mask_m = offs_m < L_Q # Mask based on query length # Base pointers for this (b, h) q_base = q_ptr + off_b * stride_qb + off_h * stride_qh k_base = k_ptr + off_b * stride_kb + off_h * stride_kh v_base = v_ptr + off_b * stride_vb + off_h * stride_vh o_base = o_ptr + off_b * stride_ob + off_h * stride_oh - + # Mask base pointer (if provided) + if HAS_MASK: + mask_base = mask_ptr + off_b * stride_mb + off_h * stride_mh # Make head-dim addresses compiler-friendly offs_d_ctg = tl.max_contiguous(tl.multiple_of(offs_d, 16), HEAD_DIM_CE) - # Load Q tile [BLOCK_M, HEAD_DIM] - coalesced along HEAD_DIM q_ptrs = q_base + (offs_m[:, None] * stride_ql + offs_d_ctg[None, :] * stride_qd) q = tl.load(q_ptrs, mask=mask_m[:, None], other=0.0) q = q.to(tl.bfloat16) - # Initialize accumulators and softmax stats acc = tl.zeros((BLOCK_M, HEAD_DIM_CE), dtype=tl.float32) m_i = tl.full((BLOCK_M,), -float("inf"), dtype=tl.float32) l_i = tl.zeros((BLOCK_M,), dtype=tl.float32) - # Convert to base-2 scale for exp2 qk_scale = sm_scale * 1.4426950408889634 - - # Loop over keys/values along sequence length in tiles of BLOCK_N - # Load K as [BLOCK_N, HEAD_DIM] for coalesced reads, then use tl.trans(K) in dot - for start_n in tl.range(0, L, BLOCK_N): + # Loop over keys/values along L_KV dimension (not L_Q!) + for start_n in tl.range(0, L_KV, BLOCK_N): offs_n = start_n + tl.arange(0, BLOCK_N) - mask_n = offs_n < L - + mask_n = offs_n < L_KV # Mask based on key/value length # Load K tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM) k_ptrs = k_base + ( offs_n[:, None] * stride_kl + offs_d_ctg[None, :] * stride_kd ) k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0) k = k.to(tl.bfloat16) - # Compute attention logits [BLOCK_M, BLOCK_N] = Q[BM,D] @ K[BN,D]^T - qk = tl.dot(q, tl.trans(k)).to(tl.float32) # accumulator in fp32 + qk = tl.dot(q, tl.trans(k)).to(tl.float32) qk = qk * qk_scale - - # Apply OOB masks for both rows and cols to keep stability + # Apply causal mask if needed + # For causal masking with different lengths: position i can attend to position j if i >= j + if IS_CAUSAL: + causal_mask = offs_m[:, None] >= offs_n[None, :] + qk = tl.where(causal_mask, qk, -float("inf")) + # Apply attention mask if provided + if HAS_MASK: + # Load mask tile [BLOCK_M, BLOCK_N] + # Mask shape should be [B, H, L_Q, L_KV] + mask_ptrs = mask_base + ( + offs_m[:, None] * stride_ml + offs_n[None, :] * stride_mn + ) + attn_mask = tl.load( + mask_ptrs, mask=mask_m[:, None] & mask_n[None, :], other=0.0 + ) + # Convert boolean mask to additive mask (-inf for False, 0 for True) + qk = tl.where(attn_mask, qk, -float("inf")) + # Apply OOB masks for both rows and cols qk = tl.where(mask_n[None, :], qk, -float("inf")) qk = tl.where(mask_m[:, None], qk, -float("inf")) - # Online softmax m_ij = tl.maximum(m_i, tl.max(qk, 1)) p = tl.math.exp2(qk - m_ij[:, None]) l_ij = tl.sum(p, 1) alpha = tl.math.exp2(m_i - m_ij) - # Load V tile [BLOCK_N, HEAD_DIM] (contiguous along HEAD_DIM) v_ptrs = v_base + ( offs_n[:, None] * stride_vl + offs_d_ctg[None, :] * stride_vd ) v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0) v = 
v.to(tl.bfloat16) - # Update accumulator acc = acc * alpha[:, None] - # Cast p to bf16 to use tensor-cores in tl.dot; accumulate in fp32 p_bf16 = p.to(tl.bfloat16) acc = tl.dot(p_bf16, v, acc) - # Update softmax stats l_i = l_i * alpha + l_ij m_i = m_ij - # Normalize accumulator by softmax denominator acc = acc / l_i[:, None] - - # Store output [BLOCK_M, HEAD_DIM] + # Store output [BLOCK_M, HEAD_DIM] - shape matches query o_ptrs = o_base + (offs_m[:, None] * stride_ol + offs_d_ctg[None, :] * stride_od) tl.store(o_ptrs, acc.to(tl.bfloat16), mask=mask_m[:, None]) -@triton_op("custom::optimized_triton_scaled_dot_product_attention", mutates_args={}) -def optimized_triton_scaled_dot_product_attention( +@triton_op("triton::sdpa", mutates_args={}) +def sdpa( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -149,21 +219,24 @@ def optimized_triton_scaled_dot_product_attention( enable_gqa: bool = False, ) -> torch.Tensor: """ - Triton fused Scaled Dot-Product Attention (forward, no causal, no dropout). - Expected shapes (tested): [B=1, H=20, L<=1500, D=64], dtype bfloat16. + Triton fused Scaled Dot-Product Attention with support for different sequence lengths. + Supports different sequence lengths for query and key/value: + - Query: [B, H, L_q, D] + - Key: [B, H, L_kv, D] + - Value: [B, H, L_kv, D] + - Output: [B, H, L_q, D] (matches query shape) Args: - query: Query tensor [B, H, L, D] - key: Key tensor [B, H, L, D] - value: Value tensor [B, H, L, D] - attn_mask: must be None (not supported) + query: Query tensor [B, H, L_q, D] + key: Key tensor [B, H, L_kv, D] + value: Value tensor [B, H, L_kv, D] + attn_mask: Optional attention mask [B, H, L_q, L_kv] or broadcastable shape dropout_p: must be 0.0 (not supported) - is_causal: must be False (not supported) - scale: must be 0.0 (not supported) + is_causal: whether to apply causal masking + scale: attention scale (default: 1/sqrt(d)) enable_gqa: must be False (not supported) - Returns: - Output tensor [B, H, L, D] + Output tensor [B, H, L_q, D] """ # Validate inputs if not (query.is_cuda and key.is_cuda and value.is_cuda): @@ -174,40 +247,25 @@ def optimized_triton_scaled_dot_product_attention( or value.dtype != torch.bfloat16 ): raise RuntimeError("Expected bfloat16 inputs") - if query.shape != key.shape or query.shape != value.shape: - raise RuntimeError( - f"Q, K, V must have identical shapes; got query={query.shape}, key={key.shape}, value={value.shape}." - ) - if query.dim() != 4: - raise RuntimeError( - f"Expected 4D tensors shaped [B, H, L, D]; got {query.dim()}D." - ) - - # Enforce that only default values are accepted for these arguments - if attn_mask is not None: + if query.dim() != 4 or key.dim() != 4 or value.dim() != 4: raise RuntimeError( - "attn_mask must be None (not supported in this implementation)." + f"Expected 4D tensors shaped [B, H, L, D]; got query.dim()={query.dim()}, key.dim()={key.dim()}, value.dim()={value.dim()}." ) + # Enforce unsupported features if dropout_p != 0.0: raise RuntimeError( "dropout_p must be 0.0 (not supported in this implementation)." ) - if is_causal is not False: - raise RuntimeError( - "is_causal must be False (not supported in this implementation)." - ) - if scale != 0.0: - raise RuntimeError("scale must be 0.0 (not supported in this implementation).") if enable_gqa is not False: raise RuntimeError( "enable_gqa must be False (not supported in this implementation)." 
) - - B, H, L, D = query.shape - # Allocate output + # Validate and get dimensions + B, H, L_q, L_kv, D_q, D_kv = _validate_qkv_shapes(query, key, value) + D = D_q # Head dimension + # Allocate output with query shape out = torch.empty_like(query) - - # Element-wise strides (in elements) + # Element-wise strides sqb, sqh, sql, sqd = query.stride() skb, skh, skl, skd = key.stride() svb, svh, svl, svd = value.stride() @@ -216,23 +274,44 @@ def grid(META): return ( - triton.cdiv(L, META["BLOCK_M"]), + triton.cdiv(L_q, META["BLOCK_M"]), # Based on query length B * H, ) # Scale factor for SDPA - sm_scale = 1.0 / math.sqrt(D) - - # Launch kernel using wrap_triton to avoid tracing issues during export/compile - # Note: wrap_triton returns a callable that can be indexed with grid + sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale + # Handle attention mask + has_mask = attn_mask is not None + if has_mask: + # Expand mask to [B, H, L_q, L_kv] if needed + if attn_mask.dim() == 2: + # [L_q, L_kv] -> [B, H, L_q, L_kv] + attn_mask = attn_mask.unsqueeze(0).unsqueeze(0).expand(B, H, -1, -1) + elif attn_mask.dim() == 3: + # [B, L_q, L_kv] -> [B, H, L_q, L_kv] + attn_mask = attn_mask.unsqueeze(1).expand(-1, H, -1, -1) + + # Validate mask shape + if attn_mask.shape != (B, H, L_q, L_kv): + # Try to expand if broadcastable + attn_mask = attn_mask.expand(B, H, L_q, L_kv) + + smb, smh, sml, smn = attn_mask.stride() + else: + # Dummy strides and mask + smb, smh, sml, smn = 0, 0, 0, 0 + attn_mask = torch.empty(0, dtype=torch.bool, device=query.device) + # Launch kernel wrap_triton(_sdpa_fwd_kernel)[grid]( query, key, value, + attn_mask, out, B, H, - L, + L_q, # Query sequence length + L_kv, # Key/Value sequence length D, sqb, sqh, @@ -246,21 +325,26 @@ def grid(META): svh, svl, svd, + smb, + smh, + sml, + smn, sob, soh, sol, sod, sm_scale, + IS_CAUSAL=is_causal, + HAS_MASK=has_mask, HEAD_DIM_CE=D, ) - return out # Register the abstract/fake implementation for torch.export # This is critical to avoid accessing real tensor data during export -@optimized_triton_scaled_dot_product_attention.register_fake -def _optimized_triton_sdpa_abstract( +@sdpa.register_fake +def _sdpa_abstract( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -273,12 +357,10 @@ def _optimized_triton_sdpa_abstract( """ Abstract/fake implementation for torch.export. This just returns an empty tensor with the correct shape/dtype/device. - No actual computation happens here - this is only for shape inference during export. """ - # Validate shapes match - assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" + # Validate dtypes match assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" + # Validate QKV shapes and get the output shape + B, H, L_q, _, D_q, _ = _validate_qkv_shapes(query, key, value) - # Output has the same shape and dtype as query - # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation - return torch.empty_like(query, dtype=query.dtype, device=query.device) + return torch.empty(B, H, L_q, D_q, dtype=query.dtype, device=query.device) diff --git a/backends/cuda/triton/replacement_pass.py b/backends/cuda/triton/replacement_pass.py new file mode 100644 index 00000000000..08e1fc7dc86 --- /dev/null +++ b/backends/cuda/triton/replacement_pass.py @@ -0,0 +1,134 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved.
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Graph Transformation Pass for Triton Kernel Replacement. + +This pass replaces edge dialect operators with optimized Triton kernels in the graph. +""" + +import logging + +import torch +from executorch.exir.dialects._ops import ops as exir_ops + +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassBase, PassResult + +logger = logging.getLogger(__name__) +triton = torch.ops.triton + +# Global mapping from edge dialect operators to Triton kernel functions +EDGE_TO_TRITON_KERNELS = { + exir_ops.edge.aten.scaled_dot_product_attention.default: triton.sdpa, +} + + +class ReplaceEdgeOpWithTritonOpPass(PassBase): + """ + Pass to replace edge dialect operators with Triton kernels. + + This pass scans the graph for edge dialect operators that have registered + Triton replacements and replaces them with the optimized Triton implementations. + + Replacements are looked up in the module-level EDGE_TO_TRITON_KERNELS mapping. + """ + + def __init__(self): + """Initialize the pass.""" + super().__init__() + self._replacement_count = 0 + + def call(self, graph_module: GraphModule) -> PassResult: + """ + Execute the pass on the graph module. + + Args: + graph_module: The graph module to transform + + Returns: + PassResult indicating success/failure and the modified graph module + """ + self._replacement_count = 0 + modified = False + + if not EDGE_TO_TRITON_KERNELS: + return PassResult(graph_module, False) + + # Iterate through all nodes in the graph + for node in graph_module.graph.nodes: + if self._should_replace_node(node): + try: + self._replace_node_with_triton(graph_module, node) + modified = True + self._replacement_count += 1 + except Exception as e: + logger.warning(f"Failed to replace node {node.name}: {e}") + # Continue with other replacements even if one fails + + if modified: + # Recompile the graph module after modifications + graph_module.recompile() + + logger.info(f"Replaced {self._replacement_count} nodes with Triton kernels") + + return PassResult(graph_module, modified) + + def _should_replace_node(self, node: Node) -> bool: + """ + Check if a node should be replaced with a Triton kernel. + + Args: + node: The node to check + + Returns: + True if the node should be replaced + """ + # Only consider call_function nodes + if node.op != "call_function": + return False + + logger.debug(f"Checking node: {node.target}") + + return node.target in EDGE_TO_TRITON_KERNELS + + def _replace_node_with_triton(self, graph_module: GraphModule, node: Node) -> None: + """ + Replace an edge dialect node with a Triton kernel call.
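+
+        The replacement node is created via graph.inserting_before(node) and
+        calls the Triton custom op registered with @triton_op; it copies the
+        original node's meta (including its shape/dtype metadata) and takes
+        over all uses of the old node before the old node is erased.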
+    def _should_replace_node(self, node: Node) -> bool:
+        """
+        Check if a node should be replaced with a Triton kernel.
+
+        Args:
+            node: The node to check
+
+        Returns:
+            True if the node should be replaced
+        """
+        # Only consider call_function nodes
+        if node.op != "call_function":
+            return False
+
+        logger.debug("Checking node target: %s", node.target)
+
+        return node.target in EDGE_TO_TRITON_KERNELS
+
+    def _replace_node_with_triton(self, graph_module: GraphModule, node: Node) -> None:
+        """
+        Replace an edge dialect node with a Triton kernel call.
+
+        Args:
+            graph_module: The graph module containing the node
+            node: The node to replace
+        """
+        # Get the target operator (should be an exir_ops edge dialect op)
+        target = node.target
+
+        # Get the replacement kernel
+        if target not in EDGE_TO_TRITON_KERNELS:
+            raise ValueError(f"No replacement kernel found for {target}")
+
+        triton_kernel_fn = EDGE_TO_TRITON_KERNELS[target]
+
+        # Create a new node with the Triton kernel
+        with graph_module.graph.inserting_before(node):
+            # The triton_kernel_fn is already registered as a custom op via @triton_op
+            # We can call it directly
+            new_node = graph_module.graph.call_function(
+                triton_kernel_fn,
+                args=node.args,
+                kwargs=node.kwargs,
+            )
+
+        # Copy metadata from original node
+        new_node.meta = node.meta.copy()
+
+        # Replace all uses of the old node with the new node
+        node.replace_all_uses_with(new_node)
+
+        # Remove the old node
+        graph_module.graph.erase_node(node)
diff --git a/benchmarking.py b/benchmarking.py
deleted file mode 100644
index d309c25bf9b..00000000000
--- a/benchmarking.py
+++ /dev/null
@@ -1,272 +0,0 @@
-#!/usr/bin/env python3
-"""
-Benchmark script for Whisper ASR runner.
-Runs the whisper_runner command multiple times and collects throughput metrics.
-"""
-import argparse
-import json
-import os
-import statistics
-import subprocess
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional
-
-
-@dataclass
-class RunMetrics:
-    """Metrics from a single run."""
-
-    generated_tokens: int
-    tokens_per_sec: float
-    model_load_time_ms: float
-    inference_time_ms: float
-    prompt_eval_to_end_ms: float
-    first_token_latency_ms: float
-
-    def __repr__(self):
-        return (
-            f"Tokens: {self.generated_tokens}, "
-            f"Throughput: {self.tokens_per_sec:.2f} t/s, "
-            f"Model load: {self.model_load_time_ms:.0f}ms, "
-            f"Inference: {self.inference_time_ms:.0f}ms, "
-            f"First token: {self.first_token_latency_ms:.0f}ms"
-        )
-
-
-def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
-    """Parse PyTorchObserver JSON output and compute metrics."""
-    try:
-        # Find the JSON part in the log line
-        if "PyTorchObserver" not in log_line:
-            return None
-
-        json_str = log_line.split("PyTorchObserver")[1].strip()
-        data = json.loads(json_str)
-
-        # Extract values
-        generated_tokens = data.get("generated_tokens", 0)
-        inference_end_ms = data.get("inference_end_ms", 0)
-        prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
-        first_token_ms = data.get("first_token_ms", 0)
-        model_load_start_ms = data.get("model_load_start_ms", 0)
-        model_load_end_ms = data.get("model_load_end_ms", 0)
-
-        # Compute metrics
-        prompt_eval_to_end_ms = inference_end_ms - prompt_eval_end_ms
-        tokens_per_sec = (
-            (generated_tokens / prompt_eval_to_end_ms * 1000)
-            if prompt_eval_to_end_ms > 0
-            else 0
-        )
-        model_load_time_ms = model_load_end_ms - model_load_start_ms
-        inference_time_ms = inference_end_ms - prompt_eval_end_ms
-        first_token_latency_ms = first_token_ms - prompt_eval_end_ms
-
-        return RunMetrics(
-            generated_tokens=generated_tokens,
-            tokens_per_sec=tokens_per_sec,
-            model_load_time_ms=model_load_time_ms,
-            inference_time_ms=inference_time_ms,
-            prompt_eval_to_end_ms=prompt_eval_to_end_ms,
-            first_token_latency_ms=first_token_latency_ms,
-        )
-    except (json.JSONDecodeError, KeyError, ValueError) as e:
-        print(f"Error parsing PyTorchObserver log: {e}", file=sys.stderr)
-        return None
-
-
-def run_whisper_benchmark(
-    command: str, num_runs:
int = 5, verbose: bool = False -) -> List[RunMetrics]: - """ - Run the whisper_runner command multiple times and collect metrics. - - Args: - command: Full command to run - num_runs: Number of times to run the command - verbose: Print detailed output - - Returns: - List of RunMetrics from each run - """ - results = [] - - for run_num in range(1, num_runs + 1): - print(f"\n[Run {run_num}/{num_runs}] Executing: {command}") - - try: - # Run command and capture output - result = subprocess.run( - command, - shell=True, - capture_output=True, - text=True, - timeout=300, # 5 minute timeout - ) - - if result.returncode != 0: - print( - f"Error: Command failed with return code {result.returncode}", - file=sys.stderr, - ) - if result.stderr: - print(f"stderr: {result.stderr}", file=sys.stderr) - continue - - # Search for PyTorchObserver line in output - observer_line = None - for line in result.stdout.split("\n"): - if "PyTorchObserver" in line: - observer_line = line - break - - if observer_line is None: - print( - f"Warning: No PyTorchObserver output found in run {run_num}", - file=sys.stderr, - ) - if verbose: - print(f"stdout:\n{result.stdout}", file=sys.stderr) - continue - - # Parse metrics - metrics = parse_pytorch_observer_log(observer_line) - if metrics is None: - print( - f"Warning: Failed to parse metrics from run {run_num}", - file=sys.stderr, - ) - continue - - results.append(metrics) - print(f"✓ {metrics}") - - except subprocess.TimeoutExpired: - print(f"Error: Command timed out on run {run_num}", file=sys.stderr) - except Exception as e: - print(f"Error on run {run_num}: {e}", file=sys.stderr) - - return results - - -def print_summary(results: List[RunMetrics]) -> None: - """Print summary statistics.""" - if not results: - print("No valid results to summarize.") - return - - tokens_per_sec_list = [r.tokens_per_sec for r in results] - model_load_times = [r.model_load_time_ms for r in results] - inference_times = [r.inference_time_ms for r in results] - first_token_latencies = [r.first_token_latency_ms for r in results] - - print("\n" + "=" * 70) - print("BENCHMARK SUMMARY") - print("=" * 70) - print(f"Total runs: {len(results)}") - print(f"Generated tokens per run: {results[0].generated_tokens}") - print() - - print("THROUGHPUT (tokens/sec):") - print(f" Min: {min(tokens_per_sec_list):.2f} t/s") - print(f" Max: {max(tokens_per_sec_list):.2f} t/s") - print(f" Mean: {statistics.mean(tokens_per_sec_list):.2f} t/s") - if len(tokens_per_sec_list) > 1: - print(f" Stdev: {statistics.stdev(tokens_per_sec_list):.2f} t/s") - print() - - print("MODEL LOAD TIME (ms):") - print(f" Min: {min(model_load_times):.0f} ms") - print(f" Max: {max(model_load_times):.0f} ms") - print(f" Mean: {statistics.mean(model_load_times):.0f} ms") - if len(model_load_times) > 1: - print(f" Stdev: {statistics.stdev(model_load_times):.0f} ms") - print() - - print("INFERENCE TIME (ms, prompt_eval_end to inference_end):") - print(f" Min: {min(inference_times):.0f} ms") - print(f" Max: {max(inference_times):.0f} ms") - print(f" Mean: {statistics.mean(inference_times):.0f} ms") - if len(inference_times) > 1: - print(f" Stdev: {statistics.stdev(inference_times):.0f} ms") - print() - - print("FIRST TOKEN LATENCY (ms):") - print(f" Min: {min(first_token_latencies):.0f} ms") - print(f" Max: {max(first_token_latencies):.0f} ms") - print(f" Mean: {statistics.mean(first_token_latencies):.0f} ms") - if len(first_token_latencies) > 1: - print(f" Stdev: {statistics.stdev(first_token_latencies):.0f} ms") - print("=" * 70) - - -def 
main():
-    # Parse command-line arguments
-    parser = argparse.ArgumentParser(
-        description="Benchmark Whisper ASR runner and collect throughput metrics"
-    )
-    parser.add_argument(
-        "num_runs",
-        type=int,
-        nargs="?",
-        default=50,
-        help="Number of benchmark runs (default: 50)",
-    )
-    parser.add_argument(
-        "--model_dir_name",
-        type=str,
-        default="decomposed",
-        help="Path to the directory that has model .pte and .ptd files",
-    )
-    parser.add_argument(
-        "--processor_path",
-        type=str,
-        default="~/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte",
-        help="Path to the preprocessor/processor .pte file",
-    )
-    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
-
-    args = parser.parse_args()
-
-    base_path = "~/kernel-gen/whisper-large-v3-turbo/"
-    model_dir_path = os.path.join(base_path, args.model_dir_name)
-
-    # Expand user paths
-    model_path = os.path.expanduser(model_dir_path + "/model.pte")
-    data_path = os.path.expanduser(model_dir_path + "/aoti_cuda_blob.ptd")
-    tokenizer_path = os.path.expanduser(
-        "~/kernel-gen/whisper-large-v3-turbo/decomposed"
-    )
-    audio_path = os.path.expanduser(
-        "~/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav"
-    )
-    processor_path = os.path.expanduser(args.processor_path)
-
-    # Build command
-    command = (
-        "cmake-out/examples/models/whisper/whisper_runner "
-        f"--model_path {model_path} "
-        f"--data_path {data_path} "
-        f"--tokenizer_path {tokenizer_path} "
-        f"--audio_path {audio_path} "
-        f"--processor_path {processor_path} "
-        "--model_name whisper_large_v3 "
-        "--temperature 0 "
-    )
-
-    print(f"Running Whisper benchmark {args.num_runs} times...")
-    print(f"Command: {command}\n")
-
-    # Run benchmark
-    results = run_whisper_benchmark(
-        command, num_runs=args.num_runs, verbose=args.verbose
-    )
-
-    # Print summary
-    print_summary(results)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/custom_triton_playground/sdpa_triton.py b/custom_triton_playground/sdpa_triton.py
deleted file mode 100644
index 09233e557bd..00000000000
--- a/custom_triton_playground/sdpa_triton.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# kernel.py
-import math
-from typing import Any, Optional
-
-import torch
-import triton
-import triton.language as tl
-from torch.library import triton_op, wrap_triton
-
-
-"""
-Fused Scaled Dot-Product Attention (SDPA) implemented in a single Triton kernel.
-
-This module provides a transparent replacement for torch.nn.functional.scaled_dot_product_attention
-using a custom Triton kernel. The replacement is automatic - no model code changes needed!
-
-How it works:
-1. We register a custom implementation using torch.library
-2. When torch.nn.functional.scaled_dot_product_attention is called,
-   PyTorch's dispatch mechanism routes it to our implementation during AOTI compilation
-3. The model code remains unchanged
-
-What is fused:
-- We fuse QK^T matmul, numerically-stable online softmax, and the final
-  multiplication by V into one streaming kernel. No intermediate attention
-  matrix is materialized in memory.
-
-Design notes:
-- We tile along the query (sequence) dimension with BLOCK_M rows and iterate
-  over the key/value sequence dimension in BLOCK_N columns.
-- For each (batch, head) pair and query tile, we:
-  * Load a tile of Q once and keep it in registers.
-  * Stream over K/V in blocks: compute qk = Q @ K^T, update running row-wise
-    softmax statistics (m_i, l_i) and the output accumulator acc = sum(p * V)
-    using the "online softmax" algorithm:
-      m_new = max(m_old, max(qk))
-      p = exp(qk - m_new)
-      acc = acc * exp(m_old - m_new) + p @ V
-      l_new = l_old * exp(m_old - m_new) + sum(p)
-      m_old = m_new
-  * Finally, write O = acc / l_i.
-- All accumulation is done in fp32 for numerical stability; inputs/outputs are fp16.
-- Boundary conditions are handled with masks.
-- The Python wrapper only validates inputs, allocates outputs, configures the grid,
-  and launches the Triton kernel. All math is inside the Triton kernel.
-
-Runtime constraints respected:
-- No torch.nn or torch.nn.functional is used in the execution path.
-- No PyTorch compute ops are used to implement the algorithm; all math happens
-  in Triton via tl.load/tl.store/tl.dot/tl.exp/tl.max/tl.sum.
-"""
-
-
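A minimal plain-PyTorch sketch of the online-softmax recurrence described in the docstring above; the helper name online_softmax_attention and the toy block size are assumptions, and this is illustrative only, not part of the patch. It checks that streaming over K/V blocks while rescaling the running statistics reproduces ordinary softmax attention:

import math
import torch

def online_softmax_attention(q, k, v, block_n=4):
    # q: [M, D]; k, v: [N, D]. fp32 reference of the blockwise (m_i, l_i, acc) recurrence.
    M, D = q.shape
    scale = 1.0 / math.sqrt(D)
    m_i = torch.full((M,), float("-inf"))   # running row-wise max
    l_i = torch.zeros(M)                    # running row-wise sum of exp
    acc = torch.zeros(M, D)                 # running output accumulator
    for start in range(0, k.shape[0], block_n):
        kb, vb = k[start:start + block_n], v[start:start + block_n]
        qk = (q @ kb.T) * scale                          # [M, block_n]
        m_new = torch.maximum(m_i, qk.max(dim=1).values)
        p = torch.exp(qk - m_new[:, None])
        alpha = torch.exp(m_i - m_new)                   # rescale old stats to new max
        l_i = l_i * alpha + p.sum(dim=1)
        acc = acc * alpha[:, None] + p @ vb
        m_i = m_new
    return acc / l_i[:, None]

q, k, v = (torch.randn(8, 16) for _ in range(3))
ref = torch.softmax((q @ k.T) / math.sqrt(16.0), dim=-1) @ v
assert torch.allclose(online_softmax_attention(q, k, v), ref, atol=1e-5)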
-@triton.jit
-def _sdpa_fwd_kernel(
-    q_ptr,
-    k_ptr,
-    v_ptr,
-    o_ptr,
-    B,
-    H,
-    S,
-    D,  # shapes
-    stride_qb,
-    stride_qh,
-    stride_qs,
-    stride_qd,
-    stride_kb,
-    stride_kh,
-    stride_ks,
-    stride_kd,
-    stride_vb,
-    stride_vh,
-    stride_vs,
-    stride_vd,
-    stride_ob,
-    stride_oh,
-    stride_os,
-    stride_od,
-    scale,  # 1/sqrt(D)
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    HEAD_DIM: tl.constexpr,
-):
-    # Program IDs
-    pid_m = tl.program_id(0)  # along sequence dimension (queries)
-    pid_bh = tl.program_id(1)  # across batch*heads
-
-    b = pid_bh // H
-    h = pid_bh % H
-
-    # Offsets for this block of queries
-    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_d = tl.arange(0, HEAD_DIM)
-
-    # Base pointers for this (b, h)
-    q_bh = q_ptr + b * stride_qb + h * stride_qh
-    k_bh = k_ptr + b * stride_kb + h * stride_kh
-    v_bh = v_ptr + b * stride_vb + h * stride_vh
-    o_bh = o_ptr + b * stride_ob + h * stride_oh
-
-    # Load Q tile: [BLOCK_M, HEAD_DIM]
-    q_ptrs = q_bh + (offs_m[:, None] * stride_qs + offs_d[None, :] * stride_qd)
-    q_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D)
-    q = tl.load(q_ptrs, mask=q_mask, other=0.0)
-
-    # Initialize online-softmax stats and output accumulator
-    m_i = tl.full([BLOCK_M], -float("inf"), dtype=tl.float32)
-    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
-    acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
-
-    # Iterate over keys/values in blocks of BLOCK_N
-    for start_n in tl.range(0, S, BLOCK_N):
-        offs_n = start_n + tl.arange(0, BLOCK_N)
-        kv_mask_cols = offs_n < S
-
-        # Load K in a layout suitable for qk = q @ kT:
-        # k_ptrs produces a tensor of shape [HEAD_DIM, BLOCK_N]
-        k_ptrs = k_bh + (offs_n[None, :] * stride_ks + offs_d[:, None] * stride_kd)
-        k = tl.load(
-            k_ptrs, mask=(offs_d[:, None] < D) & (kv_mask_cols[None, :]), other=0.0
-        )
-
-        # qk = [BLOCK_M, BLOCK_N] in fp32
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk = tl.dot(q, k, qk)
-        qk = qk * scale  # scale by 1/sqrt(D)
-
-        # Mask out-of-bounds columns so they don't affect max/sum
-        qk = tl.where(kv_mask_cols[None, :], qk, -float("inf"))
-
-        # Online softmax update
-        m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
-        p = tl.exp(qk - m_ij[:, None])  # fp32
-        alpha = tl.exp(m_i - m_ij)
-        l_i = l_i * alpha + tl.sum(p, axis=1)
-        m_i = m_ij
-
-        # Load V tile: [BLOCK_N, HEAD_DIM]
-        v_ptrs = v_bh + (offs_n[:, None] * stride_vs + offs_d[None, :] * stride_vd)
-        v = tl.load(
-            v_ptrs, mask=(kv_mask_cols[:, None]) & (offs_d[None, :] < D), other=0.0
-        )
-
-        # Update output accumulator: acc = acc * alpha + p @ v
-        acc = acc * alpha[:, None]
-        # Use fp16 inputs for tl.dot with fp32
accumulation - acc = tl.dot(p.to(tl.float16), v.to(tl.float16), acc) - - # Normalize: O = acc / l_i[:, None] - o = acc / l_i[:, None] - # Store O in fp16 - o_ptrs = o_bh + (offs_m[:, None] * stride_os + offs_d[None, :] * stride_od) - o_mask = (offs_m[:, None] < S) & (offs_d[None, :] < D) - tl.store(o_ptrs, o.to(tl.float16), mask=o_mask) - - -@triton_op("custom::scaled_dot_product_attention", mutates_args={}) -def triton_scaled_dot_product_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - dropout_p: float = 0.0, - is_causal: bool = False, - scale: float = 0.0, - enable_gqa: bool = False, -) -> torch.Tensor: - """ - Fused Scaled Dot-Product Attention registered as a custom op: - O = softmax(Q @ K^T / sqrt(D)) @ V - where Q, K, V are shaped [batch, heads, seq_len, head_dim]. - - This function is registered with @triton_op so AOTI can discover and use it - during compilation as a replacement for torch.nn.functional.scaled_dot_product_attention. - - Wrapper responsibilities: - - Validate input tensors (dtype/device/shapes) - - Allocate output tensor - - Configure grid and launch the Triton kernel - - No math is done here beyond basic scalar setup; all heavy compute runs in the Triton kernel. - - Fusion details: - - This launches a single kernel that computes QK^T, performs online softmax, - and multiplies by V to produce O, all in one pass over K/V blocks. - - No intermediate attention matrix is written to global memory. - - Args: - query: Query tensor [B, H, S, D] - key: Key tensor [B, H, S, D] - value: Value tensor [B, H, S, D] - attn_mask: has to be None - is_causal: has to be False - scale: has to be None - enable_gqa: has to be False - - Returns: - Output tensor [B, H, S, D] - """ - # Basic validation - if not (query.is_cuda and key.is_cuda and value.is_cuda): - raise RuntimeError("Q, K, V must be CUDA tensors.") - if ( - query.dtype != torch.float16 - or key.dtype != torch.float16 - or value.dtype != torch.float16 - ): - raise RuntimeError("This reference implementation expects float16 tensors.") - if query.shape != key.shape or query.shape != value.shape: - raise RuntimeError( - f"Q, K, V must have identical shapes; got Q={query.shape}, K={key.shape}, V={value.shape}." - ) - if query.dim() != 4: - raise RuntimeError( - f"Expected 4D tensors shaped [B, H, S, D]; got {query.dim()}D." - ) - - # Enforce that only default values are accepted for these arguments - if attn_mask is not None: - raise RuntimeError( - "attn_mask must be None (not supported in this implementation)." - ) - - if dropout_p != 0.0: - raise RuntimeError( - "dropout_p must be 0.0 (not supported in this implementation)." - ) - if is_causal is not False: - raise RuntimeError( - "is_causal must be False (not supported in this implementation)." - ) - if scale != 0: - raise RuntimeError("scale must be None (not supported in this implementation).") - if enable_gqa is not False: - raise RuntimeError( - "enable_gqa must be False (not supported in this implementation)." 
- ) - - B, H, S, D = query.shape - - # Allocate output - O = torch.empty_like(query) - - # Choose tiling parameters (powers of two, coalesced-friendly) - # Conservative sizes to keep register/SMEM pressure reasonable for D=1024 - BLOCK_M = 16 - BLOCK_N = 32 - - # Compute softmax scale on host (scalar) - this is setup, not heavy math - scale = 1.0 / math.sqrt(float(D)) - - # Grid: one program per (query block, batch*head) - grid = (triton.cdiv(S, BLOCK_M), B * H) - - # Launch kernel using wrap_triton to avoid tracing issues during export/compile - # Note: wrap_triton returns a callable that can be indexed with grid - wrap_triton(_sdpa_fwd_kernel)[grid]( - query, - key, - value, - O, - B, - H, - S, - D, - query.stride(0), - query.stride(1), - query.stride(2), - query.stride(3), - key.stride(0), - key.stride(1), - key.stride(2), - key.stride(3), - value.stride(0), - value.stride(1), - value.stride(2), - value.stride(3), - O.stride(0), - O.stride(1), - O.stride(2), - O.stride(3), - scale, - BLOCK_M=BLOCK_M, - BLOCK_N=BLOCK_N, - HEAD_DIM=D, - num_warps=4, - num_stages=2, - ) - - return O - - -# Register the abstract/fake implementation for torch.export -# This is critical to avoid accessing real tensor data during export -@triton_scaled_dot_product_attention.register_fake -def _triton_sdpa_abstract( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - dropout_p: float = 0.0, - is_causal: bool = False, - scale=None, - enable_gqa=False, -) -> torch.Tensor: - """ - Abstract/fake implementation for torch.export. - This just returns an empty tensor with the correct shape/dtype/device. - No actual computation happens here - this is only for shape inference during export. - """ - # Validate shapes match - assert query.shape == key.shape == value.shape, "Q, K, V must have the same shape" - assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype" - - # Output has the same shape and dtype as query - # IMPORTANT: Use the exact same dtype to satisfy ExecuTorch validation - return torch.empty_like(query, dtype=query.dtype, device=query.device) diff --git a/custom_triton_playground/test-sdpa-with-custom-kernel.py b/custom_triton_playground/test-sdpa-with-custom-kernel.py deleted file mode 100644 index 16883bc83d9..00000000000 --- a/custom_triton_playground/test-sdpa-with-custom-kernel.py +++ /dev/null @@ -1,151 +0,0 @@ -# ============================================================================ -# IMPORTANT: Import sdpa_triton BEFORE defining the model -# This automatically enables the custom Triton kernel via monkey-patching -# ============================================================================ -import argparse -import os -from contextlib import nullcontext - -import torch -from executorch.backends.cuda.cuda_backend import CudaBackend -from executorch.backends.cuda.cuda_partitioner import CudaPartitioner -from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower -from optimized_sdpa_triton import optimized_triton_scaled_dot_product_attention -from sdpa_triton import triton_scaled_dot_product_attention -from torch.export import Dim, export -from torch.nn.attention import SDPBackend - - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, query, key, value): - # This is the ORIGINAL code - we're NOT changing it! 
-        # But it will automatically be routed to our custom Triton kernel,
-        # because init_sdpa_kernel() below rebinds
-        # torch.nn.functional.scaled_dot_product_attention before export
-        out = torch.nn.functional.scaled_dot_product_attention(
-            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
-        )
-        return out
-
-
-sdpa_ctx = nullcontext()
-
-
-# hacky method to replace system sdpa with my triton
-def init_sdpa_kernel(custom_triton):
-    global sdpa_ctx
-    if custom_triton == "decomposed_kernel":
-        sdpa_ctx = torch.nn.attention.sdpa_kernel([SDPBackend.MATH])
-    elif custom_triton == "unoptimized_triton":
-        torch.nn.functional.scaled_dot_product_attention = (
-            triton_scaled_dot_product_attention
-        )
-    elif custom_triton == "optimized_triton":
-        torch.nn.functional.scaled_dot_product_attention = (
-            optimized_triton_scaled_dot_product_attention
-        )
-    else:
-        assert False, f"{custom_triton} is not supported yet"
-
-
-def main(kernel_type, output_dir, dtype):
-    print(f"Using kernel type: {kernel_type}")
-    print(f"Using dtype: {dtype}")
-    init_sdpa_kernel(kernel_type)
-
-    model = Model()
-    batch_size, num_heads, seq_len, head_dim = 1, 20, 1500, 64
-
-    # Map dtype string to torch dtype
-    dtype_map = {
-        "fp16": torch.float16,
-        "bf16": torch.bfloat16,
-    }
-    torch_dtype = dtype_map[dtype]
-
-    # Create inputs with specified dtype
-    inputs = (
-        torch.randn(
-            batch_size,
-            num_heads,
-            seq_len,
-            head_dim,
-            dtype=torch_dtype,
-            device="cuda",
-        ),
-        torch.randn(
-            batch_size,
-            num_heads,
-            seq_len,
-            head_dim,
-            dtype=torch_dtype,
-            device="cuda",
-        ),
-        torch.randn(
-            batch_size,
-            num_heads,
-            seq_len,
-            head_dim,
-            dtype=torch_dtype,
-            device="cuda",
-        ),
-    )
-
-    print("Testing model execution with custom kernel...")
-    with torch.no_grad():
-        output = model(*inputs)
-    print(f"✓ Model executed successfully. Output shape: {output.shape}\n")
-
-    print("Exporting model...")
-    exported_program = export(model, inputs)
-    print("✓ Model exported successfully\n")
-
-    print("Lowering to ExecuTorch CUDA backend (using AOTI)...")
-    with sdpa_ctx, torch.no_grad():
-        executorch_program = to_edge_transform_and_lower(
-            exported_program,
-            partitioner=[
-                CudaPartitioner(
-                    [CudaBackend.generate_method_name_compile_spec("forward")]
-                )
-            ],
-            compile_config=EdgeCompileConfig(_check_ir_validity=False),
-        ).to_executorch()
-    print("✓ Model lowered successfully with AOTI\n")
-
-    print("Saving model...")
-    os.makedirs(output_dir, exist_ok=True)
-    with open(os.path.join(output_dir, "model.pte"), "wb") as file:
-        file.write(executorch_program.buffer)
-
-    executorch_program.write_tensor_data_to_file(output_dir)
-    print(f"✓ PTE and PTD files have been successfully dumped to {output_dir}\n")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Test SDPA with custom kernel")
-    parser.add_argument(
-        "--kernel_type",
-        type=str,
-        choices=["unoptimized_triton", "optimized_triton", "decomposed_kernel"],
-        help="Type of kernel to use",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default=".",
-        help="Directory to save model.pte and tensor data (default: current directory)",
-    )
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        choices=["fp16", "bf16"],
-        default="bf16",
-        help="Data type for model inputs (default: bf16)",
-    )
-
-    args = parser.parse_args()
-
-    main(args.kernel_type, args.output_dir, args.dtype)
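A safer variant of the global monkey-patching used above, sketched for illustration; the helper name override_sdpa is an assumption and this is not part of the patch. Scoping the override in a context manager guarantees the original binding is restored after export, even on error:

import contextlib
import torch

@contextlib.contextmanager
def override_sdpa(replacement):
    # Temporarily rebind F.scaled_dot_product_attention, restoring it on exit
    original = torch.nn.functional.scaled_dot_product_attention
    torch.nn.functional.scaled_dot_product_attention = replacement
    try:
        yield
    finally:
        torch.nn.functional.scaled_dot_product_attention = original

# e.g.  with override_sdpa(triton_scaled_dot_product_attention):
#           exported_program = export(model, inputs)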
diff --git a/eval.sh b/eval.sh
deleted file mode 100644
index 57f5740d446..00000000000
--- a/eval.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-# Usage: ./evaluate_kernel.sh <kernel_name> <n_eval>
-KERNEL_NAME=$1
-N_EVAL=$2
-
-# Path prefix
-BASE_PATH=~/kernel-gen/whisper-large-v3-turbo/${KERNEL_NAME}/
-
-MODEL_PATH=${BASE_PATH}model.pte
-DATA_PATH=${BASE_PATH}aoti_cuda_blob.ptd
-TOKENIZER_PATH=${BASE_PATH}
-AUDIO_PATH=${BASE_PATH}output.wav
-PROCESSOR_PATH=${BASE_PATH}whisper_preprocessor.pte
-
-CMD="cmake-out/examples/models/whisper/whisper_runner \
-    --model_path ${MODEL_PATH} \
-    --data_path ${DATA_PATH} \
-    --temperature 0 \
-    --tokenizer_path ${TOKENIZER_PATH} \
-    --audio_path ${AUDIO_PATH} \
-    --processor_path ${PROCESSOR_PATH}"
-
-rates=()
-for ((i=1; i<=N_EVAL; i++)); do
-    echo "Running evaluation $i/$N_EVAL..."
-    output=$($CMD 2>&1)
-    # awk is the recommended way to pull the rate field
-    rate=$(echo "$output" | grep "Generated 128 tokens:" | awk '{print $(NF-1)}')
-    echo "Generated token rate for run $i: $rate"
-    if [[ ! -z "$rate" ]]; then
-        rates+=($rate)
-    fi
-done
-
-# Compute the average
-sum=0
-count=0
-for r in "${rates[@]}"; do
-    # Only count non-empty numeric values
-    if [[ ! -z "$r" ]]; then
-        sum=$(echo "$sum + $r" | bc)
-        count=$((count+1))
-    fi
-done
-
-if [[ $count -gt 0 ]]; then
-    avg=$(echo "scale=2; $sum / $count" | bc)
-    echo "Average Generated token rate over $count runs: $avg tokens/second"
-else
-    echo "No valid token rates found."
-fi
diff --git a/examples/models/__init__.py b/examples/models/__init__.py
index 45abfd8f89d..d08bbfe59ee 100644
--- a/examples/models/__init__.py
+++ b/examples/models/__init__.py
@@ -15,6 +15,7 @@ class Model(str, Enum):
     AddMul = "add_mul"
     Softmax = "softmax"
     Conv1d = "conv1d"
+    Sdpa = "sdpa"
     Dl3 = "dl3"
     Edsr = "edsr"
     EmformerTranscribe = "emformer_transcribe"
@@ -62,6 +63,7 @@ def __str__(self) -> str:
     str(Model.AddMul): ("toy_model", "AddMulModule"),
     str(Model.Softmax): ("toy_model", "SoftmaxModule"),
     str(Model.Conv1d): ("toy_model", "Conv1dModule"),
+    str(Model.Sdpa): ("toy_model", "SdpaModule"),
     str(Model.Dl3): ("deeplab_v3", "DeepLabV3ResNet50Model"),
     str(Model.Edsr): ("edsr", "EdsrModel"),
     str(Model.EmformerTranscribe): ("emformer_rnnt", "EmformerRnntTranscriberModel"),
diff --git a/examples/models/toy_model/__init__.py b/examples/models/toy_model/__init__.py
index 333a625af1b..87456e3fd4c 100644
--- a/examples/models/toy_model/__init__.py
+++ b/examples/models/toy_model/__init__.py
@@ -10,6 +10,7 @@
     Conv1dModule,
     LinearModule,
     MulModule,
+    SdpaModule,
     SoftmaxModule,
 )
 
@@ -19,5 +20,6 @@
     Conv1dModule,
     LinearModule,
     MulModule,
+    SdpaModule,
     SoftmaxModule,
 ]
diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py
index e1dd290b829..ed6a9faf314 100644
--- a/examples/models/toy_model/model.py
+++ b/examples/models/toy_model/model.py
@@ -105,3 +105,33 @@ def get_eager_model(self) -> torch.nn.Module:
 
     def get_example_inputs(self):
         return (torch.randn(1, 3, 10),)
+
+
+class SdpaModule(torch.nn.Module, EagerModelBase):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, query, key, value):
+        out = torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=None,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        return out
+
+    def get_eager_model(self) -> torch.nn.Module:
+        return self
+
+    def get_example_inputs(self):
+        # Input shape: (batch, num_heads, seq_len, head_dim)
+        batch_size = 2
+        num_heads = 8
+        seq_len = 128
+        head_dim = 64
+        query = torch.randn(batch_size, num_heads, seq_len, head_dim)
+        key = torch.randn(batch_size, num_heads, seq_len, head_dim)
+        value = torch.randn(batch_size, num_heads, seq_len, head_dim)
+        return (query, key, value)
diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h
index b587628fd1d..1a30e2cc4df 100644
---
a/extension/runner_util/inputs.h +++ b/extension/runner_util/inputs.h @@ -64,7 +64,7 @@ struct PrepareInputTensorsOptions { * all inputs exceeds this, an error is returned. This prevents allocating too * much memory if the PTE file is malformed. */ - size_t max_total_allocation_size = 1024 * 1024 * 1024 * 10; + size_t max_total_allocation_size = 1024 * 1024 * 1024; /** * The maximum number of inputs to allocate. If the number of inputs exceeds From 9c43c113de5f96e4bc838ae36e2698ed66d89ed7 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 17 Nov 2025 01:17:42 -0800 Subject: [PATCH 6/7] remove test result file --- decomposed-freezing-result.txt | 184 --------------------------------- triton-v2-freeze-result.txt | 184 --------------------------------- triton-v3-result.txt | 184 --------------------------------- 3 files changed, 552 deletions(-) delete mode 100644 decomposed-freezing-result.txt delete mode 100644 triton-v2-freeze-result.txt delete mode 100644 triton-v3-result.txt diff --git a/decomposed-freezing-result.txt b/decomposed-freezing-result.txt deleted file mode 100644 index 375ee7c034b..00000000000 --- a/decomposed-freezing-result.txt +++ /dev/null @@ -1,184 +0,0 @@ -Running Whisper benchmark 50 times... -Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 - - -[Run 1/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1138ms, Inference: 292ms, First token: 14ms - -[Run 2/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1162ms, Inference: 297ms, First token: 14ms - -[Run 3/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 407.64 t/s, Model load: 1096ms, Inference: 314ms, First token: 14ms - -[Run 4/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1076ms, Inference: 301ms, First token: 14ms - -[Run 5/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1113ms, Inference: 304ms, First token: 14ms - -[Run 6/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 400.00 t/s, Model load: 1121ms, Inference: 320ms, First token: 21ms - -[Run 7/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1149ms, Inference: 308ms, First token: 16ms - -[Run 8/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte 
--model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1093ms, Inference: 294ms, First token: 14ms - -[Run 9/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1093ms, Inference: 313ms, First token: 15ms - -[Run 10/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1105ms, Inference: 298ms, First token: 14ms - -[Run 11/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1117ms, Inference: 315ms, First token: 14ms - -[Run 12/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1074ms, Inference: 327ms, First token: 14ms - -[Run 13/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 435.37 t/s, Model load: 1086ms, Inference: 294ms, First token: 14ms - -[Run 14/50] Executing: 
cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1133ms, Inference: 331ms, First token: 14ms - -[Run 15/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1120ms, Inference: 311ms, First token: 15ms - -[Run 16/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 393.85 t/s, Model load: 1118ms, Inference: 325ms, First token: 18ms - -[Run 17/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1008ms, Inference: 309ms, First token: 19ms - -[Run 18/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1074ms, Inference: 296ms, First token: 14ms - -[Run 19/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1045ms, Inference: 297ms, First token: 14ms - -[Run 20/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1077ms, Inference: 296ms, First token: 14ms - -[Run 21/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 429.53 t/s, Model load: 1095ms, Inference: 298ms, First token: 14ms - -[Run 22/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 423.84 t/s, Model load: 1019ms, Inference: 302ms, First token: 14ms - -[Run 23/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1269ms, Inference: 313ms, First token: 14ms - -[Run 24/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed 
--audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1039ms, Inference: 313ms, First token: 18ms - -[Run 25/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1080ms, Inference: 297ms, First token: 14ms - -[Run 26/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 438.36 t/s, Model load: 1099ms, Inference: 292ms, First token: 14ms - -[Run 27/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1200ms, Inference: 296ms, First token: 14ms - -[Run 28/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 984ms, Inference: 305ms, First token: 14ms - -[Run 29/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1110ms, Inference: 305ms, First token: 15ms - -[Run 30/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1063ms, Inference: 299ms, First token: 14ms - -[Run 31/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 382.09 t/s, Model load: 1117ms, Inference: 335ms, First token: 14ms - -[Run 32/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1128ms, Inference: 310ms, First token: 14ms - -[Run 33/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 428.09 t/s, Model load: 1173ms, Inference: 299ms, First token: 14ms - -[Run 34/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 426.67 
t/s, Model load: 1077ms, Inference: 300ms, First token: 14ms
-
-[Run 35/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed-freezing/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0
-✓ Tokens: 128, Throughput: 436.86 t/s, Model load: 1078ms, Inference: 293ms, First token: 14ms
-
-[Run 36/50] ✓ Tokens: 128, Throughput: 430.98 t/s, Model load: 1003ms, Inference: 297ms, First token: 16ms
-[Run 37/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1103ms, Inference: 307ms, First token: 15ms
-[Run 38/50] ✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1053ms, Inference: 313ms, First token: 15ms
-[Run 39/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1091ms, Inference: 307ms, First token: 14ms
-[Run 40/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1850ms, Inference: 331ms, First token: 14ms
-[Run 41/50] ✓ Tokens: 128, Throughput: 384.38 t/s, Model load: 1017ms, Inference: 333ms, First token: 14ms
-[Run 42/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1090ms, Inference: 303ms, First token: 14ms
-[Run 43/50] ✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1154ms, Inference: 329ms, First token: 16ms
-[Run 44/50] ✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1085ms, Inference: 342ms, First token: 13ms
-[Run 45/50] ✓ Tokens: 128, Throughput: 426.67 t/s, Model load: 1098ms, Inference: 300ms, First token: 14ms
-[Run 46/50] ✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1093ms, Inference: 312ms, First token: 14ms
-[Run 47/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1099ms, Inference: 304ms, First token: 14ms
-[Run 48/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1113ms, Inference: 303ms, First token: 15ms
-[Run 49/50] ✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1089ms, Inference: 311ms, First token: 19ms
-[Run 50/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1151ms, Inference: 303ms, First token: 15ms
-
-======================================================================
-BENCHMARK SUMMARY
-======================================================================
-Total runs: 50
-Generated tokens per run: 128
-
-THROUGHPUT (tokens/sec):
-  Min:   374.27 t/s
-  Max:   438.36 t/s
-  Mean:  416.41 t/s
-  Stdev: 16.45 t/s
-
-MODEL LOAD TIME (ms):
-  Min:   984 ms
-  Max:   1850 ms
-  Mean:  1112 ms
-  Stdev: 117 ms
-
-INFERENCE TIME (ms, prompt_eval_end to inference_end):
-  Min:   292 ms
-  Max:   342 ms
-  Mean:  308 ms
-  Stdev: 13 ms
-
-FIRST TOKEN LATENCY (ms):
-  Min:   13 ms
-  Max:   21 ms
-  Mean:  15 ms
-  Stdev: 2 ms
-======================================================================
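The summary block above is plain descriptive statistics over the 50 per-run lines, and each per-run throughput is just Tokens divided by inference time (e.g. 128 tokens / 0.303 s ≈ 422.44 t/s for run 50). A minimal sketch of how such a summary could be recomputed from one of these result files, assuming the exact "✓ Tokens: ..." line format shown above (this is not the script that produced the logs):

# sketch: recompute the BENCHMARK SUMMARY from a saved result file
# (assumed log format; not the original benchmarking script)
import re
import statistics

RUN_LINE = re.compile(
    r"Throughput: (?P<tps>[\d.]+) t/s, Model load: (?P<load>\d+)ms, "
    r"Inference: (?P<inf>\d+)ms, First token: (?P<first>\d+)ms"
)

def summarize(path: str) -> None:
    with open(path) as f:
        runs = [m.groupdict() for m in RUN_LINE.finditer(f.read())]
    for name, values in [
        ("THROUGHPUT (tokens/sec)", [float(r["tps"]) for r in runs]),
        ("MODEL LOAD TIME (ms)", [int(r["load"]) for r in runs]),
        ("INFERENCE TIME (ms)", [int(r["inf"]) for r in runs]),
        ("FIRST TOKEN LATENCY (ms)", [int(r["first"]) for r in runs]),
    ]:
        print(name)
        print(f"  Min: {min(values):.2f}  Max: {max(values):.2f}")
        # statistics.stdev is the sample standard deviation (n - 1)
        print(f"  Mean: {statistics.mean(values):.2f}  Stdev: {statistics.stdev(values):.2f}")

summarize("triton-v2-freeze-result.txt")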
diff --git a/triton-v2-freeze-result.txt b/triton-v2-freeze-result.txt
deleted file mode 100644
index 16dfd1da218..00000000000
--- a/triton-v2-freeze-result.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-Running Whisper benchmark 50 times...
-Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v2-freeze/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0
-
-[Run 1/50] ✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 1070ms, Inference: 326ms, First token: 13ms
-[Run 2/50] ✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1036ms, Inference: 315ms, First token: 14ms
-[Run 3/50] ✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1043ms, Inference: 327ms, First token: 13ms
-[Run 4/50] ✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1027ms, Inference: 330ms, First token: 13ms
-[Run 5/50] ✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 1078ms, Inference: 345ms, First token: 14ms
-[Run 6/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 987ms, Inference: 331ms, First token: 13ms
-[Run 7/50] ✓ Tokens: 128, Throughput: 389.06 t/s, Model load: 1112ms, Inference: 329ms, First token: 13ms
-[Run 8/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 982ms, Inference: 337ms, First token: 15ms
-[Run 9/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1040ms, Inference: 331ms, First token: 13ms
-[Run 10/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1048ms, Inference: 334ms, First token: 13ms
-[Run 11/50] ✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 974ms, Inference: 336ms, First token: 13ms
-[Run 12/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 1059ms, Inference: 341ms, First token: 13ms
-[Run 13/50] ✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 1010ms, Inference: 330ms, First token: 12ms
-[Run 14/50] ✓ Tokens: 128, Throughput: 386.71 t/s, Model load: 1001ms, Inference: 331ms, First token: 13ms
-[Run 15/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 1001ms, Inference: 337ms, First token: 13ms
-[Run 16/50] ✓ Tokens: 128, Throughput: 398.75 t/s, Model load: 1093ms, Inference: 321ms, First token: 13ms
-[Run 17/50] ✓ Tokens: 128, Throughput: 392.64 t/s, Model load: 966ms, Inference: 326ms, First token: 12ms
-[Run 18/50] ✓ Tokens: 128, Throughput: 391.44 t/s, Model load: 1022ms, Inference: 327ms, First token: 13ms
-[Run 19/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1020ms, Inference: 334ms, First token: 12ms
-[Run 20/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 987ms, Inference: 339ms, First token: 13ms
-[Run 21/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1115ms, Inference: 352ms, First token: 16ms
-[Run 22/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1012ms, Inference: 339ms, First token: 13ms
-[Run 23/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 1079ms, Inference: 346ms, First token: 13ms
-[Run 24/50] ✓ Tokens: 128, Throughput: 425.25 t/s, Model load: 1067ms, Inference: 301ms, First token: 14ms
-[Run 25/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1147ms, Inference: 307ms, First token: 14ms
-[Run 26/50] ✓ Tokens: 128, Throughput: 406.35 t/s, Model load: 1106ms, Inference: 315ms, First token: 15ms
-[Run 27/50] ✓ Tokens: 128, Throughput: 396.28 t/s, Model load: 1048ms, Inference: 323ms, First token: 12ms
-[Run 28/50] ✓ Tokens: 128, Throughput: 433.90 t/s, Model load: 1098ms, Inference: 295ms, First token: 14ms
-[Run 29/50] ✓ Tokens: 128, Throughput: 287.64 t/s, Model load: 1091ms, Inference: 445ms, First token: 14ms
-[Run 30/50] ✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 18ms
-[Run 31/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1075ms, Inference: 304ms, First token: 21ms
-[Run 32/50] ✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1029ms, Inference: 310ms, First token: 14ms
-[Run 33/50] ✓ Tokens: 128, Throughput: 408.95 t/s, Model load: 1177ms, Inference: 313ms, First token: 14ms
-[Run 34/50] ✓ Tokens: 128, Throughput: 419.67 t/s, Model load: 1188ms, Inference: 305ms, First token: 15ms
-[Run 35/50] ✓ Tokens: 128, Throughput: 390.24 t/s, Model load: 999ms, Inference: 328ms, First token: 13ms
-[Run 36/50] ✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 1042ms, Inference: 338ms, First token: 13ms
-[Run 37/50] ✓ Tokens: 128, Throughput: 412.90 t/s, Model load: 1074ms, Inference: 310ms, First token: 15ms
-[Run 38/50] ✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1065ms, Inference: 308ms, First token: 18ms
-[Run 39/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1173ms, Inference: 304ms, First token: 14ms
-[Run 40/50] ✓ Tokens: 128, Throughput: 422.44 t/s, Model load: 1034ms, Inference: 303ms, First token: 16ms
-[Run 41/50] ✓ Tokens: 128, Throughput: 245.21 t/s, Model load: 1118ms, Inference: 522ms, First token: 15ms
-[Run 42/50] ✓ Tokens: 128, Throughput: 380.95 t/s, Model load: 1043ms, Inference: 336ms, First token: 16ms
-[Run 43/50] ✓ Tokens: 128, Throughput: 387.88 t/s, Model load: 999ms, Inference: 330ms, First token: 13ms
-[Run 44/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 1262ms, Inference: 334ms, First token: 14ms
-[Run 45/50] ✓ Tokens: 128, Throughput: 415.58 t/s, Model load: 1059ms, Inference: 308ms, First token: 14ms
-[Run 46/50] ✓ Tokens: 128, Throughput: 421.05 t/s, Model load: 1019ms, Inference: 304ms, First token: 14ms
-[Run 47/50] ✓ Tokens: 128, Throughput: 432.43 t/s, Model load: 1015ms, Inference: 296ms, First token: 14ms
-[Run 48/50] ✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1074ms, Inference: 312ms, First token: 15ms
-[Run 49/50] ✓ Tokens: 128, Throughput: 411.58 t/s, Model load: 1059ms, Inference: 311ms, First token: 15ms
-[Run 50/50] ✓ Tokens: 128, Throughput: 416.94 t/s, Model load: 1064ms, Inference: 307ms, First token: 14ms
-
-======================================================================
-BENCHMARK SUMMARY
-======================================================================
-Total runs: 50
-Generated tokens per run: 128
-
-THROUGHPUT (tokens/sec):
-  Min:   245.21 t/s
-  Max:   433.90 t/s
-  Mean:  392.57 t/s
-  Stdev: 31.88 t/s
-
-MODEL LOAD TIME (ms):
-  Min:   966 ms
-  Max:   1262 ms
-  Mean:  1059 ms
-  Stdev: 59 ms
-
-INFERENCE TIME (ms, prompt_eval_end to inference_end):
-  Min:   295 ms
-  Max:   522 ms
-  Mean:  329 ms
-  Stdev: 36 ms
-
-FIRST TOKEN LATENCY (ms):
-  Min:   12 ms
-  Max:   21 ms
-  Mean:  14 ms
-  Stdev: 2 ms
-======================================================================
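The "Running Whisper benchmark 50 times..." header implies a small driver loop around whisper_runner. That script is not part of this patch; a hypothetical equivalent, with the command taken verbatim from the log below, might look like:

# hypothetical driver for the 50-run benchmark; the actual script that
# produced these logs is not included in this patch
import subprocess

CMD = [
    "cmake-out/examples/models/whisper/whisper_runner",
    "--model_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte",
    "--data_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd",
    "--tokenizer_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed",
    "--audio_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav",
    "--processor_path", "/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte",
    "--model_name", "whisper_large_v3",
    "--temperature", "0",
]

for i in range(50):
    print(f"[Run {i + 1}/50] Executing: {' '.join(CMD)}")
    # whisper_runner itself prints the per-run Tokens/Throughput/latency line
    subprocess.run(CMD, check=True)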
diff --git a/triton-v3-result.txt b/triton-v3-result.txt
deleted file mode 100644
index 2c426a28eee..00000000000
--- a/triton-v3-result.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-Running Whisper benchmark 50 times...
-Command: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0
-
-[Run 1/50] ✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 969ms, Inference: 347ms, First token: 13ms
-[Run 2/50] ✓ Tokens: 128, Throughput: 378.70 t/s, Model load: 922ms, Inference: 338ms, First token: 12ms
-[Run 3/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 915ms, Inference: 337ms, First token: 13ms
-[Run 4/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1004ms, Inference: 339ms, First token: 12ms
-[Run 5/50] ✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1030ms, Inference: 348ms, First token: 13ms
-[Run 6/50] ✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 988ms, Inference: 348ms, First token: 13ms
-[Run 7/50] ✓ Tokens: 128, Throughput: 355.56 t/s, Model load: 1017ms, Inference: 360ms, First token: 14ms
-[Run 8/50] ✓ Tokens: 128, Throughput: 379.82 t/s, Model load: 992ms, Inference: 337ms, First token: 12ms
-[Run 9/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 995ms, Inference: 350ms, First token: 13ms
-[Run 10/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 1014ms, Inference: 341ms, First token: 13ms
-[Run 11/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 944ms, Inference: 346ms, First token: 12ms
-[Run 12/50] ✓ Tokens: 128, Throughput: 371.01 t/s, Model load: 976ms, Inference: 345ms, First token: 16ms
-[Run 13/50] ✓ Tokens: 128, Throughput: 385.54 t/s, Model load: 1046ms, Inference: 332ms, First token: 14ms
-[Run 14/50] ✓ Tokens: 128, Throughput: 352.62 t/s, Model load: 1014ms, Inference: 363ms, First token: 13ms
-[Run 15/50] ✓ Tokens: 128, Throughput: 343.16 t/s, Model load: 1084ms, Inference: 373ms, First token: 14ms
-[Run 16/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 990ms, Inference: 341ms, First token: 13ms
-[Run 17/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 958ms, Inference: 350ms, First token: 13ms
-[Run 18/50] ✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 997ms, Inference: 349ms, First token: 15ms
-[Run 19/50] ✓ Tokens: 128, Throughput: 361.58 t/s, Model load: 1038ms, Inference: 354ms, First token: 13ms
-[Run 20/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1044ms, Inference: 350ms, First token: 12ms
-[Run 21/50] ✓ Tokens: 128, Throughput: 358.54 t/s, Model load: 1027ms, Inference: 357ms, First token: 13ms
-[Run 22/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 943ms, Inference: 346ms, First token: 13ms
-[Run 23/50] ✓ Tokens: 128, Throughput: 410.26 t/s, Model load: 1005ms, Inference: 312ms, First token: 14ms
-[Run 24/50] ✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1016ms, Inference: 350ms, First token: 13ms
-[Run 25/50] ✓ Tokens: 128, Throughput: 405.06 t/s, Model load: 981ms, Inference: 316ms, First token: 14ms
-[Run 26/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1007ms, Inference: 352ms, First token: 13ms
-[Run 27/50] ✓ Tokens: 128, Throughput: 374.27 t/s, Model load: 1067ms, Inference: 342ms, First token: 13ms
-[Run 28/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1100ms, Inference: 352ms, First token: 15ms
-[Run 29/50] ✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 957ms, Inference: 340ms, First token: 13ms
-[Run 30/50] ✓ Tokens: 128, Throughput: 363.64 t/s, Model load: 1006ms, Inference: 352ms, First token: 13ms
-[Run 31/50] ✓ Tokens: 128, Throughput: 354.57 t/s, Model load: 1040ms, Inference: 361ms, First token: 13ms
-[Run 32/50] ✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 973ms, Inference: 334ms, First token: 12ms
-[Run 33/50] ✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 982ms, Inference: 349ms, First token: 12ms
-[Run 34/50] ✓ Tokens: 128, Throughput: 368.88 t/s, Model load: 958ms, Inference: 347ms, First token: 13ms
-[Run 35/50] ✓ Tokens: 128, Throughput: 414.24 t/s, Model load: 1031ms, Inference: 309ms, First token: 14ms
-[Run 36/50] ✓ Tokens: 128, Throughput: 366.76 t/s, Model load: 962ms, Inference: 349ms, First token: 13ms
-[Run 37/50] ✓ Tokens: 128, Throughput: 375.37 t/s, Model load: 967ms, Inference: 341ms, First token: 18ms
-[Run 38/50] ✓ Tokens: 128, Throughput: 369.94 t/s, Model load: 977ms, Inference: 346ms, First token: 16ms
-[Run 39/50] ✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 1016ms, Inference: 353ms, First token: 18ms
-[Run 40/50] ✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1175ms, Inference: 339ms, First token: 13ms
-[Run 41/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name
whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 373.18 t/s, Model load: 964ms, Inference: 343ms, First token: 18ms - -[Run 42/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1074ms, Inference: 362ms, First token: 14ms - -[Run 43/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 362.61 t/s, Model load: 981ms, Inference: 353ms, First token: 13ms - -[Run 44/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 367.82 t/s, Model load: 1010ms, Inference: 348ms, First token: 13ms - -[Run 45/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1023ms, Inference: 339ms, First token: 12ms - -[Run 46/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 275.86 t/s, Model load: 1225ms, Inference: 464ms, First token: 19ms - -[Run 47/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path 
/home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 376.47 t/s, Model load: 964ms, Inference: 340ms, First token: 12ms - -[Run 48/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 383.23 t/s, Model load: 979ms, Inference: 334ms, First token: 13ms - -[Run 49/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 377.58 t/s, Model load: 1003ms, Inference: 339ms, First token: 12ms - -[Run 50/50] Executing: cmake-out/examples/models/whisper/whisper_runner --model_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/model.pte --data_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/triton-v3/aoti_cuda_blob.ptd --tokenizer_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed --audio_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/output.wav --processor_path /home/gasoonjia/kernel-gen/whisper-large-v3-turbo/decomposed/whisper_preprocessor.pte --model_name whisper_large_v3 --temperature 0 -✓ Tokens: 128, Throughput: 353.59 t/s, Model load: 1206ms, Inference: 362ms, First token: 14ms - -====================================================================== -BENCHMARK SUMMARY -====================================================================== -Total runs: 50 -Generated tokens per run: 128 - -THROUGHPUT (tokens/sec): - Min: 275.86 t/s - Max: 414.24 t/s - Mean: 369.34 t/s - Stdev: 18.90 t/s - -MODEL LOAD TIME (ms): - Min: 915 ms - Max: 1225 ms - Mean: 1011 ms - Stdev: 62 ms - -INFERENCE TIME (ms, prompt_eval_end to inference_end): - Min: 309 ms - Max: 464 ms - Mean: 348 ms - Stdev: 21 ms - -FIRST TOKEN LATENCY (ms): - Min: 12 ms - Max: 19 ms - Mean: 14 ms - Stdev: 2 ms -====================================================================== From fcff1c7deb462d08d8e4609d8eddea5cd92ab498 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 17 Nov 2025 01:20:42 -0800 Subject: [PATCH 7/7] remove extra cuda backend changes --- backends/cuda/cuda_backend.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py 
index 3506cb685fe..772e24c75b3 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -178,7 +178,7 @@ def preprocess( # noqa: C901
             if spec.key == "shim_library_path":
                 shim_library_path = spec.value.decode("utf-8")
 
-        assert platform == "linux"
+        assert platform == "linux" or platform == "windows"
         if platform == "windows" and shim_library_path is None:
             lib_dir = resources.files("executorch").joinpath("data/lib")
             shim_library_path = str(lib_dir)
@@ -220,8 +220,6 @@ def preprocess( # noqa: C901
                 f"Could not find required files in compiled paths, got {paths}"
             )
 
-        print("--- Generate .so lives at", so_path)
-
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
            so_data = f.read()
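
Note (illustrative, not part of the patch series): the BENCHMARK SUMMARY above reduces the 50 per-run result lines to min/max/mean/stdev for each metric. Below is a minimal Python sketch of how those statistics could be recomputed from the raw log; the log-line format is taken from the runs above, while the script itself and its names (RESULT_RE, summarize) are hypothetical.

    import re
    import statistics

    # Matches result lines such as:
    # "✓ Tokens: 128, Throughput: 365.71 t/s, Model load: 1044ms, Inference: 350ms, First token: 12ms"
    RESULT_RE = re.compile(
        r"Throughput: (?P<tps>[\d.]+) t/s, "
        r"Model load: (?P<load>\d+)ms, "
        r"Inference: (?P<infer>\d+)ms, "
        r"First token: (?P<first>\d+)ms"
    )

    def summarize(log_text: str) -> dict:
        """Collect per-run metrics from a benchmark log and reduce each to min/max/mean/stdev."""
        runs = [m.groupdict() for m in RESULT_RE.finditer(log_text)]
        summary = {}
        for key, cast in (("tps", float), ("load", int), ("infer", int), ("first", int)):
            values = [cast(r[key]) for r in runs]
            summary[key] = {
                "min": min(values),
                "max": max(values),
                "mean": statistics.mean(values),
                # Sample standard deviation, consistent with a summary computed over n runs.
                "stdev": statistics.stdev(values),
            }
        return summary

    # Example usage (hypothetical file name):
    # print(summarize(open("benchmark.log").read()))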