
Commit 86e45f8

update torch_ext API and debugging test for FusedAddRMSNorm

update #define for hopper & blackwell

Signed-off-by: JtaoPeng <[email protected]>

1 parent: dc32bac

8 files changed, +476 −31 lines changed


cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh

Lines changed: 3 additions & 3 deletions
@@ -139,7 +139,7 @@ struct LowLatencyLayerNorm
     for (int i = 0; i < PACKED_PER_N_BLOCK; i++)
     {
         auto offset = (thread_id + i * N_THREADS) * Traits::PACKED_ELEMS_PER_COMPUTE;
-        if (offset <= sz)
+        if (offset < sz)
         {
             data[i] = *reinterpret_cast<PackedType const*>(&g_data[offset]);
         }
@@ -260,11 +260,11 @@ struct LowLatencyLayerNorm
     {
         mean = var_and_mean[1] / param.n;
         variance = rsqrtf(
-            var_and_mean[0] / param.n - var_and_mean[1] * var_and_mean[1] + (Traits::AccumulatorType)(1e-5));
+            var_and_mean[0] / param.n - var_and_mean[1] * var_and_mean[1] + (Traits::AccumulatorType)(param.layernorm_eps));
     }
     else
     {
-        variance = rsqrtf(var_and_mean[0] / param.n + (Traits::AccumulatorType)(1e-5));
+        variance = rsqrtf(var_and_mean[0] / param.n + (Traits::AccumulatorType)(param.layernorm_eps));
     }

     for (int i = 0; i < PACKED_PER_N_BLOCK; i++)
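The two rsqrtf branches above are the LayerNorm path (mean subtracted) and the RMSNorm path (mean skipped); the change threads param.layernorm_eps through instead of the hard-coded 1e-5, and tightens the packed-load bound to offset < sz. A small Python reference of the corresponding textbook formulas, handy when debugging the eps plumbing (hypothetical helper, not part of this commit; the kernel's reduction and accumulator details differ):

import torch

def reference_inv_stddev(x: torch.Tensor, eps: float, rms_norm: bool) -> torch.Tensor:
    # Inverse standard deviation over the hidden dimension, with configurable eps.
    x = x.float()
    mean_sq = x.pow(2).mean(dim=-1)                     # E[x^2]
    if rms_norm:
        return torch.rsqrt(mean_sq + eps)               # RMSNorm: no mean subtraction
    mean = x.mean(dim=-1)
    return torch.rsqrt(mean_sq - mean * mean + eps)     # LayerNorm: E[x^2] - E[x]^2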

cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh

Lines changed: 25 additions & 16 deletions
@@ -201,9 +201,11 @@ struct WarpSpecializedLayerNorm
         }
         // if (blockIdx.x == 0) printf("Pushed tile %d to MATH.\n", m_base);

+        const uint32_t eff_m_block
+            = std::min(static_cast<uint32_t>(Traits::M_BLOCK), static_cast<uint32_t>(param.m - m_base));
         const auto tx
-            = (Traits::M_BLOCK * param.n * sizeof(typename Traits::InputType) * (Traits::RESIDUAL ? 2 : 1))
-            + (FIRST_RUN ? sizeof(AuxData) / Traits::N_BLOCK * param.n : 0);
+            = (eff_m_block * param.n * sizeof(typename Traits::InputType) * (Traits::RESIDUAL ? 2 : 1))
+            + (FIRST_RUN ? (sizeof(AuxData) / Traits::N_BLOCK * param.n) : 0);

         auto vec_buffer_ptr = input_vec_fifo_w.tmaReserve(tx);

@@ -216,10 +218,13 @@ struct WarpSpecializedLayerNorm

         for (int i = 0; i < Traits::M_BLOCK; i++)
         {
-            load_a_vec(&param.input[(m_base + i) * param.n],
-                __nvvm_get_smem_pointer(&shared->input_vec[vec_buffer_ptr][0][i * Traits::N_BLOCK]),
-                param.n * sizeof(typename Traits::InputType),
-                __nvvm_get_smem_pointer(input_vec_fifo_w.barrier_ptr(vec_buffer_ptr)));
+            if (i < eff_m_block) [[likely]]
+            {
+                load_a_vec(&param.input[(m_base + i) * param.n],
+                    __nvvm_get_smem_pointer(&shared->input_vec[vec_buffer_ptr][0][i * Traits::N_BLOCK]),
+                    param.n * sizeof(typename Traits::InputType),
+                    __nvvm_get_smem_pointer(input_vec_fifo_w.barrier_ptr(vec_buffer_ptr)));
+            }
         }

         // Use templated lambdas to defer resolving the symbols like "param.residual".
@@ -231,10 +236,13 @@ struct WarpSpecializedLayerNorm
         {
             for (int i = 0; i < Traits::M_BLOCK; i++)
             {
-                load_a_vec(&param.residual[(m_base + i) * param.n],
-                    __nvvm_get_smem_pointer(&shared->input_vec[vec_buffer_ptr][1][i * Traits::N_BLOCK]),
-                    param.n * sizeof(typename Traits::InputType),
-                    __nvvm_get_smem_pointer(input_vec_fifo_w.barrier_ptr(vec_buffer_ptr)));
+                if (i < eff_m_block) [[likely]]
+                {
+                    load_a_vec(&param.residual[(m_base + i) * param.n],
+                        __nvvm_get_smem_pointer(&shared->input_vec[vec_buffer_ptr][1][i * Traits::N_BLOCK]),
+                        param.n * sizeof(typename Traits::InputType),
+                        __nvvm_get_smem_pointer(input_vec_fifo_w.barrier_ptr(vec_buffer_ptr)));
+                }
             }
         }(param);
     }
@@ -446,6 +454,9 @@ struct WarpSpecializedLayerNorm
         {
             m_base = block_id;
         }
+        const uint32_t eff_m_block
+            = std::min(static_cast<uint32_t>(Traits::M_BLOCK), static_cast<uint32_t>(param.m - m_base));
+
         // if (blockIdx.x == 0 && thread_id == 0) printf("MATH got tile %d.\n", m_base);

         // Peek for data ready.
@@ -613,11 +624,11 @@ struct WarpSpecializedLayerNorm
         {
             mean[m_offset] /= param.n;
             variance[m_offset] = rsqrtf(variance[m_offset] / param.n - mean[m_offset] * mean[m_offset]
-                + (Traits::AccumulatorType)(1e-5));
+                + (Traits::AccumulatorType)(param.layernorm_eps));
         }
         else
         {
-            variance[m_offset] = rsqrtf(variance[m_offset] / param.n + (Traits::AccumulatorType)(1e-5));
+            variance[m_offset] = rsqrtf(variance[m_offset] / param.n + (Traits::AccumulatorType)(param.layernorm_eps));
         }
     }

@@ -660,7 +671,7 @@ struct WarpSpecializedLayerNorm
         }

#pragma unroll Traits::M_BLOCK
-        for (int m_offset = 0; m_offset < Traits::M_BLOCK; m_offset++)
+        for (int m_offset = 0; m_offset < eff_m_block; m_offset++)
         {
             auto m = m_base + m_offset;

@@ -801,8 +812,7 @@ struct WarpSpecializedLayerNorm
     shared->init(threadIdx.x == 0);

     __syncthreads();
-#if (defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 12))
-#if (defined(__CUDA_ARCH_FEAT_SM90_ALL) || defined(__CUDA_ARCH_FEAT_SM100_ALL))
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && (__CUDACC_VER_MAJOR__ >= 12)
     if constexpr (arch::is_major_v<9> || arch::is_major_v<10>)
     {
         auto block_id = blockIdx.x;
@@ -830,7 +840,6 @@ struct WarpSpecializedLayerNorm
             compute(block_id, threadIdx.x / 128 - 1, tid_in_wg, param, shared);
         }
     }
-#endif
#endif
 }
};
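The recurring eff_m_block = min(M_BLOCK, param.m - m_base) clamp bounds the last tile along M, so the TMA transaction size, the per-row vector loads, and the math loop all stop at param.m instead of touching a full M_BLOCK past the end. A tiny Python sketch of the same tiling arithmetic (illustrative M_BLOCK value; the real value comes from Traits):

def tile_rows(m: int, m_block: int):
    # Yields (m_base, eff_m_block) per tile; the tail tile is partial when m % m_block != 0.
    for m_base in range(0, m, m_block):
        yield m_base, min(m_block, m - m_base)

# e.g. m=21, m_block=8 -> (0, 8), (8, 8), (16, 5)
print(list(tile_rows(21, 8)))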

cpp/tensorrt_llm/thop/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ add_library(
   fp8Quantize.cpp
   dsv3FusedAGemmOp.cpp
   fusedQKNormRopeOp.cpp
+  fusedAddRMSNormQuant.cpp
   fusedTopkSoftmax.cpp
   gatherTreeOp.cpp
   groupRmsNormOp.cpp
cpp/tensorrt_llm/thop/fusedAddRMSNormQuant.cpp

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h"
+#include "tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h"
+#include "tensorrt_llm/kernels/quantization.h"
+#include "tensorrt_llm/thop/thUtils.h"
+
+#include <ATen/Functions.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/EmptyTensor.h>
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cstdint>
+#include <optional>
+#include <tuple>
+#include <unordered_map>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace torch_ext
+{
+
+// Fused Add + RMSNorm + FP4 Quantization kernel
+// input: [M, N] - input tensor (fp16/bf16)
+// residual: [M, N] - residual tensor (fp16/bf16)
+// gamma: [N] - RMSNorm weight (fp16/bf16)
+// sf_scale: [1] - optional scale factor for FP4 quantization (float)
+// use_rms_norm: bool - if true use RMSNorm, else use LayerNorm
+// Returns:
+//   normed_output: [M, N/8] - FP4 quantized normalized output (uint32_t, packed)
+//   output: [M, N] - pre-norm output (input + residual), same dtype as input
+//   sf_out: scale factors for FP4 (uint8_t), swizzled layout
+//
+// NOTE: This kernel requires SM90 (Hopper) or SM100 (Blackwell) GPU architecture.
+// NOTE: Hidden dimension N must be >= 2048 and <= 16384.
+std::tuple<at::Tensor, at::Tensor, at::Tensor> fused_add_rms_norm_quant(at::Tensor const& input,
+    at::Tensor const& residual, at::Tensor const& gamma, std::optional<at::Tensor> const& sf_scale, bool use_rms_norm,
+    double eps)
+{
+    CHECK_TH_CUDA(input);
+    CHECK_CONTIGUOUS(input);
+    CHECK_TH_CUDA(residual);
+    CHECK_CONTIGUOUS(residual);
+    CHECK_TH_CUDA(gamma);
+    CHECK_CONTIGUOUS(gamma);
+
+    // Check GPU architecture - kernel requires SM90+ (Hopper/Blackwell)
+    auto const device = input.get_device();
+    cudaDeviceProp props;
+    AT_CUDA_CHECK(cudaGetDeviceProperties(&props, device));
+    TORCH_CHECK(props.major >= 9,
+        "fused_add_rms_norm_quant requires SM90 (Hopper) or newer GPU architecture. "
+        "Current device: sm_",
+        props.major, props.minor);
+
+    auto const& inputShape = input.sizes();
+    auto const& rank = inputShape.size();
+
+    TORCH_CHECK(rank == 2, "input should be 2D tensor [M, N].");
+    TORCH_CHECK(residual.sizes() == inputShape, "residual shape must match input shape.");
+
+    int64_t const m = inputShape[0];
+    int64_t const n = inputShape[1];
+    // Some warp-specialized kernels may issue vectorized stores that assume M is padded.
+    // Allocate a bit of extra space to avoid out-of-bounds writes when M is not a multiple of 8.
+    int64_t const m_padded = ((m + 15) / 16) * 16;
+
+    TORCH_CHECK(gamma.sizes()[0] == n, "gamma size must match hidden dimension N.");
+    TORCH_CHECK(n >= 2048, "Hidden dimension N must be >= 2048 (kernel constraint).");
+    TORCH_CHECK(n <= 16384, "Hidden dimension N must be <= 16384.");
+    TORCH_CHECK(n % 16 == 0, "Hidden dimension N must be divisible by 16 for FP4 quantization.");
+
+    // Validate sf_scale if provided
+    float* sfScalePtr = nullptr;
+    if (sf_scale.has_value())
+    {
+        CHECK_INPUT(sf_scale.value(), torch::kFloat32);
+        sfScalePtr = sf_scale.value().data_ptr<float>();
+    }
+
+    // Allocate output tensors
+    // normed_output: FP4 packed output [M, N/8] as uint32_t (8 FP4 values packed per uint32)
+    // NOTE: allocate [M_padded, ...] to avoid OOB writes; return a view of [M, ...] to keep API stable.
+    at::Tensor normed_output_padded
+        = at::detail::empty_cuda({m_padded, n / 8}, torch::kInt32, input.device(), std::nullopt);
+    at::Tensor normed_output = (m_padded == m) ? normed_output_padded : normed_output_padded.narrow(0, 0, m);
+
+    // output: pre-norm output (input + residual) [M, N], same dtype as input
+    // NOTE: allocate [M_padded, ...] to avoid OOB writes; return a view of [M, ...] to keep API stable.
+    at::Tensor output_padded = at::detail::empty_cuda({m_padded, n}, input.scalar_type(), input.device(), std::nullopt);
+    at::Tensor output = (m_padded == m) ? output_padded : output_padded.narrow(0, 0, m);
+
+    // sf_out: scale factors for FP4, swizzled layout
+    // sfVecSize = 16 for FP4 quantization (16 FP4 values share one scale factor)
+    int64_t const sfVecSize = 16;
+    int64_t const sfSize = tensorrt_llm::computeSwizzledLayoutSFSize(m, n / sfVecSize);
+    at::Tensor sf_out = at::detail::empty_cuda({sfSize}, SF_DTYPE, input.device(), std::nullopt);
+
+    // Get number of SMs for persistent kernel
+    static int const multiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
+
+    // Allocate counters for warp-specialized kernel using PyTorch allocator.
+    //
+    // NOTE: We cache this tensor to avoid per-call allocations. We use `thread_local` so
+    // concurrent calls from different threads don't share the same counters buffer (which
+    // could cause races across different CUDA streams).
+    static thread_local std::unordered_map<int, at::Tensor> counters_tensor_cache;
+    auto& counters_tensor = counters_tensor_cache[device];
+    int64_t const counters_bytes = static_cast<int64_t>(sizeof(tensorrt_llm::kernels::WarpSpecializedCounters));
+    if (!counters_tensor.defined() || counters_tensor.numel() != counters_bytes)
+    {
+        counters_tensor = at::detail::empty_cuda({counters_bytes}, torch::kByte, input.device(), std::nullopt);
+        counters_tensor.zero_();
+    }
+    auto* counters
+        = reinterpret_cast<tensorrt_llm::kernels::WarpSpecializedCounters*>(counters_tensor.mutable_data_ptr());
+
+    auto stream = at::cuda::getCurrentCUDAStream(device);
+
+#define LAUNCH_FUSED_ADD_RMS_NORM_QUANT(T)                                                           \
+    do                                                                                               \
+    {                                                                                                \
+        using Param = tensorrt_llm::kernels::GeneralFP4AddBiasResidualPreLayerNormParam<T>;          \
+        tensorrt_llm::kernels::WarpSpecializedParam<Param> param;                                    \
+        param.normed_output = reinterpret_cast<uint32_t*>(normed_output.data_ptr());                 \
+        param.output = reinterpret_cast<T*>(output.data_ptr());                                      \
+        param.input = const_cast<T*>(reinterpret_cast<T const*>(input.data_ptr()));                  \
+        param.sf_scale = sfScalePtr;                                                                 \
+        param.sf_out = reinterpret_cast<uint32_t*>(sf_out.data_ptr());                               \
+        param.residual = reinterpret_cast<T const*>(residual.data_ptr());                            \
+        param.bias = nullptr;                                                                        \
+        param.gamma = reinterpret_cast<T const*>(gamma.data_ptr());                                  \
+        param.beta = nullptr;                                                                        \
+        param.m = static_cast<int>(m);                                                               \
+        param.n = static_cast<int>(n);                                                               \
+        param.layernorm_eps = static_cast<float>(eps);                                               \
+        param.stream = stream;                                                                       \
+        param.counters = counters;                                                                   \
+        tensorrt_llm::kernels::invokeWSLayerNorm<Param>(param, use_rms_norm, multiProcessorCount);   \
+    } while (0)
+
+    if (input.scalar_type() == at::ScalarType::Half)
+    {
+        LAUNCH_FUSED_ADD_RMS_NORM_QUANT(half);
+    }
+    else if (input.scalar_type() == at::ScalarType::BFloat16)
+    {
+#ifdef ENABLE_BF16
+        LAUNCH_FUSED_ADD_RMS_NORM_QUANT(__nv_bfloat16);
+#else
+        C10_THROW_ERROR(NotImplementedError, "BFloat16 must be enabled for fused_add_rms_norm_quant with bf16 input.");
+#endif
+    }
+    else
+    {
+        C10_THROW_ERROR(
+            NotImplementedError, "fused_add_rms_norm_quant only supports input tensor with dtypes fp16/bf16.");
+    }
+
+#undef LAUNCH_FUSED_ADD_RMS_NORM_QUANT
+
+    // No explicit sync needed - kernel runs asynchronously on the stream
+    return std::make_tuple(normed_output, output, sf_out);
+}
+
+} // namespace torch_ext
+
+TRTLLM_NAMESPACE_END
+
+TORCH_LIBRARY_FRAGMENT(trtllm, m)
+{
+    m.def(
+        "fused_add_rms_norm_quant(Tensor input, Tensor residual, Tensor gamma, "
+        "Tensor? sf_scale, bool use_rms_norm=True, float eps=1e-5) -> (Tensor, Tensor, Tensor)");
+}
+
+TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
+{
+    m.impl("fused_add_rms_norm_quant", &tensorrt_llm::torch_ext::fused_add_rms_norm_quant);
+}
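In terms of the math, the op adds the residual, normalizes the sum with gamma and eps, and FP4-quantizes the normalized values (8 packed per int32 word, one uint8 scale factor per 16 values in a swizzled layout). A hedged Python reference of the add + normalize part only, e.g. for debugging; the exact FP4 rounding, scale-factor handling, and swizzle are kernel-specific and not reproduced here:

import torch

def reference_add_norm(input, residual, gamma, eps=1e-5, use_rms_norm=True):
    # Pre-norm sum is the second tensor returned by fused_add_rms_norm_quant.
    output = input + residual
    x = output.float()
    mean_sq = x.pow(2).mean(-1, keepdim=True)
    if use_rms_norm:
        normed = x * torch.rsqrt(mean_sq + eps)
    else:
        mean = x.mean(-1, keepdim=True)
        normed = (x - mean) * torch.rsqrt(mean_sq - mean * mean + eps)
    # Assumed gamma scaling before quantization; verify against the kernel when comparing.
    return output, (normed * gamma.float()).to(input.dtype)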

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 53 additions & 0 deletions
@@ -1869,3 +1869,56 @@ def record_stream(tensor: torch.Tensor, stream_id: int) -> None:
     stream = get_stream(stream_id)
     assert stream is not None
     tensor.record_stream(stream)
+
+
+def fused_add_rms_norm_quant(
+    input: torch.Tensor,
+    residual: torch.Tensor,
+    gamma: torch.Tensor,
+    sf_scale: Optional[torch.Tensor],
+    use_rms_norm: bool = True,
+    eps: float = 1e-5,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fused Add + RMSNorm/LayerNorm + FP4 Quantization kernel.
+
+    Args:
+        input: [M, N] input tensor (fp16/bf16)
+        residual: [M, N] residual tensor (fp16/bf16)
+        gamma: [N] normalization weight (fp16/bf16)
+        sf_scale: [1] optional scale factor for FP4 quantization (float32)
+        use_rms_norm: if True use RMSNorm, else use LayerNorm
+        eps: epsilon for normalization
+
+    Returns:
+        normed_output_fp4: [M, N/8] FP4 quantized normalized output (int32, packed)
+        output: [M, N] pre-norm output (input + residual), same dtype as input
+        sf_out: scale factors for FP4 quantization (uint8), swizzled layout
+
+    Note:
+        This kernel requires SM90 (Hopper) or SM100 (Blackwell) GPU.
+        Hidden dimension N must be >= 2048 and <= 16384.
+    """
+    return torch.ops.trtllm.fused_add_rms_norm_quant(input, residual, gamma,
+                                                     sf_scale, use_rms_norm,
+                                                     eps)
+
+
+@torch.library.register_fake("trtllm::fused_add_rms_norm_quant")
+def _fused_add_rms_norm_quant_fake(
+    input: torch.Tensor,
+    residual: torch.Tensor,
+    gamma: torch.Tensor,
+    sf_scale: Optional[torch.Tensor],
+    use_rms_norm: bool = True,
+    eps: float = 1e-5,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    m, n = input.shape
+    # normed_output_fp4: [M, N/8] as int32 (8 FP4 values packed per int32)
+    normed_output_fp4 = input.new_empty((m, n // 8), dtype=torch.int32)
+    # output: [M, N] pre-norm output, same dtype as input
+    output = input.new_empty((m, n), dtype=input.dtype)
+    # sf_out: scale factors, swizzled layout
+    sf_vec_size = 16
+    sf_size = ((m + 127) // 128) * 128 * ((n // sf_vec_size + 3) // 4) * 4
+    sf_out = input.new_empty((sf_size, ), dtype=torch.uint8)
+    return normed_output_fp4, output, sf_out
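A minimal usage sketch of the new wrapper. It assumes a TensorRT-LLM build that registers trtllm::fused_add_rms_norm_quant, an SM90+ GPU, and the module path of the file above; shapes respect the N >= 2048 constraint:

import torch
from tensorrt_llm._torch.custom_ops.torch_custom_ops import fused_add_rms_norm_quant

m, n = 8, 4096
x = torch.randn(m, n, dtype=torch.bfloat16, device="cuda")
res = torch.randn_like(x)
gamma = torch.ones(n, dtype=torch.bfloat16, device="cuda")
sf_scale = torch.ones(1, dtype=torch.float32, device="cuda")

normed_fp4, prenorm, sf_out = fused_add_rms_norm_quant(x, res, gamma, sf_scale,
                                                       use_rms_norm=True, eps=1e-6)

assert normed_fp4.shape == (m, n // 8) and normed_fp4.dtype == torch.int32
assert prenorm.shape == (m, n) and prenorm.dtype == x.dtype
# sf_out follows the swizzled layout: rows padded to 128, n/16 scales padded to a multiple of 4.
print(sf_out.numel())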
