
Commit cce8875

Authored by protonu, naoyam, and greptile-apps[bot]
Extend the block quantization op to have optional swizzled block scales output (#5554)
1. Extends the runtime function.
   - Extended the block_quantize_to_nvfp4 function with optional swizzling parameters.
   - Added support for 5D allocation domain swizzling with a specific pattern: [m/128, k/4, 32(m_i), 4(m_o), 4(k)].
   - Implemented swizzled address calculation logic for an optimal memory layout, per the Blackwell documentation.
2. Relaxes checks in validation to allow for an allocation domain in the block scales output.
3. Extends codegen to pass the allocation domain extents to the runtime function.

Co-authored-by: Naoya Maruyama <[email protected]>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
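The swizzle in (1) is easiest to see with a worked example. The following is a minimal standalone sketch (not part of this commit; the helper name `swizzledOffset` is made up) that maps a logical (row, col) coordinate of the block-scale matrix to its linear offset in the 5D [m/128, k/4, 32(m_i), 4(m_o), 4(k)] allocation, mirroring the address calculation added to the runtime function:

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the swizzle in this commit:
// 2D logical [m, k] -> 5D allocation [m/128, k/4, 32(m_i), 4(m_o), 4(k)].
// Assumes m is a multiple of 128 and k is a multiple of 4.
int64_t swizzledOffset(int64_t row, int64_t col, int64_t k) {
  // Decompose the logical coordinate according to the three splits.
  const int64_t k_outer = col / 4; // k/4
  const int64_t k_inner = col % 4; // 4(k)
  const int64_t m_tile = row / 128; // m/128
  const int64_t m_rest = row % 128; // 128 -> 4(m_o) x 32(m_i)
  const int64_t m_outer = m_rest / 32; // 4(m_o)
  const int64_t m_inner = m_rest % 32; // 32(m_i)

  // Row-major strides of the reordered 5D allocation [m/128, k/4, 32, 4, 4].
  const int64_t stride_k_inner = 1;
  const int64_t stride_m_outer = 4;
  const int64_t stride_m_inner = 16;
  const int64_t stride_k_outer = 512;
  const int64_t stride_m_tile = 512 * (k / 4);

  return m_tile * stride_m_tile + k_outer * stride_k_outer +
      m_inner * stride_m_inner + m_outer * stride_m_outer +
      k_inner * stride_k_inner;
}

int main() {
  // Element (row=37, col=5) of a 256x8 block-scale matrix:
  // m_tile=0, m_outer=1, m_inner=5, k_outer=1, k_inner=1.
  // Expected offset: 0*1024 + 1*512 + 5*16 + 1*4 + 1*1 = 597.
  std::printf("%lld\n", static_cast<long long>(swizzledOffset(37, 5, 8)));
  return 0;
}
```

Note how each 128-row by 4-scale-column tile occupies 512 contiguous elements, matching the 128x4 scale-factor atom described in the Blackwell/CUTLASS layout documentation linked below.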
Parent commit: 722fc4f

File tree: 4 files changed, +399 −74 lines


csrc/codegen.cpp

Lines changed: 57 additions & 44 deletions
```diff
@@ -1807,70 +1807,83 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
   }
 
   // Special handling of BlockQuantizationOp to call the runtime function.
-  // TODO: add support for global scaling factor
   void handle(const BlockQuantizationOp* bqop) final {
     // This operator is plumbed down to a runtime function call.
     // One of the assumptions is that the device runtime expects
     // n consecutive inputs per thread, where n can be 2 or 4 for Float, and
     // 2, 4, or 8 for Half. We achieve this by having the quantized output tv
     // scheduled to have the inner dimension grouped by 2/4/8.
     auto output = bqop->quantizedOutput()->as<kir::TensorIndex>()->view();
-    int64_t group_size = 1;
 
-    // Get the loop domain of the TensorView output and check for group
-    // parallel types. This assumes that both parallel types aren't present.
+    // Extract group size from the loop domain
+    int64_t group_size = 1;
     const auto& loop_domain = output->getLoopDomain();
-    for (auto* domain : loop_domain) {
-      auto parallel_type = domain->getParallelType();
-      if (parallel_type == ParallelType::Group) {
-        if (domain->extent()->isConstInt()) {
-          group_size = domain->extent()->evaluate().as<int64_t>();
-        }
+    for (const auto* domain : loop_domain) {
+      if (domain->getParallelType() == ParallelType::Group &&
+          domain->extent()->isConstInt()) {
+        group_size = domain->extent()->evaluate().as<int64_t>();
+        break;
       }
     }
 
-    auto input_dtype =
-        bqop->in()->as<kir::TensorIndex>()->view()->getDataType();
-
-    if (input_dtype == DataType::BFloat16 || input_dtype == DataType::Half) {
-      NVF_ERROR(
-          group_size == 8 || group_size == 4 || group_size == 2,
-          "Group size should be 2, 4 or 8 for "
-          "BlockQuantizationOp: ",
-          bqop->toString());
-
-    } else {
-      NVF_ERROR(
-          group_size == 4 || group_size == 2,
-          "Group size should be 2 or 4 for "
-          "BlockQuantizationOp: ",
-          bqop->toString());
-    }
+    // Validate group size based on input data type
+    const auto input_dtype =
+        bqop->in()->as<kir::TensorIndex>()->view()->getDataType().value();
+    const bool is_half_precision =
+        (input_dtype == DataType::BFloat16 || input_dtype == DataType::Half);
+    const bool is_valid_group_size = is_half_precision
+        ? (group_size == 2 || group_size == 4 || group_size == 8)
+        : (group_size == 2 || group_size == 4);
 
+    NVF_ERROR(
+        is_valid_group_size,
+        "Group size should be ",
+        is_half_precision ? "2, 4 or 8" : "2 or 4",
+        " for BlockQuantizationOp with input type ",
+        input_dtype,
+        ". Found: ",
+        group_size,
+        ". Expr: ",
+        bqop->toString());
+
+    // Build template arguments
     ArgumentBuilder template_args;
-    template_args.arg(
-        bqop->hasGlobalScale() ? true : false); // HAS_GLOBAL_SCALE
+    template_args.arg(bqop->hasGlobalScale()); // HAS_GLOBAL_SCALE
     template_args.arg(group_size); // ITEMS_PER_THREAD
 
-    // Function arguments
+    // Build function arguments
     ArgumentBuilder func_args;
-
-    // First argument: input data array
-    // Second argument: quantized output
-    // Third argument: block scale output
-    func_args.arg(genInline(bqop->input(0)->as<kir::TensorIndex>()->view()));
-    func_args.arg(genInline(output));
-    func_args.arg(
-        genInline(bqop->blockScales()->as<kir::TensorIndex>()->view()));
-
-    // Fourth argument: This holds the linearized index that will be used to
-    // write out the block scaling factors in the runtime function.
-    func_args.arg(genInline(bqop->attributeVal(0)));
-
-    // Fifth argument: global scale (if any)
+    func_args.arg(genInline(
+        bqop->input(0)->as<kir::TensorIndex>()->view())); // input data
+    func_args.arg(genInline(output)); // quantized output
+    func_args.arg(genInline(
+        bqop->blockScales()->as<kir::TensorIndex>()->view())); // block scales
+    func_args.arg(genInline(
+        bqop->attributeVal(0))); // linearized index for runtime function
     func_args.arg(
         bqop->hasGlobalScale() ? genInline(bqop->globalScale()) : "{}");
 
+    // Add swizzled allocation domain parameters if needed
+    auto block_scales_tv = bqop->blockScales()->as<kir::TensorIndex>()->view();
+    if (block_scales_tv->hasAllocation()) {
+      auto logical_domain =
+          TensorDomain::noReductions(block_scales_tv->getLogicalDomain());
+      auto allocation_domain =
+          TensorDomain::noReductions(block_scales_tv->getAllocationDomain());
+
+      // Swizzled layout: 2D logical -> 5D allocation
+      if (logical_domain.size() == 2 && allocation_domain.size() == 5) {
+        // Add logical domain extent of the inner dimension
+        func_args.arg(genInline(logical_domain[1]->extent()));
+
+        // Add all allocation domain extents
+        for (const auto* alloc_id : allocation_domain) {
+          func_args.arg(genInline(alloc_id->extent()));
+        }
+      }
+    }
+
+    // Generate the function call
     indent() << genCall("bq::block_quantize_to_nvfp4", template_args, func_args)
              << ";\n";
   }
```
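For orientation, the emitted call takes roughly the following shape when the block-scales output carries the 5D allocation domain (a hand-written sketch, not captured codegen output; the tensor names and extent expressions are hypothetical):

```cpp
// HAS_GLOBAL_SCALE = false, ITEMS_PER_THREAD = 4. The last six arguments
// (the logical inner extent followed by the five allocation-domain extents)
// are the ones added by this commit; they are omitted when the block-scales
// tensor has no allocation domain.
bq::block_quantize_to_nvfp4<false, 4>(
    T0_data, T1, T2, i_linear, {}, k, m / 128LL, k / 4LL, 32LL, 4LL, 4LL);
```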

csrc/device_lower/validation.cpp

Lines changed: 147 additions & 6 deletions
```diff
@@ -274,6 +274,147 @@ bool isInnermost(IterDomain* base_id, IterDomain* maybe_innermost_id) {
   return !frontier.empty() && frontier.back() == maybe_innermost_id;
 }
 
+// Validate the swizzling pattern:
+// We support a very restricted pattern from 2D logical to 5D allocation.
+// Expected pattern:
+//   m, k -> m, k/4, 4 (split k by 4)
+//   m, k/4, 4 -> m/128, 128, k/4, 4 (split m by 128)
+//   m/128, 128, k/4, 4 -> m/128, 4(m_o), 32(m_i), k/4, 4 (split 128 by 32)
+// Then reorder to: m/128, k/4, 32(m_i), 4(m_o), 4(k)
+void isValidBlockScaleSwizzle(TensorView* block_scale) {
+  auto logical_domain =
+      TensorDomain::noReductions(block_scale->getLogicalDomain());
+  auto allocation_domain =
+      TensorDomain::noReductions(block_scale->getAllocationDomain());
+
+  // Check that the logical domain is 2D and the allocation domain is 5D
+  NVF_ERROR(
+      logical_domain.size() == 2 && allocation_domain.size() == 5,
+      "Block scale swizzle must have 2D logical domain and 5D allocation "
+      "domain. Found: ",
+      logical_domain.size(),
+      "D logical and ",
+      allocation_domain.size(),
+      "D allocation for TensorView: ",
+      block_scale->toString());
+
+  // Keep count of splits
+  int num_splits = 0;
+
+  // Keeps track of the split M -> M/128, 128
+  Split* middle_split = nullptr;
+
+  // A lambda to check the transforms from logical to allocation domain.
+  // Each transform must be a split, and there can be only 3 splits.
+  auto check_transform = [block_scale,
+                          &logical_domain,
+                          &num_splits,
+                          &middle_split](Expr* expr) {
+    if (auto split_expr = dynamic_cast<Split*>(expr)) {
+      // Can have a max of 3 splits - checked later
+      num_splits++;
+
+      // If the split's input is logical_domain.back(), the inner split
+      // output should have an extent of 4.
+      // Check K -> K/4, 4
+      if (split_expr->in() == logical_domain.back()) {
+        NVF_ERROR(
+            split_expr->inner()->extent()->isConstInt() &&
+                split_expr->inner()->extent()->evaluate().as<int64_t>() == 4,
+            "The innermost split in block scale swizzle must have an extent "
+            "of 4. Found extent: ",
+            split_expr->inner()->extent()->toString(),
+            " in expr: ",
+            expr->toString(),
+            " for TensorView: ",
+            block_scale->toString());
+      } else if (split_expr->in() == logical_domain.front()) {
+        // Check M -> M/128, 128
+        NVF_ERROR(
+            split_expr->inner()->extent()->isConstInt() &&
+                split_expr->inner()->extent()->evaluate().as<int64_t>() ==
+                    128,
+            "The outermost split in block scale swizzle must have an extent "
+            "of 128. Found extent: ",
+            split_expr->inner()->extent()->toString(),
+            " in expr: ",
+            expr->toString(),
+            " for TensorView: ",
+            block_scale->toString());
+
+        // Cache the M -> M/128, 128 split
+        middle_split = split_expr;
+      } else {
+        // Check that the input to this split is the inner output of
+        // middle_split, as we should have 128 -> 4, 32
+        NVF_ERROR(
+            middle_split && split_expr->in() == middle_split->inner(),
+            "The third split in block scale swizzle must split the inner "
+            "output (extent 128) of the second split. Expected input to be "
+            "the inner output of the M/128, 128 split. Found expr: ",
+            split_expr->toString(),
+            " for TensorView: ",
+            block_scale->toString());
+
+        NVF_ERROR(
+            split_expr->inner()->extent()->isConstInt() &&
+                split_expr->inner()->extent()->evaluate().as<int64_t>() == 32,
+            "The third split in block scale swizzle (128 -> 4, 32) must have "
+            "an inner extent of 32. Found extent: ",
+            split_expr->inner()->extent()->toString(),
+            " in expr: ",
+            split_expr->toString(),
+            " for TensorView: ",
+            block_scale->toString());
+      }
+    } else {
+      NVF_THROW(
+          "Logical to allocation domain transforms for block scale swizzle "
+          "can only contain split operations");
+    }
+  };
+
+  // Get all exprs between logical and allocation domain
+  auto transform_exprs = DependencyCheck::getAllExprsBetween(
+      {logical_domain.begin(), logical_domain.end()},
+      {allocation_domain.begin(), allocation_domain.end()});
+
+  std::vector<IterDomain*> ids_to_transform = logical_domain;
+
+  // Transform the logical domain to the allocation domain
+  // without the permutation.
+  scheduler_utils::applyTransforms(
+      ids_to_transform, transform_exprs, check_transform);
+
+  // Check that there are exactly 3 splits
+  NVF_ERROR_EQ(
+      num_splits,
+      3,
+      "Block scale swizzle must have exactly 3 splits. Found ",
+      num_splits,
+      " splits in TensorView: ",
+      block_scale->toString());
+
+  // Get the permutation.
+  auto permutation =
+      ir_utils::computePermutation(ids_to_transform, allocation_domain);
+
+  // m/128, 4(m_o), 32(m_i), k/4, 4(k)
+  //   -> m/128, k/4, 32(m_i), 4(m_o), 4(k)
+  // Check that the permutation has a value and that it is 0, 3, 2, 1, 4
+  NVF_ERROR(
+      permutation.has_value() &&
+          permutation.value() == std::vector<int64_t>({0, 3, 2, 1, 4}),
+      "Block scale swizzle permutation is invalid for TensorView: ",
+      block_scale->toString());
+}
+
 // Expr-specific validation
 //
 // TODO: Move individual validations to here, e.g.,
@@ -515,15 +656,15 @@ class ExprValidator : public OptOutDispatch {
         !quantized_output->hasAllocation(),
         "Quantized output must not have an allocation domain.");
 
-    // TODO: Relax these for swizzled block scaling factor outputs
-    // When scaling will be swizzled we will need to allow these checks
-    // to be relaxed, but we will need to ensure that the swizzling
+    // When the output scales are swizzled, these checks are relaxed,
+    // but we need to ensure that the swizzling
     // allocation allowed is a fixed pattern:
     // 2D logical and 5D allocation domain.
     // https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#scale-factor-layouts
-    NVF_ERROR(
-        !block_scaling_factor->hasAllocation(),
-        "Block scaling factor must not have an allocation domain.");
+    if (block_scaling_factor->hasAllocation()) {
+      isValidBlockScaleSwizzle(block_scaling_factor);
+    }
+
     NVF_ERROR(
         std::all_of(
             block_scaling_factor->getContiguity().begin(),
```
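For reference, a schedule that satisfies isValidBlockScaleSwizzle would look roughly like the following sketch (hypothetical scheduling code, assuming the usual TensorView split/reorder/setAllocationDomain API and a block-scales output with 2D logical domain [m, k]):

```cpp
// block_scales starts from its 2D logical domain [m, k].
block_scales->split(1, 4); // [m, k/4, 4(k)]
block_scales->split(0, 128); // [m/128, 128, k/4, 4(k)]
block_scales->split(1, 32); // [m/128, 4(m_o), 32(m_i), k/4, 4(k)]
// Swap axes 1 and 3 to reach [m/128, k/4, 32(m_i), 4(m_o), 4(k)]. Relative
// to the split-only order, this is exactly the {0, 3, 2, 1, 4} permutation
// the validator checks for.
block_scales->reorder({{1, 3}, {3, 1}});
block_scales->setAllocationDomain(block_scales->getLoopDomain(), true);
```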

runtime/block_quantization_kernels.cu

Lines changed: 35 additions & 1 deletion
```diff
@@ -52,7 +52,13 @@ __device__ void block_quantize_to_nvfp4(
     Array<__e2m1, ITEMS_PER_THREAD, ALIGNMENT_2>& output,
     Tensor<__e4m3, BLOCK_SCALE_DIM, BLOCK_SCALE_ALLOC>& block_scales,
     nvfuser_index_t logical_index,
-    Tensor<float, 0, 0> global_scale) {
+    Tensor<float, 0, 0> global_scale,
+    int64_t fp8_scaling_factors_inner_dim = -1,
+    int64_t alloc_dim0 = -1,
+    int64_t alloc_dim1 = -1,
+    int64_t alloc_dim2 = -1,
+    int64_t alloc_dim3 = -1,
+    int64_t alloc_dim4 = -1) {
   constexpr bool is_half_or_bfloat =
       std::is_same<T, __bfloat>::value || std::is_same<T, __half>::value;
   constexpr bool is_float = std::is_same<T, float>::value;
@@ -124,6 +130,34 @@
   // Only one block scaling factor is written out per 16 (assumed block size)
   // elements.
   int offset = logical_index / 16;
+
+  if (fp8_scaling_factors_inner_dim > 0) {
+    auto stride_4 = 1;
+    auto stride_3 = stride_4 * alloc_dim4;
+    auto stride_2 = stride_3 * alloc_dim3;
+    auto stride_1 = stride_2 * alloc_dim2;
+    auto stride_0 = stride_1 * alloc_dim1;
+
+    auto logical_inner = offset % fp8_scaling_factors_inner_dim;
+    auto logical_outer = offset / fp8_scaling_factors_inner_dim;
+
+    // The allocation domain swizzle logic is:
+    //   m, k -> m, k/4, 4
+    //   m, k/4, 4 -> m/128, 128, k/4, 4 ->
+    //   m/128, 4(m), 32, k/4, 4(k) ->
+    //   m/128, k/4, 32, 4(m), 4(k)
+
+    auto pos_4 = logical_inner % 4;
+    auto pos_1 = logical_inner / 4;
+    auto pos_t = logical_outer % 128;
+    auto pos_0 = logical_outer / 128;
+    auto pos_3 = pos_t / 32;
+    auto pos_2 = pos_t % 32;
+
+    offset = pos_4 * stride_4 + pos_3 * stride_3 + pos_2 * stride_2 +
+        pos_1 * stride_1 + pos_0 * stride_0;
+  }
+
   if (threadIdx.x % THREADS_PER_SCALING_FACTOR == 0) {
     block_scales[offset] = clamped_max_fp8;
   }
```
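The stride arithmetic above is easy to get subtly wrong, so a useful host-side sanity check (again a sketch, not part of the commit) is to replay the kernel's offset computation for every logical scale offset of a small tensor and assert that the mapping is a bijection, i.e. that every swizzled slot is hit exactly once:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // One 128-row tile of scales with k = 16 scale columns.
  const int64_t m = 128, k = 16;
  const int64_t alloc_dim1 = k / 4, alloc_dim2 = 32, alloc_dim3 = 4,
                alloc_dim4 = 4;

  // Strides, computed exactly as in block_quantize_to_nvfp4.
  const int64_t stride_4 = 1;
  const int64_t stride_3 = stride_4 * alloc_dim4;
  const int64_t stride_2 = stride_3 * alloc_dim3;
  const int64_t stride_1 = stride_2 * alloc_dim2;
  const int64_t stride_0 = stride_1 * alloc_dim1;

  std::vector<int> hits(m * k, 0);
  for (int64_t offset = 0; offset < m * k; ++offset) {
    // Replay the kernel's decomposition of the linear logical offset.
    const int64_t logical_inner = offset % k;
    const int64_t logical_outer = offset / k;
    const int64_t pos_4 = logical_inner % 4;
    const int64_t pos_1 = logical_inner / 4;
    const int64_t pos_t = logical_outer % 128;
    const int64_t pos_0 = logical_outer / 128;
    const int64_t pos_3 = pos_t / 32;
    const int64_t pos_2 = pos_t % 32;
    const int64_t swizzled = pos_4 * stride_4 + pos_3 * stride_3 +
        pos_2 * stride_2 + pos_1 * stride_1 + pos_0 * stride_0;
    assert(swizzled >= 0 && swizzled < m * k);
    ++hits[swizzled];
  }
  for (int h : hits) {
    assert(h == 1); // bijection: each swizzled address is hit exactly once
  }
  return 0;
}
```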
