
Commit 4eb5d61

Merge commit '68a24ff70cf59001e9fd216374620cc1a6071c5a'
2 parents: 50fc4c3 + 68a24ff

34 files changed (+166, -147 lines)

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 69 additions & 54 deletions

@@ -143,8 +143,10 @@ SmallVector<unsigned> getMatrixOrder(unsigned rank, bool rowMajor) {
   // Return the order that represents that the batch is in row-major or
   // column-major order for a batch of matrices of shape [*, m, n] with
   // len(shape) == rank.
-  assert(rank >= 2);
   SmallVector<unsigned> order(rank);
+  if (rank < 2) {
+    return order;
+  }
   std::iota(order.rbegin(), order.rend(), 0);
   if (!rowMajor) {
     std::swap(order[0], order[1]);
@@ -397,6 +399,21 @@ BlockedEncodingAttr::verify(function_ref<InFlightDiagnostic()> emitError,
     return emitError() << "sizePerThread, threadsPerWarp, warpsPerCTA, and "
                           "order must all have the same rank.";
   }
+  if (llvm::any_of(sizePerThread,
+                   [](unsigned x) { return !llvm::isPowerOf2_64(x); })) {
+    return emitError()
+           << "Every element in sizePerThread must be a power of two.";
+  }
+  if (llvm::any_of(threadsPerWarp,
+                   [](unsigned x) { return !llvm::isPowerOf2_64(x); })) {
+    return emitError()
+           << "Every element in threadsPerWarp must be a power of two.";
+  }
+  if (llvm::any_of(warpsPerCTA,
+                   [](unsigned x) { return !llvm::isPowerOf2_64(x); })) {
+    return emitError()
+           << "Every element in warpsPerCTA must be a power of two.";
+  }

   // Empty CTALayout is allowed, but if it's present its rank must match the
   // BlockedEncodingAttr's rank.
@@ -1996,6 +2013,8 @@ NvidiaMmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> shape, int bitwidth,
 SmallVector<unsigned> DotOperandEncodingAttr::getRepOrder() const {
   if (auto mma = mlir::dyn_cast<MmaEncodingTrait>(getParent())) {
     return mma.getRepOrderForOperand(getOpIdx());
+  } else if (auto blocked = mlir::dyn_cast<BlockedEncodingAttr>(getParent())) {
+    return to_vector(blocked.getOrder());
   }
   llvm::report_fatal_error(
       "getRepOrder not implemented for DotOperandEncodingAttr");
@@ -2696,60 +2715,56 @@ struct TritonGPUVerifyTensorLayoutInterface
   LogicalResult verifyTensorLayout(
       Attribute layout, RankedTensorType rankedTy, Operation *op,
       function_ref<InFlightDiagnostic()> makeErr) const override {
-    if (isa<triton::gpu::SharedEncodingTrait>(layout))
-      return makeErr() << "Shared layout is not allowed on tensor type.";
-    // TODO(jlebar): Currently this only checks blocked layouts, but other
-    // layouts also have invariants!
-
-    // TODO(jlebar): Handle the case when the encoding is nested within tt.ptr.
-    if (auto blocked = dyn_cast<BlockedEncodingAttr>(layout)) {
-      ModuleOp module = op->getParentOfType<ModuleOp>();
-
-      // A different verifier should have checked that the layout itself is
-      // valid, including that threads-per-warp has the same rank as
-      // warps-per-block etc.
-      if (blocked.getRank() != rankedTy.getRank()) {
-        return makeErr() << layout << ".\nLayout has rank " << blocked.getRank()
-                         << ", but the tensor it's attached to has rank "
-                         << rankedTy.getRank() << ".";
-      }
-
-      int moduleThreadsPerWarp = TritonGPUDialect::getThreadsPerWarp(module);
-      int64_t layoutThreadsPerWarp = product(blocked.getThreadsPerWarp());
-      if (layoutThreadsPerWarp != moduleThreadsPerWarp) {
-        return makeErr() << layout << ".\nLayout has a total of "
-                         << layoutThreadsPerWarp
-                         << " threads per warp, but the module specifies "
-                         << moduleThreadsPerWarp << " threads per warp.";
-      }
-
-      std::optional<int> moduleWarpsPerCTA = maybeLookupNumWarps(op);
-      if (!moduleWarpsPerCTA) {
-        return makeErr()
-               << "Could not determine the number of warps per CTA. Operation "
-                  "is not in a context with `ttg.num-warps`.";
-      }
-      int64_t layoutWarpsPerCTA = product(blocked.getWarpsPerCTA());
-      if (layoutWarpsPerCTA != *moduleWarpsPerCTA) {
-        return makeErr() << layout << ".\nLayout has a total of "
-                         << layoutWarpsPerCTA
-                         << " warps per CTA, but the context requires "
-                         << *moduleWarpsPerCTA << " warps per CTA.";
-      }
-
-      if (blocked.getCTALayout().getCTAsPerCGA().size() > 0) {
-        int moduleCTAsPerCGA = TritonGPUDialect::getNumCTAs(module);
-        int64_t layoutCTAsPerCGA =
-            product(blocked.getCTALayout().getCTAsPerCGA());
-        if (layoutCTAsPerCGA != moduleCTAsPerCGA) {
-          return makeErr() << layout << ".\nLayout has a total of "
-                           << layoutCTAsPerCGA
-                           << " CTAs per CGA, but the module specifies "
-                           << moduleCTAsPerCGA << " CTAs per CGA.";
-        }
-      }
+    auto distr = dyn_cast<triton::gpu::DistributedEncodingTrait>(layout);
+    if (!distr)
+      return makeErr()
+             << "Non-distributed layout is not allowed in tensor type.";
+    auto rank = distr.getRepOrder().size();
+    if (rank != rankedTy.getRank())
+      return makeErr() << "Layout has rank " << rank
+                       << ", but the tensor it's attached to has rank "
+                       << rankedTy.getRank() << ".";
+    if (llvm::any_of(rankedTy.getShape(),
+                     [](int64_t i) { return !llvm::isPowerOf2_64(i); })) {
+      return makeErr() << "Layout has shape " << rankedTy.getShape()
+                       << ", but the tensor it's attached to has shape "
+                       << rankedTy.getShape()
+                       << " which is not a power of two.";
+    }
+    auto ll = toLinearLayout(rankedTy.getShape(), layout);
+    ModuleOp module = op->getParentOfType<ModuleOp>();

+    // Number of threads per warp.
+    auto kLane = StringAttr::get(module.getContext(), "lane");
+    int moduleThreadsPerWarp = TritonGPUDialect::getThreadsPerWarp(module);
+    if (ll.getInDimSize(kLane) != moduleThreadsPerWarp) {
+      return makeErr() << layout << ".\nLayout has " << ll.getInDimSize(kLane)
+                       << " threads per warp, but the module specifies "
+                       << moduleThreadsPerWarp << " threads per warp.";
+    }

+    // Number of warps per CTA.
+    std::optional<int> moduleWarpsPerCTA = maybeLookupNumWarps(op);
+    if (!moduleWarpsPerCTA) {
+      return makeErr()
+             << "Could not determine the number of warps per CTA. Operation "
+                "is not in a context with `ttg.num-warps`.";
+    }
+    auto kWarp = StringAttr::get(module.getContext(), "warp");
+    if (ll.getInDimSize(kWarp) != *moduleWarpsPerCTA) {
+      return makeErr() << layout << ".\nLayout has " << ll.getInDimSize(kWarp)
+                       << " warps per CTA, but the context requires "
+                       << *moduleWarpsPerCTA << " warps per CTA.";
+    }

+    // Number of CTAs per CGA.
+    auto kBlock = StringAttr::get(module.getContext(), "block");
+    int moduleCTAsPerCGA = TritonGPUDialect::getNumCTAs(module);
+    if (ll.getInDimSize(kBlock) != moduleCTAsPerCGA) {
+      return makeErr() << layout << ".\nLayout has " << ll.getInDimSize(kBlock)
+                       << " CTAs per CGA, but the context requires "
+                       << moduleCTAsPerCGA << " CTAs per CGA.";
     }
-
     return success();
   }
 };
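
Note on the new checks above: llvm::isPowerOf2_64 accepts exactly the positive integers with a single set bit, and the blocked-encoding verifier now applies it element-wise to sizePerThread, threadsPerWarp, and warpsPerCTA. A minimal Python sketch of the same predicate, applied the way the verifier applies it (the sample values below are illustrative, not taken from a particular layout):

    def is_power_of_two(x: int) -> bool:
        # Mirrors llvm::isPowerOf2_64: true only for a positive value with one set bit.
        return x > 0 and (x & (x - 1)) == 0

    # The verifier rejects the attribute if any element of these vectors fails the check.
    size_per_thread, threads_per_warp, warps_per_cta = [4, 1], [2, 16], [4, 1]
    for name, dims in [("sizePerThread", size_per_thread),
                       ("threadsPerWarp", threads_per_warp),
                       ("warpsPerCTA", warps_per_cta)]:
        assert all(is_power_of_two(d) for d in dims), f"{name} must be powers of two"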

python/test/gluon/test_frontend.py

Lines changed: 2 additions & 2 deletions

@@ -356,14 +356,14 @@ def test_warp_specialize():
     c = ttgl.arange(0, 4, layout=layout)
     pair = Pair(a, b)
     e: ttgl.constexpr = 42
-    a, b = ttgl.warp_specialize((pair, c, e), warp_specialize_default,
+    a, b = ttgl.warp_specialize((pair, c, e), warp_specialize_default, (pair, c, e),
                                 [warp_specialize_worker0, warp_specialize_worker1], [4, 4], [24, 48])
     anchor(a)
     anchor(b)

     # CHECK: ttg.warp_specialize([[A]], [[B]], [[C]])
     # CHECK: (tensor<1xi32, [[BLOCKED]]>, tensor<2xi32, [[BLOCKED]]>, tensor<4xi32, [[BLOCKED]]>) -> ()
-    ttgl.warp_specialize((pair, c, e), warp_specialize_worker0, [warp_specialize_worker1], [4], [48])
+    ttgl.warp_specialize((pair, c, e), warp_specialize_worker0, (pair, c, e), [warp_specialize_worker1], [4], [48])


 @gluon.jit

python/test/unit/language/test_core.py

Lines changed: 2 additions & 0 deletions

@@ -3114,6 +3114,8 @@ def test_reduce_layouts(M, N, src_layout, axis, epilogue_kind, dtype_str, add_ov
         pytest.skip("Skipping because tensor shape is smaller than M(f)maLayout instr_shape")
     if reduce_op == "sum" and dtype_str == "float16" and M * N > 1024:
         pytest.xfail("Skipping sum reduction on float16 due to accuracy issues")
+    if isinstance(src_layout, LinearLayout) and THREADS_PER_WARP != (1 << len(src_layout.lane)):
+        pytest.xfail(f"Skipping. This LinearLayout assumes {1 << len(src_layout.lane)} threads per warp")

     if isinstance(src_layout, MmaLayout) and src_layout.version == 3:
         src_layout.instr_shape[2] = 16 if dtype_str == "float16" else 8
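
Note on the new xfail condition: a LinearLayout stores one basis vector per bit of its "lane" input dimension, so the warp size the layout was built for is 2 raised to the number of lane bases, which is what 1 << len(src_layout.lane) computes. A small sketch with made-up bases (not taken from the test suite):

    # Hypothetical lane bases: five basis vectors address 2**5 = 32 lanes.
    lane_bases = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]]
    assumed_threads_per_warp = 1 << len(lane_bases)
    assert assumed_threads_per_warp == 32

    THREADS_PER_WARP = 64  # e.g. a wave64 AMD target
    if THREADS_PER_WARP != assumed_threads_per_warp:
        print(f"xfail: layout assumes {assumed_threads_per_warp} threads per warp")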

python/triton/experimental/gluon/language/_core.py

Lines changed: 2 additions & 2 deletions

@@ -302,11 +302,11 @@ def allocate_shared_memory(element_ty, shape, layout, value=None, _semantic=None


 @builtin
-def warp_specialize(args, default_partition, worker_partitions, worker_num_warps, worker_num_regs, #
+def warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps, worker_num_regs,
                     _semantic=None, _generator=None):
     worker_num_warps = [_unwrap_if_constexpr(w) for w in worker_num_warps]
     worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
-    return _semantic.warp_specialize(args, default_partition, worker_partitions, worker_num_warps, #
+    return _semantic.warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps,
                                      worker_num_regs, _generator)

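
Note on the signature change: warp_specialize now takes the captures for the default partition and for the worker partitions as two separate tuples instead of one shared args tuple. A sketch of the new call shape, mirroring the updated test_frontend.py call above (the partition functions and the pair/c/e values are the test's, so this is meant to be read inside that kernel rather than run on its own):

    # New argument order: default_args, default_partition, worker_args,
    # worker_partitions, worker_num_warps, worker_num_regs.
    a, b = ttgl.warp_specialize(
        (pair, c, e),                  # arguments passed to warp_specialize_default
        warp_specialize_default,
        (pair, c, e),                  # arguments captured by the worker partitions
        [warp_specialize_worker0, warp_specialize_worker1],
        [4, 4],                        # warps requested per worker partition
        [24, 48],                      # registers requested per worker partition
    )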

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 5 additions & 5 deletions

@@ -239,8 +239,8 @@ def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) ->
             self.wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape, ret_layout)
             for i in range(len(inputs)))

-    def warp_specialize(self, args, default_partition, worker_partitions, worker_num_warps: Sequence[int],
-                        worker_num_regs: Sequence[int], generator):
+    def warp_specialize(self, default_args, default_partition, worker_args, worker_partitions,
+                        worker_num_warps: Sequence[int], worker_num_regs: Sequence[int], generator):
         num_partitions = len(worker_partitions)
         assert num_partitions == len(
             worker_num_warps
@@ -255,7 +255,7 @@ def warp_specialize(self, args, default_partition, worker_partitions, worker_num
         # Emit the default partition to get the result types.
         default_block = builder.new_block()
         builder.set_insertion_point_to_start(default_block)
-        default_results = generator.call_JitFunction(default_partition, args, kwargs={})
+        default_results = generator.call_JitFunction(default_partition, default_args, kwargs={})
         mlir_results = []
         if default_results is not None:
             mlir_results = flatten_values_to_ir(default_results)
@@ -264,7 +264,7 @@ def warp_specialize(self, args, default_partition, worker_partitions, worker_num

         # Create the warp specialize op.
         builder.restore_insertion_point(insert_pt)
-        mlir_args = flatten_values_to_ir(args)
+        mlir_args = flatten_values_to_ir(worker_args)
         ws_op = builder.create_warp_specialize(result_types, mlir_args, worker_num_warps)
         ws_op.get_default_region().push_back(default_block)
         ws_op.set_requested_registers(worker_num_regs)
@@ -276,7 +276,7 @@ def warp_specialize(self, args, default_partition, worker_partitions, worker_num
         for i in range(num_partitions):
             block = builder.create_block_with_parent(partitions_op.get_region(i), arg_types)
             block_args = [block.get_argument(j) for j in range(len(mlir_args))]
-            block_args = unflatten_ir_values(block_args, [arg.type for arg in args])
+            block_args = unflatten_ir_values(block_args, [arg.type for arg in worker_args])
             generator.call_JitFunction(worker_partitions[i], block_args, kwargs={})
             builder.create_warp_return()
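
Note on the routing above: the two tuples take different paths through the semantic layer. default_args are handed straight to the default partition's JIT call, while worker_args are flattened into the explicit captures of the warp specialize op and rebuilt as block arguments inside each worker region. A plain-Python caricature of that routing, with no Triton types involved (the flatten/unflatten steps below are stand-ins, not the real helpers):

    def warp_specialize_sketch(default_args, default_partition, worker_args, worker_partitions):
        # The default partition sees its own argument tuple directly.
        results = default_partition(*default_args)
        # Every worker sees the same worker_args tuple, flattened and rebuilt.
        captures = list(worker_args)      # stand-in for flatten_values_to_ir
        for worker in worker_partitions:
            worker(*tuple(captures))      # stand-in for unflatten_ir_values + call_JitFunction
        return results

    # The default partition and the workers receive different tuples.
    print(warp_specialize_sketch((1, 2), lambda x, y: x + y, ("hello",), [print, print]))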

python/tutorials/gluon/01-attention-forward.py

Lines changed: 10 additions & 6 deletions

@@ -589,7 +589,7 @@ def consume_result(self, tile):

 @gluon.jit
 def _attn_fwd_load(config, #
-                   m_is, infos, k_load_ctx, v_load_ctx, #
+                   infos, k_load_ctx, v_load_ctx, #
                    STAGE: gl.constexpr):
     prog = config.get_program()
     lo, hi = prog.get_loop_bounds(STAGE)
@@ -609,7 +609,7 @@ def _attn_fwd_load(config, #

 @gluon.jit
 def _attn_fwd_mma(config, #
-                  m_is, infos, k_load_ctx, v_load_ctx, #
+                  infos, k_load_ctx, v_load_ctx, #
                   STAGE: gl.constexpr):
     prog = config.get_program()
     lo, hi = prog.get_loop_bounds(STAGE)
@@ -684,7 +684,7 @@ def _attn_fwd_correction_compute(config, mi_consumer, o_consumer, m_i):

 @gluon.jit
 def _attn_fwd_correction(config, #
-                         m_is, infos, k_load_ctx, v_load_ctx, #
+                         m_is, infos, #
                          STAGE: gl.constexpr):
     prog = config.get_program()
     lo, hi = prog.get_loop_bounds(STAGE)
@@ -757,14 +757,14 @@ def _softmax_tile(tile_id: gl.constexpr, config, info, STAGE: gl.constexpr):

 @gluon.jit
 def _attn_fwd_softmax0(config, #
-                       m_is, infos, k_load_ctx, v_load_ctx, #
+                       infos, k_load_ctx, v_load_ctx, #
                        STAGE: gl.constexpr):
     _softmax_tile(0, config, infos[0], STAGE)


 @gluon.jit
 def _attn_fwd_softmax1(config, #
-                       m_is, infos, k_load_ctx, v_load_ctx, #
+                       infos, k_load_ctx, v_load_ctx, #
                        STAGE: gl.constexpr):
     _softmax_tile(1, config, infos[1], STAGE)

@@ -781,10 +781,14 @@ def _attn_fwd_inner(config, info0, info1, m_i0, m_i1, #
         config,
         (m_i0, m_i1),
         (info0, info1),
+        STAGE,
+    ), _attn_fwd_correction, (
+        config,
+        (info0, info1),
         k_load_ctx,
         v_load_ctx,
         STAGE,
-    ), _attn_fwd_correction, [
+    ), [
         _attn_fwd_softmax0,
         _attn_fwd_softmax1,
         _attn_fwd_mma,

test/Analysis/test-alias.mlir

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 #A_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
 #A_SHARED_T = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [0, 1]}>
 #B_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
-#C = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}>
+#C = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 #A_DOT = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
 #B_DOT = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>


test/Conversion/amd/async-ops-alias-scopes.mlir

Lines changed: 3 additions & 3 deletions

@@ -59,7 +59,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
 #shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
 #smem = #ttg.shared_memory
-#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [32, 32], isTransposed = true}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
   // COMMON-LABEL: @local_loads_with_token_from_async_wait
   tt.func public @local_loads_with_token_from_async_wait(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
@@ -98,7 +98,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
 #shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
 #smem = #ttg.shared_memory
-#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [32, 32], isTransposed = true}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
   // COMMON-LABEL: @local_loads_without_token_from_async_wait
   tt.func public @local_loads_without_token_from_async_wait(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
@@ -137,7 +137,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
 #shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
 #smem = #ttg.shared_memory
-#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [32, 32], isTransposed = true}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
   // COMMON-LABEL: @local_loads_with_loop_carried_token
   tt.func public @local_loads_with_loop_carried_token(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
tt.func public @local_loads_with_loop_carried_token(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},

test/Conversion/amd/compute-base-ptr.mlir

Lines changed: 3 additions & 3 deletions

@@ -1,10 +1,10 @@
 // RUN: triton-opt %s --convert-triton-amdgpu-to-llvm=arch=gfx942 --mlir-print-debuginfo --mlir-pretty-debuginfo| FileCheck %s

-#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
-#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = false}>
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 16], isTransposed = false}>
 #shared = #ttg.swizzled_shared<{vec = 16, perPhase = 4, maxPhase = 1, order = [1, 0]}>
 #smem = #ttg.shared_memory
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 544 : i32, "ttg.threads-per-warp" = 32 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 544 : i32, "ttg.threads-per-warp" = 64 : i32} {
   // CHECK-LABEL: @local_load_offset
   tt.func @local_load_offset(%arg0: tensor<16x16xf16, #mma>) {
     %0 = ttg.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked> loc(#loc1)
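
Note on these .mlir updates: they line the test encodings up with what the stricter verifier above computes from the linear layout, namely that the "lane" dimension must match the module's "ttg.threads-per-warp" and the "warp" dimension must match the enclosing "ttg.num-warps". The arithmetic behind the edits, spelled out:

    from math import prod

    # compute-base-ptr.mlir: the module now declares "ttg.threads-per-warp" = 64
    # and "ttg.num-warps" = 4.
    assert prod([4, 16]) == 64   # new blocked threadsPerWarp covers all 64 lanes
    assert prod([2, 16]) == 32   # the old value only covered 32 lanes
    assert prod([1, 4]) == 4     # new mfma warpsPerCTA matches the 4 warps per CTA

    # async-ops-alias-scopes.mlir: those modules declare "ttg.num-warps" = 1.
    assert prod([1, 1]) == 1     # new mfma warpsPerCTA
    assert prod([1, 4]) == 4     # the old value implied 4 warps and would now be rejected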

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 1 addition & 1 deletion

@@ -2167,7 +2167,7 @@ tt.func @gather_in_shared(%arg0: tensor<16x4xi32, #blocked1>, %arg1: tensor<8x4x

 // -----

-#mma = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [1, 1]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [1, 1]}>
 #dot = #ttg.dot_op<{opIdx=0, parent=#mma, kWidth=1}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
