44
// Layout and memory-encoding aliases shared by the test functions below.
#acc_layout = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
#oper_layout = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
#oper_layout_trans = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
// CHECK-DAG: [[SHARED:#.*]] = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared_trans = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
#nvmma_smem = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8}>
#smem = #ttg.shared_memory
// CHECK-DAG: [[ACC_TMEM:#.*]] = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
#acc_tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
@@ -791,8 +793,8 @@ tt.func @matmul_scaled_rhs_scales_tma(
791793 %k_tiles: i32 ,
792794 %off_m: i32 ,
793795 %off_n: i32 ,
794- %a_desc: !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> >>,
795- %b_desc: !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> >>,
796+ %a_desc: !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >>,
797+ %b_desc: !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >>,
796798 %b_scale_desc: !tt.tensordesc <tensor <128 x8 xi8 , #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [4 , 3 , 2 , 1 , 0 ]}>>>
797799) {
798800 %true = arith.constant true
@@ -814,14 +816,14 @@ tt.func @matmul_scaled_rhs_scales_tma(
814816
815817 // CHECK: ttng.wait_barrier
816818 // CHECK-COUNT-3: async_tma_copy_global_to_local {{.*}} {ttg.partition = 2 : i32}
817- %a_reg = tt.descriptor_load %a_desc [%off_m , %off_k ] : !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> >> -> tensor <128 x64 xf8 E4 M3 FN, #oper_layout >
818- %b_reg = tt.descriptor_load %b_desc [%off_n , %off_k ] : !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> >> -> tensor <128 x64 xf8 E4 M3 FN, #oper_layout >
819+ %a_reg = tt.descriptor_load %a_desc [%off_m , %off_k ] : !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >> -> tensor <128 x64 xf8 E4 M3 FN, #oper_layout >
820+ %b_reg = tt.descriptor_load %b_desc [%off_n , %off_k ] : !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >> -> tensor <128 x64 xf8 E4 M3 FN, #oper_layout >
819821 %b_scales_reg = tt.descriptor_load %b_scale_desc [%off_m , %c0_i32 ] : !tt.tensordesc <tensor <128 x8 xi8 , #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [4 , 3 , 2 , 1 , 0 ]}>>> -> tensor <128 x8 xi8 , #oper_layout >
820822
821- %a_sh = ttg.local_alloc %a_reg : (tensor <128 x64 xf8 E4 M3 FN, #oper_layout >) -> !ttg.memdesc <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> , #smem >
822- %b_sh_raw = ttg.local_alloc %b_reg : (tensor <128 x64 xf8 E4 M3 FN, #oper_layout >) -> !ttg.memdesc <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> , #smem >
823+ %a_sh = ttg.local_alloc %a_reg : (tensor <128 x64 xf8 E4 M3 FN, #oper_layout >) -> !ttg.memdesc <128 x64 xf8 E4 M3 FN, #nvmma_smem , #smem >
824+ %b_sh_raw = ttg.local_alloc %b_reg : (tensor <128 x64 xf8 E4 M3 FN, #oper_layout >) -> !ttg.memdesc <128 x64 xf8 E4 M3 FN, #nvmma_smem , #smem >
823825 // CHECK-NEXT: memdesc_trans {{.*}} ttg.partition = 1 : i32
824- %b_sh = ttg.memdesc_trans %b_sh_raw {order = array<i32 : 1 , 0 >} : !ttg.memdesc <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> , #smem > -> !ttg.memdesc <64 x128 xf8 E4 M3 FN, #ttg.nvmma_shared <{swizzlingByteWidth = 128 , transposed = true , elementBitWidth = 8 }>, #smem >
826+ %b_sh = ttg.memdesc_trans %b_sh_raw {order = array<i32 : 1 , 0 >} : !ttg.memdesc <128 x64 xf8 E4 M3 FN, #nvmma_smem , #smem > -> !ttg.memdesc <64 x128 xf8 E4 M3 FN, #ttg.nvmma_shared <{swizzlingByteWidth = 128 , transposed = true , elementBitWidth = 8 }>, #smem >
825827
826828 // CHECK-NEXT: wait_barrier {{.*}} {ttg.partition = 1 : i32}
827829
@@ -831,7 +833,7 @@ tt.func @matmul_scaled_rhs_scales_tma(
831833
832834 // CHECK-NEXT: [[IS_LAST:%.*]] = arith.cmpi eq, %arg6, [[LAST_ITER]]
833835 // CHECK-NEXT: tc_gen5_mma_scaled {{.*}} {ttg.partition = 1 : i32}
834- %mma_tok = ttng.tc_gen5_mma_scaled %a_sh , %b_sh , %c_tmem [%c_tok ], %a_scales_tmem , %b_scales_tmem , %true , %true lhs = e4m3 rhs = e4m3 : !ttg.memdesc <128 x64 xf8 E4 M3 FN, #ttg.nvmma_shared <{ swizzlingByteWidth = 128 , transposed = false , elementBitWidth = 8 }> , #smem >, !ttg.memdesc <64 x128 xf8 E4 M3 FN, #ttg.nvmma_shared <{swizzlingByteWidth = 128 , transposed = true , elementBitWidth = 8 }>, #smem >, !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable >, !ttg.memdesc <128 x8 xi8 , #ttng.tensor_memory_scales_encoding <>, #ttng.tensor_memory >, !ttg.memdesc <128 x8 xi8 , #ttng.tensor_memory_scales_encoding <>, #ttng.tensor_memory >
836+ %mma_tok = ttng.tc_gen5_mma_scaled %a_sh , %b_sh , %c_tmem [%c_tok ], %a_scales_tmem , %b_scales_tmem , %true , %true lhs = e4m3 rhs = e4m3 : !ttg.memdesc <128 x64 xf8 E4 M3 FN, #nvmma_smem , #smem >, !ttg.memdesc <64 x128 xf8 E4 M3 FN, #ttg.nvmma_shared <{swizzlingByteWidth = 128 , transposed = true , elementBitWidth = 8 }>, #smem >, !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable >, !ttg.memdesc <128 x8 xi8 , #ttng.tensor_memory_scales_encoding <>, #ttng.tensor_memory >, !ttg.memdesc <128 x8 xi8 , #ttng.tensor_memory_scales_encoding <>, #ttng.tensor_memory >
835837
836838 %c , %load_tok = ttng.tmem_load %c_tmem [%mma_tok ] : !ttg.memdesc <128 x128 xf32 , #acc_tmem , #ttng.tensor_memory , mutable > -> tensor <128 x128 xf32 , #acc_layout >
837839 scf.yield %c : tensor <128 x128 xf32 , #acc_layout >
@@ -1125,6 +1127,56 @@ tt.func @specialize_mma_only(%rhs_desc: !tt.tensordesc<tensor<64x128xf16, #share
11251127 tt.return
11261128}
11271129
// Exercises warp-specialization partition assignment for a loop in which a
// TMA-loaded scale tensor is loaded to registers, transposed, and stored to
// TMEM before feeding a scaled MMA; the CHECK lines pin which partition each
// op lands in and the barrier handshakes between the load (partition 2),
// register/TMEM staging and user code (partition 0), and MMA (partition 1).
// CHECK-LABEL: @load_scale_mma_user
tt.func @load_scale_mma_user(
  %lhs: !ttg.memdesc<128x64xf16, #shared, #smem>,
  %rhs: !ttg.memdesc<64x128xf16, #shared, #smem>,
  %scales_desc: !tt.tensordesc<tensor<8x128xi8, #shared>>,
  %b_scales: !ttg.memdesc<128x8xi8, #ttng.tensor_memory_scales_encoding<>, #ttng.tensor_memory>,
  %ub: i32
) {
  %c0_i32 = arith.constant 0 : i32
  %c1_i32 = arith.constant 1 : i32
  %true = arith.constant true
  %zero = arith.constant dense<0.0> : tensor<128x128xf32, #acc_layout>

  // CHECK: scf.for
  %out = scf.for %i = %c0_i32 to %ub step %c1_i32 iter_args(%acc = %zero) -> tensor<128x128xf32, #acc_layout> : i32 {
    // CHECK: wait_barrier [[EMPTY_BAR:%.*]], %{{.*}}partition = 2
    // CHECK: barrier_expect [[SCALES_BAR:%.*]], 1024 {{.*}}partition = 2
    // CHECK: async_tma_copy_global_to_local {{.*}}partition = 2
    %scales_result = tt.descriptor_load %scales_desc[%i, %i] : !tt.tensordesc<tensor<8x128xi8, #shared>> -> tensor<8x128xi8, #oper_layout>
    %scales_shared = ttg.local_alloc %scales_result : (tensor<8x128xi8, #oper_layout>) -> !ttg.memdesc<8x128xi8, #shared, #smem>
    // CHECK: wait_barrier [[SCALES_BAR]]{{.*}}partition = 0
    // CHECK-NEXT: [[SCALES_REG:%.*]] = ttg.local_load {{.*}}partition = 0
    // CHECK-NEXT: arrive_barrier [[EMPTY_BAR]]{{.*}}partition = 0
    %scales_reg = ttg.local_load %scales_shared : !ttg.memdesc<8x128xi8, #shared, #smem> -> tensor<8x128xi8, #oper_layout>
    // CHECK-NEXT: [[SCALES_TRANS:%.*]] = tt.trans [[SCALES_REG]] {{.*}}partition = 0
    %scales_T = tt.trans %scales_reg {order = array<i32: 1, 0>} : tensor<8x128xi8, #oper_layout> -> tensor<128x8xi8, #oper_layout_trans>
    // CHECK-NEXT: wait_barrier [[SCALES_TMEM_BAR:%.*]], %arg{{[0-9]+}} {{.*}}partition = 0
    // CHECK-NEXT: tmem_store [[SCALES_TRANS]], [[SCALES_TMEM:%.*]], %true {{.*}}partition = 0
    %scales_tmem = ttng.tmem_alloc %scales_T : (tensor<128x8xi8, #oper_layout_trans>) -> !ttg.memdesc<128x8xi8, #ttng.tensor_memory_scales_encoding<>, #ttng.tensor_memory>
    // CHECK-NEXT: arrive_barrier [[SCALES_READY_BAR:%.*]], 1 {{.*}}partition = 0

    // CHECK: wait_barrier [[SCALES_READY_BAR]]{{.*}}partition = 1
    %acc_tmem, %acc_tok = ttng.tmem_alloc %acc : (tensor<128x128xf32, #acc_layout>) -> (!ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
    // CHECK-NEXT: tc_gen5_mma_scaled {{.*}} [[SCALES_TMEM]]{{.*}} [[USER_BAR:%.*]][%true], [[SCALES_TMEM_BAR]][%true] {{.*}}partition = 1
    %mma_tok = ttng.tc_gen5_mma_scaled %lhs, %rhs, %acc_tmem[%acc_tok], %scales_tmem, %b_scales, %true, %true lhs = e4m3 rhs = e4m3 : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<128x8xi8, #ttng.tensor_memory_scales_encoding<>, #ttng.tensor_memory>, !ttg.memdesc<128x8xi8, #ttng.tensor_memory_scales_encoding<>, #ttng.tensor_memory>

    // CHECK: wait_barrier [[USER_BAR]]{{.*}}partition = 0
    // CHECK-NEXT: tmem_load
    %c, %load_tok = ttng.tmem_load %acc_tmem[%mma_tok] : !ttg.memdesc<128x128xf32, #acc_tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #acc_layout>
    // CHECK: arrive_barrier [[USER_DONE:%.*]], 1 {{.*}}partition = 0
    // CHECK: wait_barrier [[USER_DONE]]{{.*}}partition = 1

    "user"(%c) : (tensor<128x128xf32, #acc_layout>) -> ()

    scf.yield %c : tensor<128x128xf32, #acc_layout>
  } {tt.warp_specialize, tt.num_stages = 3 : i32}
  "use"(%out) : (tensor<128x128xf32, #acc_layout>) -> ()
  tt.return
}
1179+
11281180}
11291181
11301182// -----
0 commit comments