11// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -tritongpu-hoist-tmem-alloc | FileCheck %s --check-prefix=TMEM --check-prefix=FUNC
22// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -verify-diagnostics --tritongpu-hoist-tmem-alloc -tritongpu-partition-scheduling -tritongpu-load-mma-specialization -sccp -int-range-optimizations -canonicalize -cse -tritongpu-remove-layout-conversions | FileCheck %s
3- // RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -verify-diagnostics --tritongpu-hoist-tmem-alloc -tritongpu-automatic-warp-specialization | FileCheck %s --check-prefix=AWS --check-prefix=FUNC
3+ // RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -verify-diagnostics --tritongpu-hoist-tmem-alloc -tritongpu-assign-latencies -tritongpu-schedule-loops -tritongpu-automatic-warp-specialization | FileCheck %s --check-prefix=AWS --check-prefix=FUNC
44
55#acc_layout = #ttg.blocked <{sizePerThread = [1 , 128 ], threadsPerWarp = [32 , 1 ], warpsPerCTA = [4 , 1 ], order = [0 , 1 ]}>
66#oper_layout = #ttg.blocked <{sizePerThread = [1 , 1 ], threadsPerWarp = [1 , 32 ], warpsPerCTA = [2 , 2 ], order = [1 , 0 ]}>
@@ -768,7 +768,7 @@ tt.func @matmul_scaled_rhs_scales_tma(
768768 %off_n: i32 ,
769769 %a_desc: !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >>,
770770 %b_desc: !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >>,
771- %b_scale_desc: !tt.tensordesc <tensor <128 x8 xi8 , #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [4 , 3 , 2 , 1 , 0 ]}>>>
771+ %b_scale_desc: !tt.tensordesc <tensor <128 x8 xi8 , #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [1 , 0 ]}>>>
772772) {
773773 %true = arith.constant true
774774 %c0_i32 = arith.constant 0 : i32
@@ -791,7 +791,7 @@ tt.func @matmul_scaled_rhs_scales_tma(
791791 // CHECK-COUNT-3: async_tma_copy_global_to_local {{.*}} {ttg.partition = 2 : i32}
792792 %a_reg = tt.descriptor_load %a_desc [%off_m , %off_k ] : !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >> -> tensor <128 x64 xf8 E4 M3 FN, #oper_layout >
793793 %b_reg = tt.descriptor_load %b_desc [%off_n , %off_k ] : !tt.tensordesc <tensor <128 x64 xf8 E4 M3 FN, #nvmma_smem >> -> tensor <128 x64 xf8 E4 M3 FN, #oper_layout >
794- %b_scales_reg = tt.descriptor_load %b_scale_desc [%off_m , %c0_i32 ] : !tt.tensordesc <tensor <128 x8 xi8 , #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [4 , 3 , 2 , 1 , 0 ]}>>> -> tensor <128 x8 xi8 , #scales >
794+ %b_scales_reg = tt.descriptor_load %b_scale_desc [%off_m , %c0_i32 ] : !tt.tensordesc <tensor <128 x8 xi8 , #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [1 , 0 ]}>>> -> tensor <128 x8 xi8 , #scales >
795795
796796 %a_sh = ttg.local_alloc %a_reg : (tensor <128 x64 xf8 E4 M3 FN, #oper_layout >) -> !ttg.memdesc <128 x64 xf8 E4 M3 FN, #nvmma_smem , #smem >
797797 %b_sh_raw = ttg.local_alloc %b_reg : (tensor <128 x64 xf8 E4 M3 FN, #oper_layout >) -> !ttg.memdesc <128 x64 xf8 E4 M3 FN, #nvmma_smem , #smem >
@@ -1023,13 +1023,13 @@ tt.func @specialize_load_only(%desc: !tt.tensordesc<tensor<128x64xf16, #shared>>
10231023 %c1_i32 = arith.constant 1 : i32
10241024 // CHECK: local_alloc : () -> !ttg.memdesc<3x128x64xf16,
10251025 scf.for %i = %c0_i32 to %ub step %c1_i32 : i32 {
1026- // CHECK: wait_barrier {{.*}} {ttg.partition = 0 : i32}
1027- // CHECK-NEXT: local_load {{.*}} {ttg.partition = 0 : i32}
1026+ // CHECK: wait_barrier {{.*}} {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32}
1027+ // CHECK-NEXT: local_load {{.*}} {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32}
10281028 // CHECK-NEXT: fence_async_shared {{.*}}partition = 0
1029- // CHECK-NEXT: arrive_barrier {{.*}} {ttg.partition = 0 : i32}
1030- %val = tt.descriptor_load %desc [%i , %i ] : !tt.tensordesc <tensor <128 x64 xf16 , #shared >> -> tensor <128 x64 xf16 , #oper_layout >
1031- " use" (%val ) : (tensor <128 x64 xf16 , #oper_layout >) -> ()
1032- } {tt.warp_specialize }
1029+ // CHECK-NEXT: arrive_barrier {{.*}} {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32}
1030+ %val = tt.descriptor_load %desc [%i , %i ] { loop.cluster = 1 : i32 , loop.stage = 0 : i32 } : !tt.tensordesc <tensor <128 x64 xf16 , #shared >> -> tensor <128 x64 xf16 , #oper_layout >
1031+ " use" (%val ) { loop.cluster = 0 : i32 , loop.stage = 1 : i32 } : (tensor <128 x64 xf16 , #oper_layout >) -> ()
1032+ } {tt.num_stages = 3 : i32 , tt.scheduled_max_stage = 1 : i32 , tt.warp_specialize }
10331033 tt.return
10341034}
10351035
@@ -1041,9 +1041,9 @@ tt.func @fp4_padded_load(%desc: !tt.tensordesc<tensor<1x256x64xui8, #fp4_padded_
10411041 scf.for %i = %c0_i32 to %ub step %c1_i32 : i32 {
10421042 // CHECK: [[IDX:%.*]] = arith.muli [[I]], %c2_i32 : i32
10431043 // CHECK: async_tma_copy_global_to_local %arg{{[0-9]+}}[[[I]], [[IDX]]]
1044- %val = tt.descriptor_load %desc [%i , %i ] : !tt.tensordesc <tensor <1 x256 x64 xui8 , #fp4_padded_shared >> -> tensor <256 x64 xi8 , #oper_layout >
1045- " use" (%val ) : (tensor <256 x64 xi8 , #oper_layout >) -> ()
1046- } {tt.warp_specialize }
1044+ %val = tt.descriptor_load %desc [%i , %i ] { loop.cluster = 1 : i32 , loop.stage = 0 : i32 , ttg.partition = 2 : i32 } : !tt.tensordesc <tensor <1 x256 x64 xui8 , #fp4_padded_shared >> -> tensor <256 x64 xi8 , #oper_layout >
1045+ " use" (%val ) { loop.cluster = 0 : i32 , loop.stage = 1 : i32 , ttg.partition = 0 : i32 } : (tensor <256 x64 xi8 , #oper_layout >) -> ()
1046+ } {tt.num_stages = 2 : i32 , tt.scheduled_max_stage = 1 : i32 , tt.warp_specialize }
10471047 tt.return
10481048}
10491049
0 commit comments