@@ -65,7 +65,7 @@ tt.func @async_tma_gather(%desc: !tt.tensordesc<tensor<1x128xbf16, #shared>>, %x
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-warps" = 4 : i32} {
 tt.func @async_tma_gather(%desc: !tt.tensordesc<tensor<1x128xbf16, #shared>>, %x_offsets: tensor<32xi32, #blocked>, %y_offset: i32,
                           %bar: !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>,
                           %result: !ttg.memdesc<32x128xbf16, #shared, #ttg.shared_memory>,
@@ -75,3 +75,17 @@ tt.func @async_tma_gather(%desc: !tt.tensordesc<tensor<1x128xbf16, #shared>>, %x
   tt.return
 }
 }
+
+// -----
+
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 256, 32]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = true, elementBitWidth = 8}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+
+module attributes {"ttg.num-warps" = 4 : i32} {
+tt.func @wgmma(%a: tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>, %b: !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>, %c: tensor<128x128xf16, #mma>) {
+  // expected-error @below {{in-register LHS operand must have a kWidth of 2 but got 1}}
+  %0 = ttng.warp_group_dot %a, %b, %c : tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> -> tensor<128x128xf16, #mma>
+  tt.return
+}
+}