@@ -312,3 +312,25 @@ module attributes {ttg.target = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warps"
tt.return
}
}
+
+ // -----
+
+ // CHECK-NOT: triton_intel_gpu.dpas
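+ // Both tt.dot ops below should fail the DPAS capability check, so no triton_intel_gpu.dpas op is expected in the output.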
+ #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "triton_intel_gpu.min_sg_size" = 16 : i32, "triton_intel_gpu.support_dpas"} {
+ // CHECK-LABEL: check_dpas_cap
+ tt.func @check_dpas_cap(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
+ %zero_f32 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #blocked>
+ %a = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+ %b = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+
+ %result = tt.dot %a, %b, %zero_f32, inputPrecision = tf32 : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x16xf32, #blocked>
+ %result_ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x16x!tt.ptr<f32>, #blocked>
+ tt.store %result_ptr, %result : tensor<128x16x!tt.ptr<f32>, #blocked>
+
+ %result2 = tt.dot %a, %b, %zero_f32 : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x16xf32, #blocked>
+ tt.store %result_ptr, %result2 : tensor<128x16x!tt.ptr<f32>, #blocked>
+ tt.return
+ }
+ }