@@ -63,3 +63,91 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
     tt.return %0 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
   }
 }
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 1], order = [0, 1]}>
+// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [2], order = [0]}>
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 1], order = [0, 1]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: tt.func @test_blocked_multi_warp(
+  // CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
+  // CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> {
+  tt.func @test_blocked_multi_warp(%arg0: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
+    // CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<32xf32, #[[$ATTR_1]]>
+    // CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<32xf32, #[[$ATTR_1]]>
+    // CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<32xf32, #[[$ATTR_1]]>
+    %0 = arith.addf %arg0, %arg1 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    // CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<32xf32, #[[$ATTR_1]]> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
+    // CHECK: tt.return %[[VAL_5]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
+    tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+  }
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
+// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}>
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: tt.func @test_blocked_multi_warp_double_stride(
+  // CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
+  // CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> {
+  tt.func @test_blocked_multi_warp_double_stride(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
+    // CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<128xf16, #[[$ATTR_1]]>
+    // CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<128xf16, #[[$ATTR_1]]>
+    // CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_1]]>
+    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    // CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_1]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
+    // CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
+    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+  }
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [8], order = [0]}>
+// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
+
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: tt.func @test_mma_multi_warp_double_stride(
+  // CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
+  // CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> {
+  tt.func @test_mma_multi_warp_double_stride(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
+    // CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
+    // CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
+    // CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_0]]>
+    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+    // CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_0]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
+    // CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
+    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+  }
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [2], order = [0]}>
+// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
+
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: tt.func @test_mma_multi_warp_double_stride_repeat(
+  // CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
+  // CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> {
+  tt.func @test_mma_multi_warp_double_stride_repeat(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
+    // CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
+    // CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
+    // CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_0]]>
+    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+    // CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_0]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
+    // CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
+    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+  }
+}