@@ -135,3 +135,57 @@ module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.sup
     tt.return
   }
 }
+
+// -----
+
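+// COM: Loads through regular (tensor-of-pointer) operands with a block_io hint are expected to
+// COM: lower to 2D block reads; the CHECK-COUNT values below follow from the per-warp tile sizes.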
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2]}>
+#mma_1 = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1]}>
+#mma_2 = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [4, 2]}>
+module attributes {triton_intel_gpu.support_sg_2d_block, "ttg.num-warps" = 8 : i32} {
+  // CHECK-LABEL: @regular_pointer_block_io
+  tt.func public @regular_pointer_block_io(%arg0: tensor<256x64x!tt.ptr<f16>, #mma>,
+                                           %arg1: tensor<256x64x!tt.ptr<f16>, #mma_1>,
+                                           %arg2: tensor<128x64x!tt.ptr<f16>, #mma_2>,
+                                           %arg3: tensor<256x64x!tt.ptr<f16>, #mma_2>) {
+
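+    // COM: 256x64 elements over 8 warps is 2048 f16 values per warp; each v32f16 block read
+    // COM: presumably returns 32 values per work item (512 per 16-lane sub-group), hence 4 reads.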
+    // CHECK-COUNT-4: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v32f16
+    %0 = tt.load %arg0 {triton_intel_gpu.block_io = "row_major"} : tensor<256x64x!tt.ptr<f16>, #mma>
+
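+    // COM: With warpsPerCTA = [4, 2] and repCluster = [1, 1] each warp still owns 2048 values,
+    // COM: and an 8r16x1c read covers 8x16 = 128 values, which presumably gives 16 reads per warp.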
+    // CHECK-COUNT-16: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPDh
+    %1 = tt.load %arg1 {triton_intel_gpu.block_io = "row_major"} : tensor<256x64x!tt.ptr<f16>, #mma_1>
+
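+    // COM: A 32r16x2c read covers 32x16x2 = 1024 values, so the 2048 values per warp
+    // COM: presumably take 2 reads.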
+    // CHECK-COUNT-2: llvm.call spir_funccc @_Z42intel_sub_group_2d_block_read_16b_32r16x2cPU3AS1viiiDv2_iPDh
+    %2 = tt.load %arg3 {triton_intel_gpu.block_io = "row_major"} : tensor<256x64x!tt.ptr<f16>, #mma_2>
+
+    // COM: The data is duplicated across warps: with repeatCount 8 and repCluster[0] 4 each warp
+    // COM: covers 32 rows, so 8 warps cover 32*8 = 256 rows, which exceeds the 128 rows of the tensor.
+    // CHECK-COUNT-2: llvm.call spir_funccc @_Z42intel_sub_group_2d_block_read_16b_32r16x2cPU3AS1viiiDv2_iPDh
+    %3 = tt.load %arg2 {triton_intel_gpu.block_io = "row_major"} : tensor<128x64x!tt.ptr<f16>, #mma_2>
+    tt.return
+  }
+}
+
+// -----
+
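+// COM: When 2D block IO cannot be used, the load is expected to fall back to per-element
+// COM: (gather) loads, i.e. scalar llvm.load operations.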
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2]}>
+#mma_1 = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 8, 1], repCluster = [1, 2, 2]}>
+#mma_32 = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 32, warpsPerCTA = [8, 1], repCluster = [2, 2]}>
+module attributes {triton_intel_gpu.support_sg_2d_block, "ttg.num-warps" = 8 : i32} {
+  // CHECK-LABEL: @regular_pointer_gather_io
+  tt.func public @regular_pointer_gather_io(%arg0: tensor<128x64x!tt.ptr<f16>, #mma>,
+                                            %arg1: tensor<128x64x!tt.ptr<f16>, #mma_32>,
+                                            %arg2: tensor<2x128x64x!tt.ptr<f16>, #mma_1>) {
+    // COM: The pitch is not available in the current implementation: it can be derived neither from
+    // COM: axis info nor from ptrs[{0, 0}] and ptrs[{1, 0}] within the same work item.
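+    // COM: 128x64 elements over 8 warps is 1024 values per warp; with 32 lanes that is
+    // COM: presumably 32 scalar loads per work item.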
+    // CHECK-COUNT-32: llvm.load {{.*}} {alignment = 2 : i64} : !llvm.ptr<1> -> i16
+    %0 = tt.load %arg1 {triton_intel_gpu.block_io = "row_major"} : tensor<128x64x!tt.ptr<f16>, #mma_32>
+
+    // COM: Column-major block IO is not supported.
+    // CHECK-COUNT-32: llvm.load {{.*}} {alignment = 2 : i64} : !llvm.ptr<1> -> i16
+    %1 = tt.load %arg0 {triton_intel_gpu.block_io = "column_major"} : tensor<128x64x!tt.ptr<f16>, #mma>
+
+    // COM: Tensors of rank > 2 are not supported.
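+    // COM: 2x128x64 elements over 8 warps is 2048 values per warp; with 16 lanes that is
+    // COM: presumably 128 scalar loads per work item.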
+    // CHECK-COUNT-128: llvm.load {{.*}} {alignment = 2 : i64} : !llvm.ptr<1> -> i16
+    %2 = tt.load %arg2 {triton_intel_gpu.block_io = "column_major"} : tensor<2x128x64x!tt.ptr<f16>, #mma_1>
+    tt.return
+  }
+}