Commit 75027b0

[test] Add XeGPU SIMT row/col reduction examples and use Async Region pass. (#1103)
1 parent a5a7ab4 commit 75027b0

17 files changed: +336 −110 lines changed

test/Integration/Dialect/XeGPU/SG/gemm_4kx4kx4k_f16_f16_f16.mlir

Lines changed: 12 additions & 15 deletions
@@ -12,23 +12,20 @@ module @gemm attributes {gpu.container_module} {
     %c64 = arith.constant 64 : index
     %c128 = arith.constant 128 : index
     %c512 = arith.constant 512 : index
-    %t0 = gpu.wait async
-    %A_gpu, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
-    %t2 = gpu.memcpy async [%t1] %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %B_gpu, %t3 = gpu.alloc async [%t2] () : memref<4096x4096xf16>
-    %t4 = gpu.memcpy async [%t3] %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %C_gpu, %t5 = gpu.alloc async [%t4] () : memref<4096x4096xf16>
-    %t6 = gpu.memcpy async [%t5] %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %A_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %B_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %C_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
     // NOTE: Here we can't use [8, 64] wi threads following the SG thread layout of [8, 4], because the runtime linearizes the x dimension first (we need the y dimension to be linearized first).
     // So just use a linearized thread layout of [512, 1] wi threads.
-    %t7 = gpu.launch_func async [%t6] @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
-    gpu.wait [%t7] // Wait for the kernel to finish.
-    %t12 = gpu.wait async
-    %t8 = gpu.memcpy async [%t12] %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %t9 = gpu.dealloc async [%t8] %A_gpu : memref<4096x4096xf16>
-    %t10 = gpu.dealloc async [%t9] %B_gpu : memref<4096x4096xf16>
-    %t11 = gpu.dealloc async [%t10] %C_gpu : memref<4096x4096xf16>
-    gpu.wait [%t11]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
+    gpu.dealloc %A_gpu : memref<4096x4096xf16>
+    gpu.dealloc %B_gpu : memref<4096x4096xf16>
+    gpu.dealloc %C_gpu : memref<4096x4096xf16>
     return %C : memref<4096x4096xf16>
   }
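A note on the pattern above, which repeats in several files below: the hand-written async token chains are deleted and the host code is left synchronous. The tokens are meant to be reintroduced mechanically by the upstream gpu-async-region pass, which this commit adds to the pass pipeline (see xegpu-to-llvm.pp below). As an illustrative sketch of that rewrite, assuming the pass behaves as documented upstream (%buf and %host are hypothetical names), a synchronous sequence such as

    %buf = gpu.alloc () : memref<4096x4096xf16>
    gpu.memcpy %buf, %host : memref<4096x4096xf16>, memref<4096x4096xf16>
    gpu.wait

would become, roughly,

    %t0 = gpu.wait async
    %buf, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
    %t2 = gpu.memcpy async [%t1] %buf, %host : memref<4096x4096xf16>, memref<4096x4096xf16>
    gpu.wait [%t2]

i.e. the same shape as the code this diff removes, but produced by the pass instead of maintained by hand.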

test/Integration/Dialect/XeGPU/SG/simple_gemm.mlir

Lines changed: 12 additions & 15 deletions
@@ -39,21 +39,18 @@ module @gemm attributes {gpu.container_module} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
-    %t = gpu.wait async
-    %memref_a, %t1 = gpu.alloc async [%t] () : memref<256x256xf16>
-    %t2 = gpu.memcpy async [%t1] %memref_a, %a : memref<256x256xf16>, memref<256x256xf16>
-    %memref_b, %t3 = gpu.alloc async [%t2] () : memref<256x256xf16>
-    %t4 = gpu.memcpy async [%t3] %memref_b, %b : memref<256x256xf16>, memref<256x256xf16>
-    %memref_c, %t5 = gpu.alloc async [%t4] () : memref<256x256xf32>
-    %t6 = gpu.memcpy async [%t5] %memref_c, %c : memref<256x256xf32>, memref<256x256xf32>
-    %t7 = gpu.launch_func async [%t6] @kernel::@simple_gemm blocks in (%c32, %c16, %c1) threads in (%c16, %c1, %c1) args(%memref_a : memref<256x256xf16>, %memref_b : memref<256x256xf16>, %memref_c : memref<256x256xf32>)
-    gpu.wait [%t6] // Wait for the kernel to finish.
-    %t8 = gpu.wait async
-    %t9 = gpu.memcpy async [%t8] %c, %memref_c : memref<256x256xf32>, memref<256x256xf32>
-    %t10 = gpu.dealloc async [%t9] %memref_a : memref<256x256xf16>
-    %t11 = gpu.dealloc async [%t10] %memref_b : memref<256x256xf16>
-    %t12 = gpu.dealloc async [%t11] %memref_c : memref<256x256xf32>
-    gpu.wait [%t12]
+    %memref_a = gpu.alloc () : memref<256x256xf16>
+    gpu.memcpy %memref_a, %a : memref<256x256xf16>, memref<256x256xf16>
+    %memref_b = gpu.alloc () : memref<256x256xf16>
+    gpu.memcpy %memref_b, %b : memref<256x256xf16>, memref<256x256xf16>
+    %memref_c = gpu.alloc () : memref<256x256xf32>
+    gpu.memcpy %memref_c, %c : memref<256x256xf32>, memref<256x256xf32>
+    gpu.launch_func @kernel::@simple_gemm blocks in (%c32, %c16, %c1) threads in (%c16, %c1, %c1) args(%memref_a : memref<256x256xf16>, %memref_b : memref<256x256xf16>, %memref_c : memref<256x256xf32>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %c, %memref_c : memref<256x256xf32>, memref<256x256xf32>
+    gpu.dealloc %memref_a : memref<256x256xf16>
+    gpu.dealloc %memref_b : memref<256x256xf16>
+    gpu.dealloc %memref_c : memref<256x256xf32>
     return %c : memref<256x256xf32>
   }

test/Integration/Dialect/XeGPU/SG/xegpu-to-llvm.pp

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
 convert-xevm-to-llvm
 cse
 )
+func.func(gpu-async-region)
 reconcile-unrealized-casts
 convert-vector-to-scf
 convert-scf-to-cf
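The added func.func(gpu-async-region) entry runs the upstream MLIR gpu-async-region pass on each host-side function, converting the synchronous gpu.alloc / gpu.memcpy / gpu.launch_func / gpu.dealloc ops used by the tests above into their async, token-carrying forms before lowering to runtime calls. For experimenting outside this pipeline file, a standalone invocation would presumably look like the following (input.mlir is a placeholder):

    mlir-opt input.mlir --pass-pipeline='builtin.module(func.func(gpu-async-region))'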
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+// RUN: %python_executable %imex_runner --requires=mlir-sycl-runtime,spirv-backend -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
+// RUN: --runner imex-cpu-runner -e main \
+// RUN: --entry-point-result=void \
+// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%mlir_sycl_runtime --filecheck
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    gpu.func @col_reduce(%in: memref<16x16xf16>, %c: memref<1x16xf16>) kernel attributes {intel_reqd_sub_group_size = 16 : i32} {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %c0_i32 = arith.constant 0 : i32
+      %c1_i32 = arith.constant 1 : i32
+      %c2_i32 = arith.constant 2 : i32
+      %c4_i32 = arith.constant 4 : i32
+      %c8_i32 = arith.constant 8 : i32
+      %c16_i32 = arith.constant 16 : i32
+      %c16 = arith.constant 16 : index
+      %cst = arith.constant dense<1.0> : vector<16xf16>
+      %in_tdesc = xegpu.create_nd_tdesc %in[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+      %c_tdesc = xegpu.create_nd_tdesc %c[%c0, %c0] : memref<1x16xf16> -> !xegpu.tensor_desc<1x16xf16>
+      %in_val = xegpu.load_nd %in_tdesc : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+      %reduce = vector.reduction <add>, %in_val : vector<16xf16> into f16
+      %out = arith.constant dense<0.0> : vector<1xf16>
+      %reduce_1x1 = vector.insert %reduce, %out [0] : f16 into vector<1xf16>
+      xegpu.store_nd %reduce_1x1, %c_tdesc : vector<1xf16>, !xegpu.tensor_desc<1x16xf16>
+      gpu.return
+    }
+  }
+
+  func.func @test(%in: memref<16x16xf16>, %c: memref<1x16xf16>) -> memref<1x16xf16> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %memref_in = gpu.alloc () : memref<16x16xf16>
+    gpu.memcpy %memref_in, %in : memref<16x16xf16>, memref<16x16xf16>
+    %memref_out = gpu.alloc () : memref<1x16xf16>
+    gpu.memcpy %memref_out, %c : memref<1x16xf16>, memref<1x16xf16>
+    gpu.launch_func @kernel::@col_reduce blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%memref_in : memref<16x16xf16>, %memref_out : memref<1x16xf16>)
+    gpu.wait
+    gpu.memcpy %c, %memref_out : memref<1x16xf16>, memref<1x16xf16>
+    gpu.dealloc %memref_in : memref<16x16xf16>
+    gpu.dealloc %memref_out : memref<1x16xf16>
+    return %c : memref<1x16xf16>
+  }
+
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %in = memref.alloc() : memref<16x16xf16>
+    %out = memref.alloc() : memref<1x16xf16>
+    %out_host = memref.alloc() : memref<16xf32>
+    // Fill input with random values
+    %in_cast = memref.cast %in : memref<16x16xf16> to memref<*xf16>
+    %lower = arith.constant 0.0 : f32
+    %upper = arith.constant 5.0 : f32
+    %gen_int = arith.constant 1 : i1
+    call @fillResource1DRandomF16(%in_cast, %lower, %upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()
+
+    // CPU version.
+    %c0_f16 = arith.constant 0.0 : f16
+    %cst = arith.constant dense<1.0> : vector<16xf16>
+    scf.for %i = %c0 to %c16 step %c1 {
+      %col = vector.transfer_read %in[%c0, %i], %c0_f16 : memref<16x16xf16>, vector<16x1xf16>
+      %col_16 = vector.shape_cast %col : vector<16x1xf16> to vector<16xf16>
+      %reduce = vector.reduction <add>, %col_16 : vector<16xf16> into f16
+      %reduce_f32 = arith.extf %reduce : f16 to f32
+      memref.store %reduce_f32, %out_host[%i] : memref<16xf32>
+    }
+    %out_host_cast = memref.cast %out_host : memref<16xf32> to memref<*xf32>
+    // GPU version.
+    %gpu_out = call @test(%in, %out) : (memref<16x16xf16>, memref<1x16xf16>) -> memref<1x16xf16>
+    %gpu_out_cast = memref.cast %gpu_out : memref<1x16xf16> to memref<*xf16>
+
+    // call @printMemrefF16(%gpu_out_cast) : (memref<*xf16>) -> ()
+    // call @printMemrefF32(%out_host_cast) : (memref<*xf32>) -> ()
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF16(%gpu_out_cast, %out_host_cast) : (memref<*xf16>, memref<*xf32>) -> ()
+
+    memref.dealloc %in : memref<16x16xf16>
+    memref.dealloc %out : memref<1x16xf16>
+    memref.dealloc %out_host : memref<16xf32>
+    return
+  }
+  func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface}
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
+}
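How the new kernel above performs a column reduction in SIMT mode: with intel_reqd_sub_group_size = 16, the xegpu.load_nd of the 16x16 tile leaves each of the 16 work-items holding a vector<16xf16>; judging from the CPU reference in @main, which reduces vector<16x1xf16> column slices, each lane's vector corresponds to one column. Each lane's vector.reduction <add> then yields one column sum, and the 16 per-lane scalars together form the 1x16 row written by xegpu.store_nd. A hand sanity check of that reading (illustrative only, not part of the test):

    // If every input element were 1.0, each lane would sum sixteen 1.0
    // values, so every lane computes 16.0 and the stored 1x16 result
    // would be [16.0, 16.0, ..., 16.0].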

test/Integration/Dialect/XeGPU/SIMT/gemm_4kx4kx4k_f16_f16_f16.mlir

Lines changed: 12 additions & 15 deletions
@@ -12,23 +12,20 @@ module @gemm attributes {gpu.container_module} {
     %c64 = arith.constant 64 : index
     %c128 = arith.constant 128 : index
     %c512 = arith.constant 512 : index
-    %t0 = gpu.wait async
-    %A_gpu, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
-    %t2 = gpu.memcpy async [%t1] %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %B_gpu, %t3 = gpu.alloc async [%t2] () : memref<4096x4096xf16>
-    %t4 = gpu.memcpy async [%t3] %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %C_gpu, %t5 = gpu.alloc async [%t4] () : memref<4096x4096xf16>
-    %t6 = gpu.memcpy async [%t5] %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %A_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %B_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %C_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
     // NOTE: Here we can't use [8, 64] wi threads following the SG thread layout of [8, 4], because the runtime linearizes the x dimension first (we need the y dimension to be linearized first).
     // So just use a linearized thread layout of [512, 1] wi threads.
-    %t7 = gpu.launch_func async [%t6] @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
-    gpu.wait [%t7] // Wait for the kernel to finish.
-    %t12 = gpu.wait async
-    %t8 = gpu.memcpy async [%t12] %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %t9 = gpu.dealloc async [%t8] %A_gpu : memref<4096x4096xf16>
-    %t10 = gpu.dealloc async [%t9] %B_gpu : memref<4096x4096xf16>
-    %t11 = gpu.dealloc async [%t10] %C_gpu : memref<4096x4096xf16>
-    gpu.wait [%t11]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
+    gpu.dealloc %A_gpu : memref<4096x4096xf16>
+    gpu.dealloc %B_gpu : memref<4096x4096xf16>
+    gpu.dealloc %C_gpu : memref<4096x4096xf16>
     return %C : memref<4096x4096xf16>
   }

test/Integration/Dialect/XeGPU/SIMT/gemm_4kx4kx4k_transpose_b.mlir

Lines changed: 12 additions & 15 deletions
@@ -12,23 +12,20 @@ module @gemm attributes {gpu.container_module} {
     %c64 = arith.constant 64 : index
     %c128 = arith.constant 128 : index
     %c512 = arith.constant 512 : index
-    %t0 = gpu.wait async
-    %A_gpu, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
-    %t2 = gpu.memcpy async [%t1] %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %B_gpu, %t3 = gpu.alloc async [%t2] () : memref<4096x4096xf16>
-    %t4 = gpu.memcpy async [%t3] %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %C_gpu, %t5 = gpu.alloc async [%t4] () : memref<4096x4096xf16>
-    %t6 = gpu.memcpy async [%t5] %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %A_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %B_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %C_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
     // NOTE: Here we can't use [8, 64] wi threads following the SG thread layout of [8, 4], because the runtime linearizes the x dimension first (we need the y dimension to be linearized first).
     // So just use a linearized thread layout of [512, 1] wi threads.
-    %t7 = gpu.launch_func async [%t6] @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
-    gpu.wait [%t7] // Wait for the kernel to finish.
-    %t12 = gpu.wait async
-    %t8 = gpu.memcpy async [%t12] %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %t9 = gpu.dealloc async [%t8] %A_gpu : memref<4096x4096xf16>
-    %t10 = gpu.dealloc async [%t9] %B_gpu : memref<4096x4096xf16>
-    %t11 = gpu.dealloc async [%t10] %C_gpu : memref<4096x4096xf16>
-    gpu.wait [%t11]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
+    gpu.dealloc %A_gpu : memref<4096x4096xf16>
+    gpu.dealloc %B_gpu : memref<4096x4096xf16>
+    gpu.dealloc %C_gpu : memref<4096x4096xf16>
     return %C : memref<4096x4096xf16>
   }

test/Integration/Dialect/XeGPU/SIMT/loadstore_nd.mlir

Lines changed: 9 additions & 4 deletions
@@ -35,14 +35,19 @@ module @gemm attributes {gpu.container_module} {
   func.func @test(%src : memref<8x16xi32>) -> memref<8x16xi32> attributes {llvm.emit_c_interface} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
-    %memref_src = gpu.alloc host_shared () : memref<8x16xi32>
-    memref.copy %src, %memref_src : memref<8x16xi32> to memref<8x16xi32>
-    %memref_dst = gpu.alloc host_shared () : memref<8x16xi32>
+    %memref_src = gpu.alloc () : memref<8x16xi32>
+    gpu.memcpy %memref_src, %src : memref<8x16xi32>, memref<8x16xi32>
+    %memref_dst = gpu.alloc () : memref<8x16xi32>
     %srcc = memref.memory_space_cast %memref_src : memref<8x16xi32> to memref<8x16xi32, 1>
     %dstt = memref.memory_space_cast %memref_dst : memref<8x16xi32> to memref<8x16xi32, 1>

     gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<8x16xi32, 1>, %dstt : memref<8x16xi32, 1>)
-    return %memref_dst : memref<8x16xi32>
+    gpu.wait
+    %out = memref.alloc () : memref<8x16xi32>
+    gpu.memcpy %out, %memref_dst : memref<8x16xi32>, memref<8x16xi32>
+    gpu.dealloc %memref_src : memref<8x16xi32>
+    gpu.dealloc %memref_dst : memref<8x16xi32>
+    return %out : memref<8x16xi32>
   }

   func.func @main() attributes {llvm.emit_c_interface} {
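The change in this file (and in the two loadstore tests below) also retires host_shared allocations. A host_shared buffer is directly dereferenceable from the host, so the old code could return it as the function result; a device-local gpu.alloc buffer is not, so the result is now staged through a host-side memref.alloc and copied back with gpu.memcpy once gpu.wait guarantees the kernel has finished. Schematically (a condensed restatement of the diff above, not new code):

    // Before: host-visible allocation, returned directly.
    %m = gpu.alloc host_shared () : memref<8x16xi32>
    memref.copy %src, %m : memref<8x16xi32> to memref<8x16xi32>

    // After: device-local allocation; host access needs explicit copies.
    %m = gpu.alloc () : memref<8x16xi32>
    gpu.memcpy %m, %src : memref<8x16xi32>, memref<8x16xi32>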

test/Integration/Dialect/XeGPU/SIMT/loadstore_nd_dpas.mlir

Lines changed: 12 additions & 7 deletions
@@ -33,19 +33,24 @@ module @gemm attributes {gpu.container_module} {
   func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
-    %memref_a = gpu.alloc host_shared () : memref<8x16xf16>
-    memref.copy %a, %memref_a : memref<8x16xf16> to memref<8x16xf16>
-    %memref_b = gpu.alloc host_shared () : memref<16x16xf16>
-    memref.copy %b, %memref_b : memref<16x16xf16> to memref<16x16xf16>
-    %memref_c = gpu.alloc host_shared () : memref<8x16xf32>
-    memref.copy %c, %memref_c : memref<8x16xf32> to memref<8x16xf32>
+    %memref_a = gpu.alloc () : memref<8x16xf16>
+    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16>
+    %memref_b = gpu.alloc () : memref<16x16xf16>
+    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16>
+    %memref_c = gpu.alloc () : memref<8x16xf32>
+    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32>

     %a_gpu = memref.memory_space_cast %memref_a : memref<8x16xf16> to memref<8x16xf16, 1>
     %b_gpu = memref.memory_space_cast %memref_b : memref<16x16xf16> to memref<16x16xf16, 1>
     %c_gpu = memref.memory_space_cast %memref_c : memref<8x16xf32> to memref<8x16xf32, 1>

     gpu.launch_func @kernel::@load_store_2d_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_gpu : memref<8x16xf16, 1>, %b_gpu : memref<16x16xf16, 1>, %c_gpu : memref<8x16xf32, 1>)
-    return %memref_c : memref<8x16xf32>
+    gpu.wait
+    gpu.memcpy %c, %memref_c : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_a : memref<8x16xf16>
+    gpu.dealloc %memref_b : memref<16x16xf16>
+    gpu.dealloc %memref_c : memref<8x16xf32>
+    return %c : memref<8x16xf32>
   }

   func.func @main() attributes {llvm.emit_c_interface} {

test/Integration/Dialect/XeGPU/SIMT/loadstore_nd_prefetch.mlir

Lines changed: 9 additions & 4 deletions
@@ -30,14 +30,19 @@ module @gemm attributes {gpu.container_module} {
   func.func @test(%src : memref<8x16xf16>) -> memref<8x16xf16> attributes {llvm.emit_c_interface} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
-    %memref_src = gpu.alloc host_shared () : memref<8x16xf16>
-    memref.copy %src, %memref_src : memref<8x16xf16> to memref<8x16xf16>
-    %memref_dst = gpu.alloc host_shared () : memref<8x16xf16>
+    %memref_src = gpu.alloc () : memref<8x16xf16>
+    gpu.memcpy %memref_src, %src : memref<8x16xf16>, memref<8x16xf16>
+    %memref_dst = gpu.alloc () : memref<8x16xf16>
     %srcc = memref.memory_space_cast %memref_src : memref<8x16xf16> to memref<8x16xf16, 1>
     %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf16> to memref<8x16xf16, 1>

     gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<8x16xf16, 1>, %dstt : memref<8x16xf16, 1>)
-    return %memref_dst : memref<8x16xf16>
+    gpu.wait
+    %out = memref.alloc () : memref<8x16xf16>
+    gpu.memcpy %out, %memref_dst : memref<8x16xf16>, memref<8x16xf16>
+    gpu.dealloc %memref_src : memref<8x16xf16>
+    gpu.dealloc %memref_dst : memref<8x16xf16>
+    return %out : memref<8x16xf16>
   }

   func.func @main() attributes {llvm.emit_c_interface} {
