Commit 75027b0

[test] Add XeGPU SIMT row/col reduction examples and use Async Region pass. (#1103)
1 parent a5a7ab4 commit 75027b0

17 files changed: +336 −110 lines changed

test/Integration/Dialect/XeGPU/SG/gemm_4kx4kx4k_f16_f16_f16.mlir

Lines changed: 12 additions & 15 deletions
@@ -12,23 +12,20 @@ module @gemm attributes {gpu.container_module} {
     %c64 = arith.constant 64 : index
     %c128 = arith.constant 128 : index
     %c512 = arith.constant 512 : index
-    %t0 = gpu.wait async
-    %A_gpu, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
-    %t2 = gpu.memcpy async [%t1] %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %B_gpu, %t3 = gpu.alloc async [%t2] () : memref<4096x4096xf16>
-    %t4 = gpu.memcpy async [%t3] %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %C_gpu, %t5 = gpu.alloc async [%t4] () : memref<4096x4096xf16>
-    %t6 = gpu.memcpy async [%t5] %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %A_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %B_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %C_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
     // NOTE: Here we can't use [8, 64] wi threads following the SG thread layout of [8, 4], because the runtime linearizes the x dimension first (we need the y dimension to be linearized first).
     // So just use a linearized thread layout of [512, 1] wi threads.
-    %t7 = gpu.launch_func async [%t6] @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
-    gpu.wait [%t7] // Wait for the kernel to finish.
-    %t12 = gpu.wait async
-    %t8 = gpu.memcpy async [%t12] %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %t9 = gpu.dealloc async [%t8] %A_gpu : memref<4096x4096xf16>
-    %t10 = gpu.dealloc async [%t9] %B_gpu : memref<4096x4096xf16>
-    %t11 = gpu.dealloc async [%t10] %C_gpu : memref<4096x4096xf16>
-    gpu.wait [%t11]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
+    gpu.dealloc %A_gpu : memref<4096x4096xf16>
+    gpu.dealloc %B_gpu : memref<4096x4096xf16>
+    gpu.dealloc %C_gpu : memref<4096x4096xf16>
     return %C : memref<4096x4096xf16>
   }
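A note on the pattern above, which repeats in several files below: the hand-written async token chains are deleted and the host code is left synchronous. The tokens are meant to be reintroduced mechanically by the upstream gpu-async-region pass, which this commit adds to the pass pipeline (see xegpu-to-llvm.pp below). As an illustrative sketch of that rewrite, assuming the pass behaves as documented upstream (%buf and %host are hypothetical names), a synchronous sequence such as

    %buf = gpu.alloc () : memref<4096x4096xf16>
    gpu.memcpy %buf, %host : memref<4096x4096xf16>, memref<4096x4096xf16>
    gpu.wait

would become, roughly,

    %t0 = gpu.wait async
    %buf, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
    %t2 = gpu.memcpy async [%t1] %buf, %host : memref<4096x4096xf16>, memref<4096x4096xf16>
    gpu.wait [%t2]

i.e. the same shape as the code this diff removes, but produced by the pass instead of maintained by hand.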

test/Integration/Dialect/XeGPU/SG/simple_gemm.mlir

Lines changed: 12 additions & 15 deletions
@@ -39,21 +39,18 @@ module @gemm attributes {gpu.container_module} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
-    %t = gpu.wait async
-    %memref_a, %t1 = gpu.alloc async [%t] () : memref<256x256xf16>
-    %t2 = gpu.memcpy async [%t1] %memref_a, %a : memref<256x256xf16>, memref<256x256xf16>
-    %memref_b, %t3 = gpu.alloc async [%t2] () : memref<256x256xf16>
-    %t4 = gpu.memcpy async [%t3] %memref_b, %b : memref<256x256xf16>, memref<256x256xf16>
-    %memref_c, %t5 = gpu.alloc async [%t4] () : memref<256x256xf32>
-    %t6 = gpu.memcpy async [%t5] %memref_c, %c : memref<256x256xf32>, memref<256x256xf32>
-    %t7 = gpu.launch_func async [%t6] @kernel::@simple_gemm blocks in (%c32, %c16, %c1) threads in (%c16, %c1, %c1) args(%memref_a : memref<256x256xf16>, %memref_b : memref<256x256xf16>, %memref_c : memref<256x256xf32>)
-    gpu.wait [%t6] // Wait for the kernel to finish.
-    %t8 = gpu.wait async
-    %t9 = gpu.memcpy async [%t8] %c, %memref_c : memref<256x256xf32>, memref<256x256xf32>
-    %t10 = gpu.dealloc async [%t9] %memref_a : memref<256x256xf16>
-    %t11 = gpu.dealloc async [%t10] %memref_b : memref<256x256xf16>
-    %t12 = gpu.dealloc async [%t11] %memref_c : memref<256x256xf32>
-    gpu.wait [%t12]
+    %memref_a = gpu.alloc () : memref<256x256xf16>
+    gpu.memcpy %memref_a, %a : memref<256x256xf16>, memref<256x256xf16>
+    %memref_b = gpu.alloc () : memref<256x256xf16>
+    gpu.memcpy %memref_b, %b : memref<256x256xf16>, memref<256x256xf16>
+    %memref_c = gpu.alloc () : memref<256x256xf32>
+    gpu.memcpy %memref_c, %c : memref<256x256xf32>, memref<256x256xf32>
+    gpu.launch_func @kernel::@simple_gemm blocks in (%c32, %c16, %c1) threads in (%c16, %c1, %c1) args(%memref_a : memref<256x256xf16>, %memref_b : memref<256x256xf16>, %memref_c : memref<256x256xf32>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %c, %memref_c : memref<256x256xf32>, memref<256x256xf32>
+    gpu.dealloc %memref_a : memref<256x256xf16>
+    gpu.dealloc %memref_b : memref<256x256xf16>
+    gpu.dealloc %memref_c : memref<256x256xf32>
     return %c : memref<256x256xf32>
   }

test/Integration/Dialect/XeGPU/SG/xegpu-to-llvm.pp

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
 convert-xevm-to-llvm
 cse
 )
+func.func(gpu-async-region)
 reconcile-unrealized-casts
 convert-vector-to-scf
 convert-scf-to-cf
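The added func.func(gpu-async-region) entry runs the upstream MLIR gpu-async-region pass on each host-side function, converting the synchronous gpu.alloc / gpu.memcpy / gpu.launch_func / gpu.dealloc ops used by the tests above into their async, token-carrying forms before lowering to runtime calls. For experimenting outside this pipeline file, a standalone invocation would presumably look like the following (input.mlir is a placeholder):

    mlir-opt input.mlir --pass-pipeline='builtin.module(func.func(gpu-async-region))'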
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+// RUN: %python_executable %imex_runner --requires=mlir-sycl-runtime,spirv-backend -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
+// RUN: --runner imex-cpu-runner -e main \
+// RUN: --entry-point-result=void \
+// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%mlir_sycl_runtime --filecheck
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    gpu.func @col_reduce(%in: memref<16x16xf16>, %c: memref<1x16xf16>) kernel attributes {intel_reqd_sub_group_size = 16 : i32} {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %c0_i32 = arith.constant 0 : i32
+      %c1_i32 = arith.constant 1 : i32
+      %c2_i32 = arith.constant 2 : i32
+      %c4_i32 = arith.constant 4 : i32
+      %c8_i32 = arith.constant 8 : i32
+      %c16_i32 = arith.constant 16 : i32
+      %c16 = arith.constant 16 : index
+      %cst = arith.constant dense<1.0> : vector<16xf16>
+      %in_tdesc = xegpu.create_nd_tdesc %in[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+      %c_tdesc = xegpu.create_nd_tdesc %c[%c0, %c0] : memref<1x16xf16> -> !xegpu.tensor_desc<1x16xf16>
+      %in_val = xegpu.load_nd %in_tdesc : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+      %reduce = vector.reduction <add>, %in_val : vector<16xf16> into f16
+      %out = arith.constant dense<0.0> : vector<1xf16>
+      %reduce_1x1 = vector.insert %reduce, %out [0] : f16 into vector<1xf16>
+      xegpu.store_nd %reduce_1x1, %c_tdesc : vector<1xf16>, !xegpu.tensor_desc<1x16xf16>
+      gpu.return
+    }
+  }
+
+  func.func @test(%in: memref<16x16xf16>, %c: memref<1x16xf16>) -> memref<1x16xf16> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %memref_in = gpu.alloc () : memref<16x16xf16>
+    gpu.memcpy %memref_in, %in : memref<16x16xf16>, memref<16x16xf16>
+    %memref_out = gpu.alloc () : memref<1x16xf16>
+    gpu.memcpy %memref_out, %c : memref<1x16xf16>, memref<1x16xf16>
+    gpu.launch_func @kernel::@col_reduce blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%memref_in : memref<16x16xf16>, %memref_out : memref<1x16xf16>)
+    gpu.wait
+    gpu.memcpy %c, %memref_out : memref<1x16xf16>, memref<1x16xf16>
+    gpu.dealloc %memref_in : memref<16x16xf16>
+    gpu.dealloc %memref_out : memref<1x16xf16>
+    return %c : memref<1x16xf16>
+  }
+
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %in = memref.alloc() : memref<16x16xf16>
+    %out = memref.alloc() : memref<1x16xf16>
+    %out_host = memref.alloc() : memref<16xf32>
+    // Fill input with random values
+    %in_cast = memref.cast %in : memref<16x16xf16> to memref<*xf16>
+    %lower = arith.constant 0.0 : f32
+    %upper = arith.constant 5.0 : f32
+    %gen_int = arith.constant 1 : i1
+    call @fillResource1DRandomF16(%in_cast, %lower, %upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()
+
+    // CPU version.
+    %c0_f16 = arith.constant 0.0 : f16
+    %cst = arith.constant dense<1.0> : vector<16xf16>
+    scf.for %i = %c0 to %c16 step %c1 {
+      %col = vector.transfer_read %in[%c0, %i], %c0_f16 : memref<16x16xf16>, vector<16x1xf16>
+      %col_16 = vector.shape_cast %col : vector<16x1xf16> to vector<16xf16>
+      %reduce = vector.reduction <add>, %col_16 : vector<16xf16> into f16
+      %reduce_f32 = arith.extf %reduce : f16 to f32
+      memref.store %reduce_f32, %out_host[%i] : memref<16xf32>
+    }
+    %out_host_cast = memref.cast %out_host : memref<16xf32> to memref<*xf32>
+    // GPU version.
+    %gpu_out = call @test(%in, %out) : (memref<16x16xf16>, memref<1x16xf16>) -> memref<1x16xf16>
+    %gpu_out_cast = memref.cast %gpu_out : memref<1x16xf16> to memref<*xf16>
+
+    // call @printMemrefF16(%gpu_out_cast) : (memref<*xf16>) -> ()
+    // call @printMemrefF32(%out_host_cast) : (memref<*xf32>) -> ()
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF16(%gpu_out_cast, %out_host_cast) : (memref<*xf16>, memref<*xf32>) -> ()
+
+    memref.dealloc %in : memref<16x16xf16>
+    memref.dealloc %out : memref<1x16xf16>
+    memref.dealloc %out_host : memref<16xf32>
+    return
+  }
+  func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface}
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
+}
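How the new kernel above performs a column reduction in SIMT mode: with intel_reqd_sub_group_size = 16, the xegpu.load_nd of the 16x16 tile leaves each of the 16 work-items holding a vector<16xf16>; judging from the CPU reference in @main, which reduces vector<16x1xf16> column slices, each lane's vector corresponds to one column. Each lane's vector.reduction <add> then yields one column sum, and the 16 per-lane scalars together form the 1x16 row written by xegpu.store_nd. A hand sanity check of that reading (illustrative only, not part of the test):

    // If every input element were 1.0, each lane would sum sixteen 1.0
    // values, so every lane computes 16.0 and the stored 1x16 result
    // would be [16.0, 16.0, ..., 16.0].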

test/Integration/Dialect/XeGPU/SIMT/gemm_4kx4kx4k_f16_f16_f16.mlir

Lines changed: 12 additions & 15 deletions
@@ -12,23 +12,20 @@ module @gemm attributes {gpu.container_module} {
     %c64 = arith.constant 64 : index
     %c128 = arith.constant 128 : index
     %c512 = arith.constant 512 : index
-    %t0 = gpu.wait async
-    %A_gpu, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
-    %t2 = gpu.memcpy async [%t1] %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %B_gpu, %t3 = gpu.alloc async [%t2] () : memref<4096x4096xf16>
-    %t4 = gpu.memcpy async [%t3] %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %C_gpu, %t5 = gpu.alloc async [%t4] () : memref<4096x4096xf16>
-    %t6 = gpu.memcpy async [%t5] %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %A_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %B_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %C_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
     // NOTE: Here we can't use [8, 64] wi threads following the SG thread layout of [8, 4], because the runtime linearizes the x dimension first (we need the y dimension to be linearized first).
     // So just use a linearized thread layout of [512, 1] wi threads.
-    %t7 = gpu.launch_func async [%t6] @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
-    gpu.wait [%t7] // Wait for the kernel to finish.
-    %t12 = gpu.wait async
-    %t8 = gpu.memcpy async [%t12] %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %t9 = gpu.dealloc async [%t8] %A_gpu : memref<4096x4096xf16>
-    %t10 = gpu.dealloc async [%t9] %B_gpu : memref<4096x4096xf16>
-    %t11 = gpu.dealloc async [%t10] %C_gpu : memref<4096x4096xf16>
-    gpu.wait [%t11]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
+    gpu.dealloc %A_gpu : memref<4096x4096xf16>
+    gpu.dealloc %B_gpu : memref<4096x4096xf16>
+    gpu.dealloc %C_gpu : memref<4096x4096xf16>
     return %C : memref<4096x4096xf16>
   }

test/Integration/Dialect/XeGPU/SIMT/gemm_4kx4kx4k_transpose_b.mlir

Lines changed: 12 additions & 15 deletions
@@ -12,23 +12,20 @@ module @gemm attributes {gpu.container_module} {
     %c64 = arith.constant 64 : index
     %c128 = arith.constant 128 : index
     %c512 = arith.constant 512 : index
-    %t0 = gpu.wait async
-    %A_gpu, %t1 = gpu.alloc async [%t0] () : memref<4096x4096xf16>
-    %t2 = gpu.memcpy async [%t1] %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %B_gpu, %t3 = gpu.alloc async [%t2] () : memref<4096x4096xf16>
-    %t4 = gpu.memcpy async [%t3] %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %C_gpu, %t5 = gpu.alloc async [%t4] () : memref<4096x4096xf16>
-    %t6 = gpu.memcpy async [%t5] %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %A_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %A_gpu, %A : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %B_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %B_gpu, %B : memref<4096x4096xf16>, memref<4096x4096xf16>
+    %C_gpu = gpu.alloc () : memref<4096x4096xf16>
+    gpu.memcpy %C_gpu, %C : memref<4096x4096xf16>, memref<4096x4096xf16>
     // NOTE: Here we can't use [8, 64] wi threads following the SG thread layout of [8, 4], because the runtime linearizes the x dimension first (we need the y dimension to be linearized first).
     // So just use a linearized thread layout of [512, 1] wi threads.
-    %t7 = gpu.launch_func async [%t6] @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
-    gpu.wait [%t7] // Wait for the kernel to finish.
-    %t12 = gpu.wait async
-    %t8 = gpu.memcpy async [%t12] %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
-    %t9 = gpu.dealloc async [%t8] %A_gpu : memref<4096x4096xf16>
-    %t10 = gpu.dealloc async [%t9] %B_gpu : memref<4096x4096xf16>
-    %t11 = gpu.dealloc async [%t10] %C_gpu : memref<4096x4096xf16>
-    gpu.wait [%t11]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c16, %c16, %c1) threads in (%c512, %c1, %c1) args(%A_gpu : memref<4096x4096xf16>, %B_gpu : memref<4096x4096xf16>, %C_gpu : memref<4096x4096xf16>)
+    gpu.wait // Wait for the kernel to finish.
+    gpu.memcpy %C, %C_gpu : memref<4096x4096xf16>, memref<4096x4096xf16>
+    gpu.dealloc %A_gpu : memref<4096x4096xf16>
+    gpu.dealloc %B_gpu : memref<4096x4096xf16>
+    gpu.dealloc %C_gpu : memref<4096x4096xf16>
     return %C : memref<4096x4096xf16>
   }

test/Integration/Dialect/XeGPU/SIMT/loadstore_nd.mlir

Lines changed: 9 additions & 4 deletions
@@ -35,14 +35,19 @@ module @gemm attributes {gpu.container_module} {
   func.func @test(%src : memref<8x16xi32>) -> memref<8x16xi32> attributes {llvm.emit_c_interface} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
-    %memref_src = gpu.alloc host_shared () : memref<8x16xi32>
-    memref.copy %src, %memref_src : memref<8x16xi32> to memref<8x16xi32>
-    %memref_dst = gpu.alloc host_shared () : memref<8x16xi32>
+    %memref_src = gpu.alloc () : memref<8x16xi32>
+    gpu.memcpy %memref_src, %src : memref<8x16xi32>, memref<8x16xi32>
+    %memref_dst = gpu.alloc () : memref<8x16xi32>
     %srcc = memref.memory_space_cast %memref_src : memref<8x16xi32> to memref<8x16xi32, 1>
     %dstt = memref.memory_space_cast %memref_dst : memref<8x16xi32> to memref<8x16xi32, 1>

     gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<8x16xi32, 1>, %dstt : memref<8x16xi32, 1>)
-    return %memref_dst : memref<8x16xi32>
+    gpu.wait
+    %out = memref.alloc () : memref<8x16xi32>
+    gpu.memcpy %out, %memref_dst : memref<8x16xi32>, memref<8x16xi32>
+    gpu.dealloc %memref_src : memref<8x16xi32>
+    gpu.dealloc %memref_dst : memref<8x16xi32>
+    return %out : memref<8x16xi32>
   }

   func.func @main() attributes {llvm.emit_c_interface} {
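The change in this file (and in the two loadstore tests below) also retires host_shared allocations. A host_shared buffer is directly dereferenceable from the host, so the old code could return it as the function result; a device-local gpu.alloc buffer is not, so the result is now staged through a host-side memref.alloc and copied back with gpu.memcpy once gpu.wait guarantees the kernel has finished. Schematically (a condensed restatement of the diff above, not new code):

    // Before: host-visible allocation, returned directly.
    %m = gpu.alloc host_shared () : memref<8x16xi32>
    memref.copy %src, %m : memref<8x16xi32> to memref<8x16xi32>

    // After: device-local allocation; host access needs explicit copies.
    %m = gpu.alloc () : memref<8x16xi32>
    gpu.memcpy %m, %src : memref<8x16xi32>, memref<8x16xi32>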

test/Integration/Dialect/XeGPU/SIMT/loadstore_nd_dpas.mlir

Lines changed: 12 additions & 7 deletions
@@ -33,19 +33,24 @@ module @gemm attributes {gpu.container_module} {
   func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
-    %memref_a = gpu.alloc host_shared () : memref<8x16xf16>
-    memref.copy %a, %memref_a : memref<8x16xf16> to memref<8x16xf16>
-    %memref_b = gpu.alloc host_shared () : memref<16x16xf16>
-    memref.copy %b, %memref_b : memref<16x16xf16> to memref<16x16xf16>
-    %memref_c = gpu.alloc host_shared () : memref<8x16xf32>
-    memref.copy %c, %memref_c : memref<8x16xf32> to memref<8x16xf32>
+    %memref_a = gpu.alloc () : memref<8x16xf16>
+    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16>
+    %memref_b = gpu.alloc () : memref<16x16xf16>
+    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16>
+    %memref_c = gpu.alloc () : memref<8x16xf32>
+    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32>

     %a_gpu = memref.memory_space_cast %memref_a : memref<8x16xf16> to memref<8x16xf16, 1>
     %b_gpu = memref.memory_space_cast %memref_b : memref<16x16xf16> to memref<16x16xf16, 1>
     %c_gpu = memref.memory_space_cast %memref_c : memref<8x16xf32> to memref<8x16xf32, 1>

     gpu.launch_func @kernel::@load_store_2d_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_gpu : memref<8x16xf16, 1>, %b_gpu : memref<16x16xf16, 1>, %c_gpu : memref<8x16xf32, 1>)
-    return %memref_c : memref<8x16xf32>
+    gpu.wait
+    gpu.memcpy %c, %memref_c : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_a : memref<8x16xf16>
+    gpu.dealloc %memref_b : memref<16x16xf16>
+    gpu.dealloc %memref_c : memref<8x16xf32>
+    return %c : memref<8x16xf32>
   }

   func.func @main() attributes {llvm.emit_c_interface} {

test/Integration/Dialect/XeGPU/SIMT/loadstore_nd_prefetch.mlir

Lines changed: 9 additions & 4 deletions
@@ -30,14 +30,19 @@ module @gemm attributes {gpu.container_module} {
   func.func @test(%src : memref<8x16xf16>) -> memref<8x16xf16> attributes {llvm.emit_c_interface} {
     %c1 = arith.constant 1 : index
     %c16 = arith.constant 16 : index
-    %memref_src = gpu.alloc host_shared () : memref<8x16xf16>
-    memref.copy %src, %memref_src : memref<8x16xf16> to memref<8x16xf16>
-    %memref_dst = gpu.alloc host_shared () : memref<8x16xf16>
+    %memref_src = gpu.alloc () : memref<8x16xf16>
+    gpu.memcpy %memref_src, %src : memref<8x16xf16>, memref<8x16xf16>
+    %memref_dst = gpu.alloc () : memref<8x16xf16>
     %srcc = memref.memory_space_cast %memref_src : memref<8x16xf16> to memref<8x16xf16, 1>
     %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf16> to memref<8x16xf16, 1>

     gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<8x16xf16, 1>, %dstt : memref<8x16xf16, 1>)
-    return %memref_dst : memref<8x16xf16>
+    gpu.wait
+    %out = memref.alloc () : memref<8x16xf16>
+    gpu.memcpy %out, %memref_dst : memref<8x16xf16>, memref<8x16xf16>
+    gpu.dealloc %memref_src : memref<8x16xf16>
+    gpu.dealloc %memref_dst : memref<8x16xf16>
+    return %out : memref<8x16xf16>
   }

   func.func @main() attributes {llvm.emit_c_interface} {
