// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
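
// Runs an 8x16 x 16x16 f16 GEMM on the GPU with xegpu.dpas, takes the second
// 8x8 block of the f32 accumulator, moves it into the left half of the 8x16
// output and fills the right half with 1.0, then checks the result against a
// scalar CPU reference via printAllcloseF32.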
module @gemm attributes {gpu.container_module} {
  func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %memref = gpu.alloc host_shared () : memref<8x16xf16>
    %memref_1 = gpu.alloc host_shared () : memref<16x16xf16>
    memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16>
    memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16>
    %memref_2 = gpu.alloc host_shared () : memref<8x16xf32>
    gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>)
    gpu.dealloc %memref : memref<8x16xf16>
    gpu.dealloc %memref_1 : memref<16x16xf16>
    return %memref_2 : memref<8x16xf32>
  }

  gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %c0 = arith.constant 0 : index
      %c16 = arith.constant 16 : index
      // load A tile
      %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] { mode = vc } : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
      %val0 = xegpu.load_nd %a_tile0 { mode = vc, vnni_axis = 1 } : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
      // load B tile
      %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] { mode = vc } : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
      %val2 = xegpu.load_nd %b_tile0 { mode = vc, vnni_axis = 0 } : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
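      // Both operands are loaded in VNNI layout: pairs of f16 along the
      // reduction dimension are folded into the innermost axis, so the 8x16
      // A tile is read as 8x8x2 and the 16x16 B tile as 8x16x2, matching the
      // operand shapes xegpu.dpas expects below.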
      // do DPAS
      %val4 = xegpu.dpas %val0, %val2 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
      // extract the second 8x8 block (columns 8..15) of the result
      %val5 = vector.extract_strided_slice %val4 {offsets = [0, 8], sizes = [8, 8], strides = [1, 1]} : vector<8x16xf32> to vector<8x8xf32>
      %cst_8x8_flat = arith.constant dense<1.0> : vector<64xf32>
      %cst_8x8 = vector.shape_cast %cst_8x8_flat : vector<64xf32> to vector<8x8xf32>
      // interleave the rows of %val5 (the second 8x8 of the result) with the
      // rows of the constant-1 vector %cst_8x8
      %val6 = vector.shuffle %val5, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32>
      %val7 = vector.shape_cast %val6 : vector<16x8xf32> to vector<8x16xf32>
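      // After the shape cast each row of %val7 is [8x8 block from columns
      // 8..15 of A*B | eight 1.0s]; the CPU reference in @main reproduces
      // this layout.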
      // store
      %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] { mode = vc } : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
      xegpu.store_nd %val7, %out_tile { mode = vc } : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
      gpu.return
    }
  }
  func.func @main() attributes {llvm.emit_c_interface} {
    // init constants
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c1_f32 = arith.constant 1.0 : f32
    // random init
    %lower = arith.constant -1.0 : f32
    %upper = arith.constant 1.0 : f32
    %false = arith.constant 0 : i1
    %A = memref.alloc() : memref<8x16xf16>
    %B = memref.alloc() : memref<16x16xf16>
    %Out_cpu = memref.alloc() : memref<8x16xf32>
    %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16>
    %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16>
    call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> ()
    call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> ()
    // run GPU version
    %Out_gpu = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32>
    %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32>
    // run CPU version
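    // Compute columns 8..15 of A*B with scalar loops, store them into
    // columns 0..7 of %Out_cpu, and set columns 8..15 to 1.0 to mirror the
    // GPU kernel's shuffle with the constant-1 vector.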
    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c8 to %c16 step %c1 {
        %v0_init = arith.constant 0.0 : f32
        %result = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 {
          %a0 = memref.load %A[%i, %k] : memref<8x16xf16>
          %b0 = memref.load %B[%k, %j] : memref<16x16xf16>
          %a0_f32 = arith.extf %a0 : f16 to f32
          %b0_f32 = arith.extf %b0 : f16 to f32
          %t0 = arith.mulf %a0_f32, %b0_f32 : f32
          %v0_new = arith.addf %v0, %t0 : f32
          scf.yield %v0_new : f32
        }
        // place the dot product for column j at column j-8; column j itself
        // gets the constant 1.0, matching the GPU output layout
        %shifted_j = arith.subi %j, %c8 : index
        memref.store %result, %Out_cpu[%i, %shifted_j] : memref<8x16xf32>
        memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x16xf32>
      }
    }
    %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32>
    // print GPU and CPU outs
    // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> ()
    // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> ()
    // CHECK: [ALLCLOSE: TRUE]
    call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> ()
    // dealloc
    memref.dealloc %A : memref<8x16xf16>
    memref.dealloc %B : memref<16x16xf16>
    memref.dealloc %Out_cpu : memref<8x16xf32>
    // gpu dealloc
    gpu.dealloc %Out_gpu : memref<8x16xf32>
    return
  }
  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
}