Update create_nd_descriptor base address for 1d tile (#832)

Garra1980 · web-flow · commit 14bfdbadd139 · 2024-08-14T16:13:01.000-05:00
commit 80a415a531800b1935b6b5e33b82d3fc5cb45b63
Author: Gune <gsingh@habana.ai>
Date:   Thu Aug 1 15:17:36 2024 +0530

    Update create_nd_descriptor base address for 1d tile

    The base address before this fix assumes that the tile will be always
    2d. For a 1d tile, the base address needs to be adjusted similarly.
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -97,43 +97,91 @@ static func::CallOp createFuncCall(PatternRewriter &rewriter, Location loc,
   return rewriter.create<func::CallOp>(loc, fn, resultType, operands);
 }
 
-// Given an n-dim memref, a tensor descriptor defines a 2d memory region with
-// respect to the two inner-most dimensions. Other outer dimensions affect the
-// base address. For example, given
+// Given an n-dim memref, a tensor descriptor with tile rank of 2 defines a 2d
+// memory region with respect to the two inner-most dimensions. Other outer
+// dimensions affect the base address of the 2d plane.
+// For 2d, we compute the base address of 2d plane, assuming the coordinates
+// [0, 0] for the innermost 2 dimensions. The payload will record tile offset
+// within the 2d plane in a separate field.
+// For example, given
 //   %m: memref<2x7x32x64xf16>
 // And this access
 //   %m[%a, %b, %c, %d]
+//
 // The base address will be adjusted as follows:
-//   new_base = base(%m) + %b * (32*64*2) + %a * (7*32*64*2)
+//   base address of plane for 2d tile = base(%m) + %b * (32*64*2) + %a *
+//                                       (7*32*64*2)
 // 2 is the number of bytes of the element type.
+//
+// For 1d, we compute the base address of the 1d tile, not the plane.
+// So the tile offset is also added to the base address.
+//
+// For tile rank of 1, the base address will be adjusted as:
+//   base address of tile for 1d tile = base(%m) + %d * (2) + %c * (64*2) +
+//                                      %b * (32*64*2) + %a * (7*32*64*2)
+
 static Value adjustBasePointer(ConversionPatternRewriter &rewriter,
-                               xegpu::CreateNdDescOp op, Value base) {
+                               xegpu::CreateNdDescOp op, Value memrefBaseAddr) {
+  auto memType = dyn_cast<MemRefType>(op.getSource().getType());
+
+  // FIXME: Only support static shape for now
+  if (!memType || !memType.hasStaticShape())
+    return memrefBaseAddr;
+
   auto loc = op.getLoc();
 
   auto createIndexConstant = [&](unsigned index) {
     return rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexType(),
                                               rewriter.getIndexAttr(index));
   };
 
-  if (auto memType = dyn_cast<MemRefType>(op.getSource().getType());
-      memType && memType.getRank() > 2) {
-    assert(memType.hasStaticShape() && "only support static shape for now");
-    auto shape = memType.getShape();
-    int64_t i = memType.getRank() - 1;
-    unsigned stride = memType.getElementType().getIntOrFloatBitWidth() / 8;
-    stride *= shape[i--];
-    stride *= shape[i--];
-    auto offsets = op.getMixedOffsets();
+  auto tileRank = op.getTensorDesc().getType().getRank();
+  auto offsets = op.getMixedOffsets();
+  auto strides = mlir::getStridesAndOffset(memType).first;
+  int64_t i = memType.getRank() - 1;
+
+  auto computeBase =
+      [&](Value base) {
+        for (; i >= 0; --i) {
+          unsigned stride =
+              strides[i] * memType.getElementType().getIntOrFloatBitWidth() / 8;
+          auto factor = createIndexConstant(stride);
+          auto offset = offsets.pop_back_val();
+          Value offsetVal;
+
+          if (offset.is<Value>()) {
+            offsetVal = offset.get<Value>();
+          } else {
+            offsetVal = createIndexConstant(
+                llvm::cast<IntegerAttr>(offset.get<Attribute>()).getInt());
+          }
+          auto linearOffset =
+              rewriter.create<arith::MulIOp>(loc, offsetVal, factor);
+          base = rewriter.create<arith::AddIOp>(loc, base, linearOffset);
+        }
+
+        return base;
+      };
+
+  if (tileRank == 2 && memType.getRank() > 2) {
+    // base address of plane for 2d: base addr of memref + offsets (starting
+    // from j to i) for a given memref<ixjxkxlxf16>
+
+    i -= 2;
     offsets.pop_back_n(2);
-    for (; i >= 0; --i) {
-      auto factor = createIndexConstant(stride);
-      auto linearOffset = rewriter.create<arith::MulIOp>(
-          loc, offsets.pop_back_val().get<Value>(), factor);
-      base = rewriter.create<arith::AddIOp>(loc, base, linearOffset);
-      stride *= shape[i];
-    }
+
+    auto baseOf2dPlane = computeBase(memrefBaseAddr);
+    return baseOf2dPlane;
+  }
+
+  if (tileRank == 1) {
+    // base address of tile for 1d: base addr of memref + offsets (starting from
+    // l to i) for a given memref<ixjxkxlxf16>
+    auto baseOf1dTile = computeBase(memrefBaseAddr);
+    return baseOf1dTile;
   }
-  return base;
+
+  return memrefBaseAddr;
 }
 
 struct CreateNdDescPattern
diff --git a/lib/ExecutionEngine/ImexRunnerUtils.cpp b/lib/ExecutionEngine/ImexRunnerUtils.cpp
@@ -88,6 +88,14 @@ _mlir_ciface_fillResource1DRandomF16(UnrankedMemRefType<f16> *ptr,
   _mlir_ciface_fillResource1DRandom(ptr, lower, upper, genInt);
 }
 
+/// Fills 1D memref of f32 type with random values uniformly
+extern "C" void
+_mlir_ciface_fillResource1DRandomF32(UnrankedMemRefType<float> *ptr,
+                                     const float lower, const float upper,
+                                     const bool genInt) {
+  _mlir_ciface_fillResource1DRandom(ptr, lower, upper, genInt);
+}
+
 extern "C" void _mlir_ciface_printMemrefBF16(UnrankedMemRefType<bf16> *M) {
   _mlir_ciface_printMemref(M);
 }
diff --git a/test/Conversion/XeGPUToVC/create_nd_desc.mlir b/test/Conversion/XeGPUToVC/create_nd_desc.mlir
@@ -98,5 +98,24 @@ module @gemm attributes {gpu.container_module} {
       //CHECK: gpu.return
       gpu.return
     }
+
+    // CHECK: gpu.func @test_create_nd_tdesc_4(%[[arg0:.*]]: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+    gpu.func @test_create_nd_tdesc_4(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      //CHECK: %c1 = arith.constant 1 : index
+      %c1 = arith.constant 1 : index
+
+      //CHECK: %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<8x16xf16> -> index
+      //CHECK: %c2 = arith.constant 2 : index
+      //CHECK: %0 = arith.muli %c1, %c2 : index
+      //CHECK: %1 = arith.addi %intptr, %0 : index
+      //CHECK: %c32 = arith.constant 32 : index
+      //CHECK: %2 = arith.muli %c1, %c32 : index
+      //CHECK: %3 = arith.addi %1, %2 : index
+      //CHECK: %4 = arith.index_castui %3 : index to i64
+      //CHECK: %5 = vector.insert %4, %cst [0] : i64 into vector<4xi64>
+      %0 = xegpu.create_nd_tdesc %arg0[%c1, %c1] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+      //CHECK: gpu.return
+      gpu.return
+    }
   }
 }
diff --git a/test/Conversion/XeGPUToVC/xegpu-to-vc.mlir b/test/Conversion/XeGPUToVC/xegpu-to-vc.mlir
@@ -22,7 +22,11 @@ module @gemm attributes {gpu.container_module} {
 
       // CHECK: %[[A_STRUCT:.*]] = arith.constant dense<0> : vector<4xi64>
       // CHECK: %[[A_BASEPTR:.*]] = memref.extract_aligned_pointer_as_index {{.*}} : memref<128xf16> -> index
-      // CHECK: %[[A_BASEADDR:.*]] = arith.index_castui %[[A_BASEPTR]] : index to i64
+      // CHECK: %[[A_ELEMBYTES:.*]] = arith.constant 2 : index
+      // CHECK: %[[A_OFFSET:.*]] = arith.constant 0 : index
+      // CHECK: %[[A_STRIDE:.*]] = arith.muli %[[A_OFFSET]], %[[A_ELEMBYTES]] : index
+      // CHECK: %[[A_UPDATEDBASEPTR:.*]] = arith.addi %[[A_BASEPTR]], %[[A_STRIDE]] : index
+      // CHECK: %[[A_BASEADDR:.*]] = arith.index_castui %[[A_UPDATEDBASEPTR]] : index to i64
       // CHECK: %[[A_PAYLOAD_v4i64:.*]] = vector.insert %[[A_BASEADDR]], %[[A_STRUCT]] [0] : i64 into vector<4xi64>
       // CHECK: %[[A_PAYLOAD_v8i32:.*]] = vector.bitcast %[[A_PAYLOAD_v4i64]] : vector<4xi64> to vector<8xi32>
       %0 = xegpu.create_nd_tdesc %arg00[0] : memref<128xf16> -> !xegpu.tensor_desc<128xf16>
@@ -44,7 +48,11 @@ module @gemm attributes {gpu.container_module} {
 
       // CHECK: %[[C_STRUCT:.*]] = arith.constant dense<0> : vector<4xi64>
       // CHECK: %[[C_BASEPTR:.*]] = memref.extract_aligned_pointer_as_index {{.*}} : memref<128xf32> -> index
-      // CHECK: %[[C_BASE:.*]] = arith.index_castui %[[C_BASEPTR]] : index to i64
+      // CHECK: %[[C_ELEMBYTES:.*]] = arith.constant 4 : index
+      // CHECK: %[[C_OFFSET:.*]] = arith.constant 0 : index
+      // CHECK: %[[C_STRIDE:.*]] = arith.muli %[[C_OFFSET]], %[[C_ELEMBYTES]] : index
+      // CHECK: %[[C_UPDATEDBASEPTR:.*]] = arith.addi %[[C_BASEPTR]], %[[C_STRIDE]] : index
+      // CHECK: %[[C_BASE:.*]] = arith.index_castui %[[C_UPDATEDBASEPTR]] : index to i64
       // CHECK: %[[C_PAYLOAD:.*]] = vector.insert %[[C_BASE]], %[[C_STRUCT]] [0] : i64 into vector<4xi64>
       // CHECK: %[[C_PAYLOAD_v8i32:.*]] = vector.bitcast %[[C_PAYLOAD]] : vector<4xi64> to vector<8xi32>
       %2 = xegpu.create_nd_tdesc %arg02[0] : memref<128xf32> -> !xegpu.tensor_desc<128xf32>
diff --git a/test/Integration/Dialect/XeGPU/load_store_with_1d_tile.mlir b/test/Integration/Dialect/XeGPU/load_store_with_1d_tile.mlir
@@ -0,0 +1,81 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc-rawsend-false.pp \
+// RUN:                                       --runner imex-cpu-runner -e main \
+// RUN:                                       --entry-point-result=void \
+// RUN:                                       --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc-rawsend-false.pp \
+// RUN:                                        --runner imex-cpu-runner -e main \
+// RUN:                                        --entry-point-result=void \
+// RUN:                                        --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+module @gemm attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0>
+  func.func @test(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+
+    %memref = gpu.alloc  host_shared () : memref<8x16xf32>
+    memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32>
+    %memref_1 = gpu.alloc  host_shared () : memref<8x16xf32>
+    memref.copy %arg1, %memref_1 : memref<8x16xf32> to memref<8x16xf32>
+    %memref_2 = gpu.alloc  host_shared () : memref<8x16xf32>
+    gpu.launch_func  @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c8, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_1 : memref<8x16xf32>, %memref_2 : memref<8x16xf32>)
+    gpu.dealloc  %memref : memref<8x16xf32>
+    gpu.dealloc  %memref_1 : memref<8x16xf32>
+    return %memref_2 : memref<8x16xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %thread_id_x = gpu.thread_id x
+      cf.br ^bb1
+    ^bb1:
+      %0 = xegpu.create_nd_tdesc %arg1[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %2 = xegpu.create_nd_tdesc %arg0[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %3 = xegpu.load_nd %2  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %4 = arith.addf %3, %1 : vector<16xf32>
+      %5 = xegpu.create_nd_tdesc %arg2[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      xegpu.store_nd %4, %5  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c_gen_int = arith.constant 0 : i1
+    %cf_lower = arith.constant -0.5 : f32
+    %cf_upper = arith.constant 0.5 : f32
+
+    %A = memref.alloc() : memref<8x16xf32>
+    %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    %B = memref.alloc() : memref<8x16xf32>
+    %B_random = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%B_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    // calculate the result C matrix
+    %c16 = arith.constant 16 : index
+    %c8 = arith.constant 8 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ref = memref.alloc() : memref<8x16xf32>
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %a = memref.load %A[%i, %j] : memref<8x16xf32>
+        %b = memref.load %B[%i, %j] : memref<8x16xf32>
+        %c = arith.addf %a, %b : f32
+        memref.store %c, %ref[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C = call @test(%A, %B) : (memref<8x16xf32>, memref<8x16xf32>) -> memref<8x16xf32>
+
+    %C_cast = memref.cast %C : memref<8x16xf32> to memref<*xf32>
+    %ref_cast = memref.cast %ref : memref<8x16xf32> to memref<*xf32>
+    // call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%ref_cast, %C_cast) : (memref<*xf32>, memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+}
+
diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc-rawsend-false.pp b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc-rawsend-false.pp
@@ -0,0 +1,30 @@
+// gpu dialect with intel intrinsic functions (func dialect) to
+// llvm dialect (for host code) and
+// spirv dialect (for device code) lowering pipeline.
+// Ready for imex runner starting from GPU dialect.
+builtin.module(
+    imex-vector-linearize
+    gpu.module(convert-xegpu-to-vc{useRawSend=false})
+    reconcile-unrealized-casts
+    bf16-to-gpu
+    imex-convert-gpu-to-spirv
+    spirv.module(spirv-lower-abi-attrs
+             spirv-update-vce)
+    func.func(llvm-request-c-wrappers)
+    serialize-spirv
+    convert-vector-to-scf
+    convert-gpu-to-gpux
+    convert-scf-to-cf
+    convert-cf-to-llvm
+    convert-vector-to-llvm
+    convert-index-to-llvm
+    convert-arith-to-llvm
+    convert-func-to-llvm
+    convert-math-to-llvm
+    convert-gpux-to-llvm
+    convert-index-to-llvm
+    expand-strided-metadata
+    lower-affine
+    finalize-memref-to-llvm
+    reconcile-unrealized-casts)
+// End

Original file line number	Diff line number	Diff line change
`@@ -88,6 +88,14 @@ _mlir_ciface_fillResource1DRandomF16(UnrankedMemRefType<f16> *ptr,`
`88`	`88`	`_mlir_ciface_fillResource1DRandom(ptr, lower, upper, genInt);`
`89`	`89`	`}`
`90`	`90`
	`91`	`+/// Fills 1D memref of f32 type with random values uniformly`
	`92`	`+extern "C" void`
	`93`	`+_mlir_ciface_fillResource1DRandomF32(UnrankedMemRefType<float> *ptr,`
	`94`	`+ const float lower, const float upper,`
	`95`	`+ const bool genInt) {`
	`96`	`+ _mlir_ciface_fillResource1DRandom(ptr, lower, upper, genInt);`
	`97`	`+}`
	`98`	`+`
`91`	`99`	`extern "C" void _mlir_ciface_printMemrefBF16(UnrankedMemRefType<bf16> *M) {`
`92`	`100`	`_mlir_ciface_printMemref(M);`
`93`	`101`	`}`