Commit c858863

Add transpose_bit_width attribute to xegpu.load_nd (#693)

1 parent 6deb412

8 files changed: +85 -16 lines changed

include/imex/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 3 additions & 1 deletion

@@ -218,14 +218,16 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
       XeGPU_TensorDesc: $TensorDesc,
       OptionalAttr<I32Attr>: $vnni_axis,
       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+      OptionalAttr<I32Attr>: $transpose_bit_width,
       OptionalAttr<XeGPU_CacheReadAttr>: $l1_hint,
       OptionalAttr<XeGPU_CacheReadAttr>: $l2_hint,
       OptionalAttr<XeGPU_CacheReadAttr>: $l3_hint,
       DefaultValuedAttr<XeGPU_ModeAttr, "imex::xegpu::Mode::SIMT">: $mode);
+
   let results = (outs XeGPU_ValueType: $value);

   let extraClassDeclaration = [{
-    mlir::VectorType getValueType() {
+    mlir::VectorType getType() {
       return llvm::dyn_cast_if_present<mlir::VectorType>(getValue().getType());
     }
lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp

Lines changed: 7 additions & 6 deletions

@@ -661,12 +661,13 @@ class LoadStorePrefetchNdToRawSend : public OpConversionPattern<OpType> {
   auto extMsg = createIntConstant(i32Type, 0);
   auto dataSize2D = (encodeDataum(elmType) - 1);
   auto payLoad = adaptor.getTensorDesc();
-  // The vnni and transpose combination is required for the case where the B
-  // matrix is transposed and we need to load from B in DPAS layout. However,
-  // the HW does not support vnni and transpose together. We can get the same
-  // layout for the B load by doing the transpose at 32-bit granularity.
-  // TODO: Transpose granularity must be explicitly represented in the XeGPU op.
-  if (vnni && transpose) {
+
+  // TODO: currently limit transposeBitWidth to 32; it is an architecture
+  // feature, and 32 works on PVC but may not on FS. To support other bit
+  // widths, we cannot hardcode i32Type and need to generalize the logic.
+  auto loadOp = llvm::dyn_cast<LoadNDOp>(op.getOperation());
+  if (loadOp && transpose && loadOp.getTransposeBitWidth() == 32) {
     // in raw_send msg set vnni effect to false and update data size of
     // payload item to 32 bits
     vnni = false;
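The removed comment captures the motivation: loading a transposed B operand for DPAS needs both a VNNI transform and a transpose, but the hardware 2D block load cannot apply both at once; transposing at 32-bit granularity yields the same packed layout. A standalone C++ sketch of that equivalence (illustration only, not part of the patch; the matrix size and all names are made up):

// A 32-bit-granularity transpose of a row-major K x N f16 matrix B produces,
// byte for byte, the vnni_axis = 0 packing of B^T that DPAS expects for B.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const int K = 16, N = 16; // f16 elements modeled as uint16_t
  std::vector<uint16_t> B(K * N);
  for (int i = 0; i < K * N; ++i)
    B[i] = static_cast<uint16_t>(i);

  // vnni_axis = 0 packing of B^T: shape (N/2) x K x 2, where
  // vnni[n][k][i] = B^T[2n + i][k] = B[k][2n + i].
  std::vector<uint16_t> vnni(N * K);
  for (int n = 0; n < N / 2; ++n)
    for (int k = 0; k < K; ++k)
      for (int i = 0; i < 2; ++i)
        vnni[(n * K + k) * 2 + i] = B[k * N + 2 * n + i];

  // Transpose at 32-bit granularity: view each pair of f16 values as one
  // uint32_t, giving a K x (N/2) matrix, and move whole 32-bit items.
  std::vector<uint32_t> b32(K * N / 2), t32(K * N / 2);
  std::memcpy(b32.data(), B.data(), B.size() * sizeof(uint16_t));
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N / 2; ++n)
      t32[n * K + k] = b32[k * (N / 2) + n];

  // Identical layouts, so no separate vnni step is needed after the load.
  assert(std::memcmp(t32.data(), vnni.data(), N * K * 2) == 0);
  return 0;
}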

lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp

Lines changed: 3 additions & 2 deletions

@@ -440,6 +440,7 @@ struct SgLoadTileOpPattern
   auto L3 = xegpu::CacheReadHintAttr::get(ctx, xegpu::CacheReadHint::CACHED);

   mlir::IntegerAttr vnniAttr;
+  mlir::IntegerAttr transposeBitWidthAttr;
   // TODO: move these into an architecture abstraction in the future.
   const int SIMD_WIDTH_IN_BITS = 32;
   int factor = SIMD_WIDTH_IN_BITS / elemTy.getIntOrFloatBitWidth();

@@ -471,8 +472,8 @@

   auto vectorTy = mlir::VectorType::get(shape, tileTy.getElementType());
   auto ldOp = rewriter.create<xegpu::LoadNDOp>(
-      op.getLoc(), vectorTy, src, vnniAttr, transposeAttr, L1, L2, L3,
-      imex::xegpu::Mode::VC);
+      op.getLoc(), vectorTy, src, vnniAttr, transposeAttr,
+      transposeBitWidthAttr, L1, L2, L3, imex::xegpu::Mode::VC);
   if (array_length == 1) {
     xegpuOps.push_back(ldOp);
   } else {

lib/Dialect/XeGPU/IR/XeGPUOps.cpp

Lines changed: 35 additions & 5 deletions

@@ -140,7 +140,8 @@ parseOptionalAttrDict(mlir::OpAsmParser &parser, mlir::OperationState &result,
     return parseCustomEnumAttr<Mode, ModeAttr>(parser, result, nameId);
   }

-  if (nameId == "chunk_size_per_lane" || nameId == "vnni_axis")
+  if (nameId == "chunk_size_per_lane" || nameId == "vnni_axis" ||
+      nameId == "transpose_bit_width")
     return parseBoolAndIntegerAttr<mlir::IntegerAttr>(parser, result, nameId);

   if (nameId == "boundary_check")

@@ -727,9 +728,10 @@ mlir::ParseResult LoadNDOp::parse(mlir::OpAsmParser &parser,
   if (parser.parseOperand(TensorDescRawOperands[0]))
     return mlir::failure();

-  if (parseOptionalAttrDict(
-          parser, result,
-          {"mode", "vnni_axis", "transpose", "l1_hint", "l2_hint", "l3_hint"}))
+  if (parseOptionalAttrDict(parser, result,
+                            {"mode", "vnni_axis", "transpose",
+                             "transpose_bit_width", "l1_hint", "l2_hint",
+                             "l3_hint"}))
     return mlir::failure();

   if (parser.parseColon())

@@ -789,6 +791,13 @@ void LoadNDOp::print(mlir::OpAsmPrinter &printer) {
     printSep = true;
   }

+  if (getTransposeBitWidthAttr()) {
+    if (printSep)
+      printer << "," << ' ';
+    printer << "transpose_bit_width = " << getTransposeBitWidth().value();
+    printSep = true;
+  }
+
   printCacheHintAttrs<LoadNDOp>(printer, *this, printSep);

   if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) {

@@ -805,7 +814,7 @@

 mlir::LogicalResult LoadNDOp::verify() {
   auto tdescTy = getTensorDescType();
-  auto valueTy = getValueType();
+  auto valueTy = getType();

   if (tdescTy.getRank() > 2)
     return emitOpError(

@@ -845,6 +854,17 @@ mlir::LogicalResult LoadNDOp::verify() {
     }
   }

+  // TODO: remove the following two checks once there is a verifier
+  // against an architecture for handwritten code.
+  if (getTranspose() == llvm::ArrayRef<int64_t>({1, 0}) && getVnniAxis() == 0) {
+    return emitOpError("Transpose and VNNI are mutually exclusive.");
+  }
+
+  if (getVnniAxis() == 0 && getTransposeBitWidth()) {
+    return emitOpError("TransposeBitWidth and VNNI are mutually exclusive. "
+                       "TransposeBitWidth implies a VNNI transform on axis 0.");
+  }
+
   if (getTranspose()) {
     auto trans = getTranspose().value();
     if (tdescShape.size() >= trans.size())

@@ -860,6 +880,16 @@
     tdescShape.push_back(vnni_factor);
   }

+  if (getTransposeBitWidth()) {
+    auto bitWidth = getTransposeBitWidth().value();
+    if (bitWidth != 32)
+      return emitOpError("Invalid bit width for transpose.");
+    auto vnni_factor = valueShape.back();
+    // transpose_bit_width implies a vnni transform on axis 0
+    tdescShape[0] /= vnni_factor;
+    tdescShape.push_back(vnni_factor);
+  }
+
   if (array_len > 1) {
     auto it = tdescShape.begin();
     tdescShape.insert(it, array_len);
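To make the new verifier rule concrete: tensor_desc<8x32xf16> with transpose = [1, 0] and transpose_bit_width = 32 must produce vector<16x8x2xf16>. Transposing gives 32x8; packing f16 pairs into 32-bit units halves axis 0 to 16 and appends an innermost dimension of 2. A minimal C++ sketch of this shape computation (expectedLoadShape is a hypothetical helper written for illustration; the real logic lives inline in LoadNDOp::verify() above):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<int64_t> expectedLoadShape(std::vector<int64_t> tdescShape,
                                       int64_t vnniFactor) {
  std::swap(tdescShape[0], tdescShape[1]); // transpose = [1, 0]
  tdescShape[0] /= vnniFactor;             // transpose_bit_width = 32 packs
  tdescShape.push_back(vnniFactor);        // f16 pairs along axis 0
  return tdescShape;
}

int main() {
  // tensor_desc<8x32xf16>: two f16 values fit in 32 bits, so vnniFactor = 2.
  auto shape = expectedLoadShape({8, 32}, 2);
  assert((shape == std::vector<int64_t>{16, 8, 2})); // vector<16x8x2xf16>
  return 0;
}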

lib/Utils/XeArch.cpp

Lines changed: 8 additions & 1 deletion

@@ -307,7 +307,14 @@ mlir::LogicalResult XeuArchInterface::isLegalLoad2dOp(mlir::Operation *op) {

   LoadStore2DConfig loadParams;
   bool vnni = loadOp.getVnniAxis() == 0 ? true : false;
-  bool transpose = loadOp.getTranspose() ? true : false;
+  bool transpose =
+      loadOp.getTranspose() == llvm::ArrayRef<int64_t>({1, 0}) ? true : false;
+
+  if (vnni && transpose) {
+    return loadOp->emitOpError(
+        "Transpose and VNNI are mutually exclusive. They are "
+        "not supported by the PVC hardware at the same time.\n");
+  }

   mlir::FailureOr<LoadStore2DConfig> configParams =
       this->get2DLoadConfig(op, elementSize, vnni, transpose);

test/Dialect/XeGPU/IR/invalid_vc.mlir

Lines changed: 12 additions & 0 deletions

@@ -68,3 +68,15 @@ func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) {
     : !xegpu.tensor_desc<16x8xf16, #xegpu.scattered>, vector<16x8xi1> -> vector<8x8x4xf16>
   return
 }
+
+// -----
+func.func @test_load_nd(%input: memref<24x32xf16>) {
+  %c0 = arith.constant 0 : index
+  %1 = xegpu.create_nd_tdesc %input[%c0, %c0] {mode = vc}
+    : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+  // Hardware doesn't support VNNI transform and transpose at the same time.
+  // expected-error@+1 {{Transpose and VNNI are mutually exclusive.}}
+  %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, transpose = [1, 0]}
+    : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+  return
+}

test/Dialect/XeGPU/IR/load_nd_vc.mlir

Lines changed: 16 additions & 0 deletions

@@ -75,3 +75,19 @@ func.func @test_load_nd_block_array_simd_f16(%src: memref<8x32xf16>) {
     : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<array_length = 2>> -> vector<2x8x16xf16>
   return
 }
+
+
+// CHECK-LABEL: func @test_load_nd_transpose_bit_width_simd_f16({{.*}}) {
+func.func @test_load_nd_transpose_bit_width_simd_f16(%src: memref<8x32xf16>) {
+  // CHECK: xegpu.create_nd_tdesc
+  // CHECK-SAME: {mode = vc}
+  // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x32xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] {mode = vc} : memref<8x32xf16> -> !xegpu.tensor_desc<8x32xf16>
+
+  // CHECK: xegpu.load_nd
+  // CHECK-SAME: {mode = vc, transpose = [1, 0], transpose_bit_width = 32, l1_hint = cached, l2_hint = uncached}
+  // CHECK-SAME: !xegpu.tensor_desc<8x32xf16> -> vector<16x8x2xf16>
+  %2 = xegpu.load_nd %1 {mode = vc, transpose = [1, 0], transpose_bit_width = 32, l1_hint = cached, l2_hint = uncached}
+    : !xegpu.tensor_desc<8x32xf16> -> vector<16x8x2xf16>
+  return
+}

test/Integration/Dialect/XeGPU/gemm_with_transposed_B_1kx1kx1k_f16_f16_f32.mlir

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ module @gemm attributes {gpu.container_module} {
   %7 = xegpu.create_nd_tdesc %arg0[%2, %arg3] {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   %8 = xegpu.create_nd_tdesc %arg1[%3, %arg3] {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   %9 = xegpu.load_nd %7 {mode = vc, vnni_axis = 1}: !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
-  %10 = xegpu.load_nd %8 {mode = vc, vnni_axis = 0, transpose = [1, 0]} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+  %10 = xegpu.load_nd %8 {mode = vc, transpose_bit_width = 32, transpose = [1, 0]} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
   %11 = xegpu.dpas %9, %10, %arg4 {mode = vc} : vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32>
   scf.yield %11 : vector<8x16xf32>
 }
