Handle sub byte type in XeGPU lowering to 2D LSC. (#1061)

silee2 · web-flow · commit 187b4afe0e1e · 2025-04-09T12:21:47.000-07:00
diff --git a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
@@ -840,6 +840,31 @@ class LoadNdPattern : public OpConversionPattern<LoadNdOp> {
             op, "Only global access supported for block load.");
       auto payload = adaptor.getTensorDesc();
       auto retTy = op.getType();
+      auto bitWidth = elemTy.getIntOrFloatBitWidth();
+      if (bitWidth < 8) {
+        if (8 % bitWidth != 0)
+          return rewriter.notifyMatchFailure(
+              op, "Only sub byte type with bit-width 1, 2, 4, or 8 are "
+                  "supported for block load.");
+        auto subByteFactor = 8 / bitWidth;
+        // For supported sub byte type,
+        // fake element type to i8 and update elemTy, retTy and tdescTy
+        // accordingly. Add cast before and after intrinsic call to ensure the
+        // type matches the original type.
+        elemTy = rewriter.getI8Type();
+        auto shape = tdescTy.getShape().vec();
+        auto lastDim = shape.size() - 1;
+        if (shape[lastDim] % subByteFactor != 0) {
+          return rewriter.notifyMatchFailure(
+              op, "The last dimension but be a multiple of (8 / bitWidth) for "
+                  "sub byte types.");
+        }
+        shape[lastDim] = shape[lastDim] / subByteFactor;
+        tdescTy = TensorDescType::get(tdescTy.getContext(), shape, elemTy,
+                                      tdescTy.getEncoding(),
+                                      /*sg_map*/ nullptr);
+        retTy = VectorType::get(tdescTy.getShape(), elemTy);
+      }
 
       // TODO: remove this after moving transposeBitWidth into a standalone
       // pass. update the width and pictch of the payload when transposeBitWidth
@@ -908,6 +933,8 @@ class LoadNdPattern : public OpConversionPattern<LoadNdOp> {
 
       // TODO: remove this after moving transposeBitWidth into a standalone
       // pass.
+      // NOTE: sub byte type handling also needs the bitcast to the original
+      // type after the intrinsic call.
       if (retTy != op.getType()) {
         auto targetTy = convertVectorType(op.getType()).second;
         callOp = rewriter.create<vector::BitCastOp>(loc, targetTy, callOp);
@@ -959,6 +986,31 @@ class PrefetchNdPattern : public OpConversionPattern<PrefetchNdOp> {
       if (scope != xegpu::MemorySpace::Global)
         return rewriter.notifyMatchFailure(
             op, "Only global access supported for block prefetch.");
+      auto elemTy = tdescTy.getElementType();
+      auto bitWidth = elemTy.getIntOrFloatBitWidth();
+      if (bitWidth < 8) {
+        if (8 % bitWidth != 0)
+          return rewriter.notifyMatchFailure(
+              op, "Only sub byte type with bit-width 1, 2, 4, or 8 are "
+                  "supported for block prefetch.");
+        auto subByteFactor = 8 / bitWidth;
+        // For supported sub byte type,
+        // fake element type to i8 and update elemTy and tdescTy
+        // accordingly. Prefetch has no result or data operand, so no cast
+        // is needed around the intrinsic call.
+        elemTy = rewriter.getI8Type();
+        auto shape = tdescTy.getShape().vec();
+        auto lastDim = shape.size() - 1;
+        if (shape[lastDim] % subByteFactor != 0) {
+          return rewriter.notifyMatchFailure(
+              op, "The last dimension but be a multiple of (8 / bitWidth) for "
+                  "sub byte types.");
+        }
+        shape[lastDim] = shape[lastDim] / subByteFactor;
+        tdescTy = TensorDescType::get(tdescTy.getContext(), shape, elemTy,
+                                      tdescTy.getEncoding(),
+                                      /*sg_map*/ nullptr);
+      }
       auto callOp = gen2DPrefetchIntrinsicCall(
           rewriter, loc, l1hint, l3hint, tdescTy, adaptor.getTensorDesc());
       rewriter.replaceOp(op, callOp);
@@ -1010,7 +1062,33 @@ class StoreNdPattern : public OpConversionPattern<StoreNdOp> {
       if (scope != xegpu::MemorySpace::Global)
         return rewriter.notifyMatchFailure(
             op, "Only global access supported for block store.");
-
+      auto elemTy = tdescTy.getElementType();
+      auto bitWidth = elemTy.getIntOrFloatBitWidth();
+      if (bitWidth < 8) {
+        if (8 % bitWidth != 0)
+          return rewriter.notifyMatchFailure(
+              op, "Only sub byte type with bit-width 1, 2, 4, or 8 are "
+                  "supported for block store.");
+        auto subByteFactor = 8 / bitWidth;
+        // For supported sub byte type,
+        // fake element type to i8 and update elemTy, tdescTy and data
+        // accordingly. Bitcast the data to i8 before the intrinsic call so its
+        // type matches the faked tensor descriptor type.
+        elemTy = rewriter.getI8Type();
+        auto shape = tdescTy.getShape().vec();
+        auto lastDim = shape.size() - 1;
+        if (shape[lastDim] % subByteFactor != 0) {
+          return rewriter.notifyMatchFailure(
+              op, "The last dimension but be a multiple of (8 / bitWidth) for "
+                  "sub byte types.");
+        }
+        shape[lastDim] = shape[lastDim] / subByteFactor;
+        tdescTy = TensorDescType::get(tdescTy.getContext(), shape, elemTy,
+                                      tdescTy.getEncoding(),
+                                      /*sg_map*/ nullptr);
+        auto dataTy = VectorType::get({tdescTy.getNumElements()}, elemTy);
+        data = rewriter.create<vector::BitCastOp>(loc, dataTy, data);
+      }
       auto callOp =
           gen2DStoreIntrinsicCall(rewriter, loc, l1hint, l3hint, tdescTy,
                                   adaptor.getTensorDesc(), data);
diff --git a/test/Conversion/XeGPUToVC/load_nd.mlir b/test/Conversion/XeGPUToVC/load_nd.mlir
@@ -35,6 +35,18 @@ module @gemm attributes {gpu.container_module} {
       gpu.return
     }
 
+    // CHECK: gpu.func @test_load_nd_subbyte(%[[arg0:.*]]: memref<8x256xi1>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+    gpu.func @test_load_nd_subbyte(%arg0: memref<8x256xi1>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x256xi1> -> !xegpu.tensor_desc<8x256xi1>
+      // CHECK: %[[V10:.*]] = func.call @llvm.genx.lsc.load.2d.ugm.desc.v256i8.v2i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) : (i1, vector<2xi8>, i8, i16, i16, vector<16xi32>, i32, i32, vector<256xi8>) -> vector<256xi8>
+      // CHECK: %[[V11:.*]] = vector.bitcast %[[V10]] : vector<256xi8> to vector<2048xi1>
+      // CHECK: %[[V12:.*]] = vector.shape_cast %[[V11]] : vector<2048xi1> to vector<8x256xi1>
+      %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x256xi1> -> vector<8x256xi1>
+      %cst0 = arith.constant 0 : index
+      vector.store %1, %arg0[%cst0, %cst0] : memref<8x256xi1>, vector<8x256xi1>
+      gpu.return
+    }
+
     // CHECK: gpu.func @test_load_nd_1(%[[arg0:.*]]: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     gpu.func @test_load_nd_1(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
       //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %{{.*}} : memref<8x16xf16> -> index
diff --git a/test/Conversion/XeGPUToVC/prefetchnd.mlir b/test/Conversion/XeGPUToVC/prefetchnd.mlir
@@ -116,3 +116,15 @@ module @two_type attributes {gpu.container_module} {
     }
   }
 }
+
+// -----
+module @subbyte attributes {gpu.container_module} {
+  gpu.module @test_kernel {
+    gpu.func @test_prefetch(%arg0: memref<8x256xi1>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x256xi1> -> !xegpu.tensor_desc<8x256xi1>
+      // CHECK: func.call @llvm.genx.lsc.prefetch.2d.ugm.desc.v2i8.i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) : (i1, vector<2xi8>, i8, i16, i16, vector<16xi32>, i32, i32, i8) -> ()
+      xegpu.prefetch_nd %0 : !xegpu.tensor_desc<8x256xi1>
+      gpu.return
+    }
+  }
+}
diff --git a/test/Conversion/XeGPUToVC/store_nd.mlir b/test/Conversion/XeGPUToVC/store_nd.mlir
@@ -38,6 +38,15 @@ module @gemm attributes {gpu.container_module} {
       gpu.return
     }
 
+    gpu.func @test_store_nd_subbyte(%arg0: memref<8x256xi1>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c = arith.constant dense<1> : vector<8x256xi1>
+      %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x256xi1> -> !xegpu.tensor_desc<8x256xi1>
+      // CHECK: %[[V10:.*]] = vector.bitcast {{.*}} : vector<2048xi1> to vector<256xi8>
+      // CHECK: func.call @llvm.genx.lsc.store.2d.ugm.desc.v2i8.v256i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[V10]]) : (i1, vector<2xi8>, i8, i16, i16, vector<16xi32>, i32, i32, vector<256xi8>) -> ()
+      xegpu.store_nd %c, %0 : vector<8x256xi1>, !xegpu.tensor_desc<8x256xi1>
+      gpu.return
+    }
+
     // CHECK: gpu.func @test_store_nd_1d_strided_memref(%[[arg0:.*]]: memref<32x32xf32, strided<[64, 1]>>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     gpu.func @test_store_nd_1d_strided_memref(%arg0: memref<32x32xf32, strided<[64,1], offset: 0>>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
 

Original file line number	Diff line number	Diff line change
`@@ -116,3 +116,15 @@ module @two_type attributes {gpu.container_module} {`
`116`	`116`	`}`
`117`	`117`	`}`
`118`	`118`	`}`
	`119`	`+`
	`120`	`+// -----`
	`121`	`+module @subbyte attributes {gpu.container_module} {`
	`122`	`+ gpu.module @test_kernel {`
	`123`	`+ gpu.func @test_prefetch(%arg0: memref<8x256xi1>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {`
	`124`	`+ %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x256xi1> -> !xegpu.tensor_desc<8x256xi1>`
	`125`	`+ // CHECK: func.call @llvm.genx.lsc.prefetch.2d.ugm.desc.v2i8.i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) : (i1, vector<2xi8>, i8, i16, i16, vector<16xi32>, i32, i32, i8) -> ()`
	`126`	`+ xegpu.prefetch_nd %0 : !xegpu.tensor_desc<8x256xi1>`
	`127`	`+ gpu.return`
	`128`	`+ }`
	`129`	`+ }`
	`130`	`+}`