Commit 7eabad4

save work
1 parent 2da2c6d commit 7eabad4

2 files changed: +99 -0 lines changed

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 3 additions & 0 deletions

@@ -827,6 +827,9 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into the yield op of
+/// an enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the
+/// op outside of the warp op.
 struct MemrefExtractAlignedPointerAsIndexDistribution final
     : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
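
For illustration only (not part of the commit): a rough before/after sketch of what "moving the op outside of the warp op" looks like, inferred from the doc comment and the new test below. The %laneid/%src names and the warp width of 16 are made up, and the exact rewrite mechanics may differ from the actual pattern.

// Before: the pointer extraction sits inside the warp region and feeds its yield.
%0 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
  %ptr = memref.extract_aligned_pointer_as_index %src : memref<256x256xf16> -> index
  gpu.yield %ptr : index
}

// After: the warp op yields the uniform memref instead, and the extraction is
// re-created outside the region on the yielded value.
%0 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (memref<256x256xf16>) {
  gpu.yield %src : memref<256x256xf16>
}
%ptr = memref.extract_aligned_pointer_as_index %0 : memref<256x256xf16> -> index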

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 96 additions & 0 deletions

@@ -319,3 +319,99 @@ gpu.module @test {
     gpu.return
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
+// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index
+gpu.module @test {
+  gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf16>
+    %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
+    %ptr_i64 = arith.index_cast %ptr : index to i64
+    %tdesc = xegpu.create_nd_tdesc %ptr_i64[%c0], shape: [16], strides: [16] : i64
+      -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    xegpu.store_nd %cst, %tdesc : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    gpu.return
+  }
+}
+
+
+// -----
+// CHECK-LABEL: gpu.func @vector_transpose(
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32>
+// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32>
+gpu.module @test {
+  gpu.func @vector_transpose(%arg0: memref<2x16xf32>) {
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} dense<1.000000e+00>
+      : vector<16x2xf32>
+    %c0 = arith.constant 0 : index
+    %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<16x2xf32> to vector<2x16xf32>
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<2x16xf32>
+      -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %transpose, %0 : vector<2x16xf32>,
+      !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_bitcast(
+// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16>
+// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16>
+// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16>
+// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16>
+gpu.module @test {
+  gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) {
+    %cst = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+      : () -> (vector<4x32xi8>)
+    %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<4x32xi8> to vector<4x16xi16>
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x16xi16>
+      -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %bitcast, %0 : vector<4x16xi16>,
+      !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @mma_transpose_b(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK-NEXT: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-NEXT: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
+// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
+// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
+// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
+// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+gpu.module @test {
+  gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16>
+      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32>
+      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+    %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
+    %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+      : vector<16x8xi32> to vector<16x16xf16>
+    %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : vector<16x16xf16> to vector<16x16xf16>
+    %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+    %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32>
+      -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %6, %7 : vector<8x16xf32>,
+      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+
+  }
+}
