@@ -319,3 +319,99 @@ gpu.module @test {
319319 gpu.return
320320 }
321321}
322+
// -----
// Test case: a raw pointer is obtained from %arg0 via
// memref.extract_aligned_pointer_as_index, cast to i64, and used to build a
// 1-D tensor descriptor (shape [16], strides [16]) into which a 16-element
// f16 splat constant is stored. The directives below only require that the
// extract_aligned_pointer_as_index op still appears in the output with its
// memref<256x256xf16> operand type and index result.
// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index
gpu.module @test {
  gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) {
    %c0 = arith.constant 0 : index
    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf16>
    %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
    %ptr_i64 = arith.index_cast %ptr : index to i64
    %tdesc = xegpu.create_nd_tdesc %ptr_i64[%c0], shape : [16], strides : [16] : i64
      -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
    xegpu.store_nd %cst, %tdesc : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
    gpu.return
  }
}
338+
339+
// -----
// Test case: a 16x2 f32 splat constant annotated with lane_layout [16, 1] is
// transposed to 2x16 (result annotated with lane_layout [1, 16]) and stored
// through a 2x16 tensor descriptor. The expected output contains no
// vector.transpose between the constant and the store: the constant appears
// directly as vector<2xf32> and is the value passed to xegpu.store_nd, and
// the tensor descriptor type no longer carries a layout attribute.
// CHECK-LABEL: gpu.func @vector_transpose(
// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32>
// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32>
// CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32>
gpu.module @test {
  gpu.func @vector_transpose(%arg0: memref<2x16xf32>) {
    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} dense<1.000000e+00>
      : vector<16x2xf32>
    %c0 = arith.constant 0 : index
    %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
      : vector<16x2xf32> to vector<2x16xf32>
    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<2x16xf32>
      -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    xegpu.store_nd %transpose, %0 : vector<2x16xf32>,
      !xegpu.tensor_desc<2x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    gpu.return
  }
}
359+
// -----
// Test case: an unregistered op produces a 4x32 i8 vector annotated with
// lane_data [1, 2]; it is bitcast to 4x16 i16 (lane_data [1, 1]) and stored
// through a 4x16 tensor descriptor. The expected output bitcasts a
// vector<4x2xi8> to vector<4x1xi16>, reshapes that to vector<4xi16> with
// vector.shape_cast, and stores the result.
// CHECK-LABEL: gpu.func @vector_bitcast(
// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16>
// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16>
// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16>
// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16>
gpu.module @test {
  gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) {
    // Generic-form placeholder op; only its result type and layout attribute
    // matter to the directives above (the producer is matched by a wildcard).
    %cst = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
      : () -> (vector<4x32xi8>)
    %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
      : vector<4x32xi8> to vector<4x16xi16>
    %c0 = arith.constant 0 : index
    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x16xi16>
      -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    xegpu.store_nd %bitcast, %0 : vector<4x16xi16>,
      !xegpu.tensor_desc<4x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    gpu.return
  }
}
380+
// -----
// Test case: the A operand is loaded as 8x16 f16. The B operand is loaded as
// 16x8 i32 (lane_layout [16, 1]), bitcast to 16x16 f16, and transposed before
// being fed to xegpu.dpas; the 8x16 f32 result is stored to %arg2.
// The expected output has a transpose attribute (array<i64: 1, 0>) on the B
// load_nd, followed by a shape_cast / bitcast / shape_cast chain that turns
// vector<8xi32> into vector<16xf16>, and a dpas on
// vector<8xf16> x vector<16xf16> -> vector<8xf32>.
// CHECK-LABEL: gpu.func @mma_transpose_b(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
// CHECK-NEXT: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
// CHECK-NEXT: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
gpu.module @test {
  gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
    %c0 = arith.constant 0 : index
    // A operand: 8x16 f16 tile.
    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16>
      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
      : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
    // B operand: loaded as 16x8 i32, then reinterpreted as 16x16 f16 and transposed.
    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32>
      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
    %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
      : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
    %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
      : vector<16x8xi32> to vector<16x16xf16>
    %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
      : vector<16x16xf16> to vector<16x16xf16>
    %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
      : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
    // Store the 8x16 f32 accumulator to the output buffer.
    %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32>
      -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    xegpu.store_nd %6, %7 : vector<8x16xf32>,
      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    gpu.return
  }
}
0 commit comments