diff --git a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td
index 2dd612139fa2d..388efaaa25117 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td
@@ -463,10 +463,9 @@ def XeVM_PrefetchOp
 def XeVM_BlockPrefetch2dOp
     : XeVM_Op<"blockprefetch2d">,
-      Arguments<(ins Arg<LLVM_AnyPointer, "", [MemRead]>:$ptr, I32:$base_width,
-          I32:$base_height, I32:$base_pitch, I32:$x, I32:$y,
-          I32Attr:$elem_size_in_bits, I32Attr:$tile_width, I32Attr:$tile_height,
-          I32Attr:$v_blocks,
+      Arguments<(ins LLVM_AnyPointer:$ptr, I32:$base_width, I32:$base_height,
+          I32:$base_pitch, I32:$x, I32:$y, I32Attr:$elem_size_in_bits,
+          I32Attr:$tile_width, I32Attr:$tile_height, I32Attr:$v_blocks,
           OptionalAttr<XeVM_LoadCacheControlAttr>:$cache_control)> {
   let summary = "2D block prefetch";
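Note: a minimal sketch of the op's assembly after this signature change, with illustrative SSA names and the optional `cache_control` attribute omitted (the syntax mirrors the CHECK lines in prefetch_nd.mlir below; the removed `Arg<...>` wrapper is reconstructed above under the assumption it carried a MemRead annotation):

```mlir
// %ptr may now be any LLVM pointer type, e.g. the address-space-1
// pointer !llvm.ptr<1> produced by the XeGPU-to-XeVM lowering.
xevm.blockprefetch2d %ptr, %base_width, %base_height, %base_pitch, %x, %y
    <{elem_size_in_bits = 32 : i32, tile_width = 16 : i32,
      tile_height = 8 : i32, v_blocks = 1 : i32}>
    : (!llvm.ptr<1>, i32, i32, i32, i32, i32)
```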
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir
index 9c552d849c12c..d606cf51435dc 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir
@@ -1,15 +1,16 @@
-// RUN: mlir-opt %s --split-input-file -convert-xegpu-to-xevm | FileCheck %s
+// RUN: mlir-opt %s --split-input-file -convert-xegpu-to-xevm -canonicalize | FileCheck %s
 
 gpu.module @test {
 // CHECK-LABEL: @load_gather_i64_src_value_offset
-// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>
-gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
+// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>, %[[ARG2:.*]]: memref<1xf16>
+// CHECK-SAME: %[[ARG3:.*]]: vector<1xi1>
+gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>, %dst: memref<1xf16>, %mask: vector<1xi1>) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f16
+  // CHECK: %[[C2_I64:.*]] = arith.constant 2 : i64
+  // CHECK: %[[VAR2:.*]] = vector.extract %[[ARG3]][0] : i1 from vector<1xi1>
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
   // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
-  // CHECK: %[[CST:.*]] = arith.constant dense<true> : vector<1xi1>
-  // CHECK: %[[VAR2:.*]] = vector.extract %[[CST]][0] : i1 from vector<1xi1>
-  %1 = arith.constant dense<1>: vector<1xi1>
-  // CHECK: %[[C2_I64:.*]] = arith.constant 2 : i64
   // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C2_I64]] : i64
   // CHECK: %[[VAR4:.*]] = arith.addi %[[ARG0]], %[[VAR3]] : i64
   // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR4]] : i64 to !llvm.ptr<1>
@@ -17,11 +18,12 @@ gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>)
   // CHECK: %[[VAR7:.*]] = llvm.load %[[VAR5]] {cache_control = #xevm.load_cache_control} : !llvm.ptr<1> -> f16
   // CHECK: scf.yield %[[VAR7]] : f16
   // CHECK: } else {
-  // CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f16
   // CHECK: scf.yield %[[CST_0]] : f16
   // CHECK: }
-  %3 = xegpu.load %src[%offset], %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
+  %0 = xegpu.load %src[%offset], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
     : i64, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+  %c0 = arith.constant 0 : index
+  vector.store %0, %dst[%c0] : memref<1xf16>, vector<1xf16>
   gpu.return
 }
 }
@@ -30,16 +32,16 @@ gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>)
 gpu.module @test {
 // CHECK-LABEL: @source_materialize_single_elem_vec
 // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>, %[[ARG2:.*]]: memref<1xf16>
-gpu.func @source_materialize_single_elem_vec(%src: i64, %offset: vector<1xindex>, %dst: memref<1xf16>) {
-  %1 = arith.constant dense<1>: vector<1xi1>
-  %3 = xegpu.load %src[%offset], %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
+// CHECK-SAME: %[[ARG3:.*]]: vector<1xi1>
+gpu.func @source_materialize_single_elem_vec(%src: i64, %offset: vector<1xindex>, %dst: memref<1xf16>, %mask: vector<1xi1>) {
+  %0 = xegpu.load %src[%offset], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
     : i64, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[VAR_IF:.*]] = scf.if
   // CHECK: %[[VAR_RET:.*]] = vector.broadcast %[[VAR_IF]] : f16 to vector<1xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: vector.store %[[VAR_RET]], %[[ARG2]][%[[C0]]] : memref<1xf16>, vector<1xf16>
   %c0 = arith.constant 0 : index
-  vector.store %3, %dst[%c0] : memref<1xf16>, vector<1xf16>
+  vector.store %0, %dst[%c0] : memref<1xf16>, vector<1xf16>
   gpu.return
 }
 }
@@ -48,24 +50,21 @@ gpu.func @source_materialize_single_elem_vec(%src: i64, %offset: vector<1xindex>
 gpu.module @test {
 // CHECK-LABEL: @store_scatter_i64_src_value_offset
-// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>
-gpu.func @store_scatter_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
+// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>, %[[ARG2:.*]]: vector<1xi1>
+gpu.func @store_scatter_i64_src_value_offset(%src: i64, %offset: vector<1xindex>, %mask: vector<1xi1>) {
+  // CHECK: %[[CST_0:.*]] = arith.constant 2.900000e+00 : f32
+  // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
+  // CHECK: %[[VAR2:.*]] = vector.extract %[[ARG2]][0] : i1 from vector<1xi1>
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
   // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
-  // CHECK: %[[CST:.*]] = arith.constant dense<true> : vector<1xi1>
-  // CHECK: %[[VAR2:.*]] = vector.extract %[[CST]][0] : i1 from vector<1xi1>
-  %1 = arith.constant dense<1>: vector<1xi1>
-  // CHECK: %[[CST_0:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
-  // CHECK: %[[VAR3:.*]] = vector.extract %[[CST_0]][0] : f32 from vector<1xf32>
-  %2 = arith.constant dense<2.9>: vector<1xf32>
-  // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
+  %0 = arith.constant dense<2.9>: vector<1xf32>
   // CHECK: %[[VAR4:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
   // CHECK: %[[VAR5:.*]] = arith.addi %[[ARG0]], %[[VAR4]] : i64
   // CHECK: %[[VAR6:.*]] = llvm.inttoptr %[[VAR5]] : i64 to !llvm.ptr<1>
   // CHECK: scf.if %[[VAR2]] {
-  // CHECK: llvm.store %[[VAR3]], %[[VAR6]] {cache_control = #xevm.store_cache_control} : f32, !llvm.ptr<1>
+  // CHECK: llvm.store %[[CST_0]], %[[VAR6]] {cache_control = #xevm.store_cache_control} : f32, !llvm.ptr<1>
   // CHECK: }
-  xegpu.store %2, %src[%offset], %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
+  xegpu.store %0, %src[%offset], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>
     : vector<1xf32>, i64, vector<1xindex>, vector<1xi1>
   gpu.return
 }
@@ -76,9 +75,9 @@ gpu.module @test {
 // CHECK-LABEL: @prefetch_i64_src_value_offset
 // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>
 gpu.func @prefetch_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
+  // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
   // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
-  // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR2:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
   // CHECK: %[[VAR3:.*]] = arith.addi %[[ARG0]], %[[VAR2]] : i64
   // CHECK: %[[VAR4:.*]] = llvm.inttoptr %[[VAR3]] : i64 to !llvm.ptr<1>
@@ -94,11 +93,11 @@ gpu.module @test {
 // CHECK-LABEL: @prefetch_memref_src_value_offset
 // CHECK-SAME: %[[ARG0:.*]]: memref<256xf32>, %[[ARG1:.*]]: vector<1xindex>
 gpu.func @prefetch_memref_src_value_offset(%src: memref<256xf32>, %offset: vector<1xindex>) {
+  // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
   // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
   // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<256xf32> -> index
   // CHECK: %[[VAR2:.*]] = arith.index_castui %[[INTPTR]] : index to i64
-  // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
   // CHECK: %[[VAR4:.*]] = arith.addi %[[VAR2]], %[[VAR3]] : i64
   // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR4]] : i64 to !llvm.ptr<1>
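Note: with `-canonicalize` in the RUN line, a constant `dense<true>` mask lets the guarded load fold away, so the tests above now take the mask as a kernel argument and store the loaded value to keep it live. For reference, a sketch of the guarded scalar-load pattern the load_gather CHECK lines match, with illustrative SSA names and the cache-control attribute elided:

```mlir
// %src : i64 base address, %offset : vector<1xindex>, %mask : vector<1xi1>.
%m    = vector.extract %mask[0] : i1 from vector<1xi1>
%off  = vector.extract %offset[0] : index from vector<1xindex>
%o64  = arith.index_castui %off : index to i64
%c2   = arith.constant 2 : i64                  // sizeof(f16) in bytes
%byte = arith.muli %o64, %c2 : i64
%addr = arith.addi %src, %byte : i64
%ptr  = llvm.inttoptr %addr : i64 to !llvm.ptr<1>
%zero = arith.constant 0.000000e+00 : f16
%res  = scf.if %m -> (f16) {
  // Load only the masked-on lane; otherwise yield the zero pad value.
  %ld = llvm.load %ptr : !llvm.ptr<1> -> f16
  scf.yield %ld : f16
} else {
  scf.yield %zero : f16
}
```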
diff --git a/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir b/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
index 873478aed57e3..e4b303087ea9b 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
@@ -1,34 +1,29 @@
-// RUN: mlir-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
 
-gpu.module @fence_check {
-  gpu.func @fence(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
+gpu.module @prefetch_nd_check {
+  // CHECK-LABEL: gpu.func @prefetch_nd
+  gpu.func @prefetch_nd(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
+    // CHECK: %[[PREF_BASE_ROW_IN_BYTES:.*]] = arith.constant 64 : i32
+    // CHECK: %[[LD_CREATE_DESC_I64:.*]] = arith.constant dense<0> : vector<4xi64>
+    // CHECK: %[[PREF_BASE_H:.*]] = arith.constant 8 : i32
+    // CHECK: %[[PREF_BASE_W:.*]] = arith.constant 16 : i32
+    // CHECK: %[[OFFSET_ZERO:.*]] = arith.constant 0 : i32
     %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
-    %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32>
-
     // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
-    // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
     // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
     // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32>
-    // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
-    // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
-    // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
-    // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
+    // CHECK: %[[LD_DESC_2:.*]] = vector.insert %[[PREF_BASE_W]], %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
+    // CHECK: %[[LD_DESC_3:.*]] = vector.insert %[[PREF_BASE_H]], %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
+    // CHECK: %[[LD_DESC_4:.*]] = vector.insert %[[OFFSET_ZERO]], %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
+    // CHECK: %[[LD_DESC:.*]] = vector.insert %[[OFFSET_ZERO]], %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
     %src_tdesc = xegpu.create_nd_tdesc %srcce : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout>
 
     //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
     //CHECK: %[[PREF_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
-    //CHECK: %[[PREF_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32>
-    //CHECK: %[[PREF_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32>
-    //CHECK: %[[PREF_TILE_W64:.*]] = arith.constant 0 : i64
-    //CHECK: %[[PREF_TILE_W:.*]] = arith.trunci %[[PREF_TILE_W64]] : i64 to i32
-    //CHECK: %[[PREF_TILE_H64:.*]] = arith.constant 0 : i64
-    //CHECK: %[[PREF_TILE_H:.*]] = arith.trunci %[[PREF_TILE_H64]] : i64 to i32
     //CHECK: %[[PREF_LLVMPTR:.*]] = llvm.inttoptr %[[PREF_INTPTR]] : i64 to !llvm.ptr<1>
-    //CHECK: %[[PREF_SIZEOF_F32:.*]] = arith.constant 4 : i32
-    //CHECK: %[[PREF_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[PREF_BASE_W]], %[[PREF_SIZEOF_F32]] : i32
     //CHECK: xevm.blockprefetch2d %[[PREF_LLVMPTR]], %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_BASE_H]],
-    //CHECK-SAME: %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_TILE_W]], %[[PREF_TILE_H]]
+    //CHECK-SAME: %[[PREF_BASE_ROW_IN_BYTES]], %[[OFFSET_ZERO]], %[[OFFSET_ZERO]]
     //CHECK-SAME: <{cache_control = #xevm.load_cache_control, elem_size_in_bits = 32 : i32,
    //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32, v_blocks = 1 : i32}>
    //CHECK-SAME: : (!llvm.ptr<1>, i32, i32, i32, i32, i32)
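Note: the block-descriptor layout that the rewritten prefetch_nd CHECKs pin down, as a hedged sketch (SSA names are illustrative; `%base_addr`, `%base_w`, `%base_h`, and `%c0_i32` are assumed to be defined earlier):

```mlir
// i64 lane 0 (i32 words 0-1) holds the base address; i32 word 2 holds the
// base width in elements, word 3 the base height, and words 4-5 the x/y
// offsets, both zero in this test.
%init = arith.constant dense<0> : vector<4xi64>
%d0   = vector.insert %base_addr, %init [0] : i64 into vector<4xi64>
%d1   = vector.bitcast %d0 : vector<4xi64> to vector<8xi32>
%d2   = vector.insert %base_w, %d1 [2] : i32 into vector<8xi32>
%d3   = vector.insert %base_h, %d2 [3] : i32 into vector<8xi32>
%d4   = vector.insert %c0_i32, %d3 [4] : i32 into vector<8xi32>
%desc = vector.insert %c0_i32, %d4 [5] : i32 into vector<8xi32>
```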