From 8b7161b07a38c8e27d9de71f48950e14ffdaba5f Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Mon, 8 Dec 2025 21:14:01 +0000 Subject: [PATCH 1/6] [MLIR] Vector to XeGPU conversion: Use proper source variant for create_nd_tdesc op creation. If source strided memref is not fully static - at least one of shape, strides, offset is kDynamic - use i64 source variant. --- .../VectorToXeGPU/VectorToXeGPU.cpp | 40 ++++++++++++++++--- .../VectorToXeGPU/load-to-xegpu.mlir | 26 ++++++++---- .../VectorToXeGPU/store-to-xegpu.mlir | 26 ++++++++---- .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 32 ++++++++++----- .../transfer-write-to-xegpu.mlir | 35 ++++++++++------ 5 files changed, 116 insertions(+), 43 deletions(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 079e1e2a8ac67..b8606b261b781 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -102,18 +102,48 @@ static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, xegpu::TensorDescType descType, TypedValue src) { MemRefType srcTy = src.getType(); + assert(srcTy.isStrided() && "Expected strided memref type"); auto [strides, offset] = srcTy.getStridesAndOffset(); + bool isStatic = true; + + // Memref is dynamic if any of its shape, offset or strides is dynamic. + if (!srcTy.hasStaticShape()) { + isStatic = false; + } + + if (offset == ShapedType::kDynamic) + isStatic = false; + + for (auto stride : strides) { + if (stride == ShapedType::kDynamic) { + isStatic = false; + break; + } + } xegpu::CreateNdDescOp ndDesc; - if (srcTy.hasStaticShape()) { + if (isStatic) { ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src); } else { - // In case of any dynamic shapes, source's shape and strides have to be + // In case of ranked dynamic memref, instead of passing on the memref, + // i64 base address, source's offset, shape and strides have to be // explicitly provided. auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src); - ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, - meta.getConstifiedMixedSizes(), - meta.getConstifiedMixedStrides()); + auto baseAddrIndex = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, loc, meta.getBaseBuffer()); + auto baseAddrI64 = arith::IndexCastOp::create( + rewriter, loc, rewriter.getI64Type(), baseAddrIndex.getResult()); + // Strided metadata only provides 1D offset but create_nd_desc op expect + // offset match the rank of source memref. Add leading zeros if rank > 1. 
+ srcTy.getRank(); + SmallVector fullOffsets; + for (unsigned i = 0; i < srcTy.getRank() - 1; ++i) { + fullOffsets.push_back(rewriter.getI64IntegerAttr(0)); + } + fullOffsets.push_back(meta.getConstifiedMixedOffset()); + ndDesc = xegpu::CreateNdDescOp::create( + rewriter, loc, descType, baseAddrI64, fullOffsets, + meta.getConstifiedMixedSizes(), meta.getConstifiedMixedStrides()); } return ndDesc; diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index ae5141db16c09..867d1f20fb707 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -10,9 +10,13 @@ func.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vecto // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[COLLAPSED]] -// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, +// CHECK: %[[BASE_BUFFER:.+]], %[[OFFSET1:.+]], %[[SIZES:.+]], %[[STRIDES:.+]] = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK-SAME: : memref<32xf32, strided<[1], offset: ?>> -> memref, index, index, index +// CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// CHECK-SAME: : memref -> index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][%[[OFFSET1]]], shape : [32], +// CHECK-SAME: strides : [1] : i64 -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // CHECK: return %[[VEC]] @@ -30,9 +34,12 @@ func.func @load_2D_vector(%source: memref<8x16x32xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[COLLAPSED]] -// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// CHECK-SAME: : memref -> index +// CHECK: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// CHECK-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] @@ -49,8 +56,11 @@ func.func @load_dynamic_source(%source: memref, // CHECK-SAME: %[[SRC:.+]]: memref, // CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] -// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] 
: index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// CHECK-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index 1a10d917623cc..09bd571951a6b 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -12,9 +12,13 @@ func.func @store_1D_vector(%vec: vector<8xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[COLLAPSED]] -// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, +// CHECK: %[[BASE_BUFFER:.+]], %[[OFFSET1:.+]], %[[SIZES:.+]], %[[STRIDES:.+]] = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK-SAME: : memref<32xf32, strided<[1], offset: ?>> -> memref, index, index, index +// CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// CHECK-SAME: : memref -> index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][%[[OFFSET1]]], shape : [32], +// CHECK-SAME: strides : [1] : i64 -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> @@ -32,9 +36,12 @@ func.func @store_2D_vector(%vec: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[COLLAPSED]] -// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// CHECK-SAME: : memref -> index +// CHECK: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// CHECK-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- @@ -51,8 +58,11 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref, // CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] -// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// CHECK-SAME: 
strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index 8bb272b1fe5fc..af330dced143e 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -49,9 +49,12 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[COLLAPSED]] -// LOAD-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, +// LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// LOAD-ND-SAME: : memref -> index +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// LOAD-ND-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: boundary_check = false // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -148,8 +151,12 @@ gpu.func @load_dynamic_source(%source: memref, // LOAD-ND-SAME: %[[SRC:.+]]: memref, // LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] -// LOAD-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// LOAD-ND-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, +// LOAD-ND-SAME: #xegpu.block_tdesc_attr> // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -185,7 +192,11 @@ gpu.func @load_dynamic_source2(%source: memref, // LOAD-ND-SAME: %[[SRC:.+]]: memref, // LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32, strided<[16, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> +// LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [8, 16], strides : [16, 1] : +// LOAD-ND-SAME: i64 -> !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] : vector<8x16xf32> @@ -460,10 +471,11 @@ gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %of // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SUBVIEW]] -// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf16, -// LOAD-ND-SAME: boundary_check = false +// LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[SUBVIEW]] +// LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [256, 256], strides : [4096, 1] : +// LOAD-ND-SAME: i64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16> // LOAD-ND: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index 43a1a7206e2cc..6185f8537d8e0 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -16,9 +16,13 @@ gpu.func @store_1D_vector(%vec: vector<8xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[COLLAPSED]] -// STORE-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, +// STORE-ND: %[[BASE_BUFFER:.+]], %[[OFFSET1:.+]], %[[SIZES:.+]], %[[STRIDES:.+]] = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND-SAME: : memref<32xf32, strided<[1], offset: ?>> -> memref, index, index, index +// STORE-ND: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// STORE-ND-SAME: : memref -> index +// STORE-ND: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][%[[OFFSET1]]], shape : [32], +// STORE-ND-SAME: strides : [1] : i64 -> !xegpu.tensor_desc<8xf32, // STORE-ND-SAME: boundary_check = false // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> @@ -51,9 +55,12 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[COLLAPSED]] -// STORE-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, +// STORE-ND: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// STORE-ND-SAME: : 
memref -> index +// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// STORE-ND-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, // STORE-ND-SAME: boundary_check = false // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> @@ -87,8 +94,11 @@ gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref, // STORE-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] -// STORE-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// STORE-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index +// STORE-ND: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// STORE-ND-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_dynamic_source( @@ -295,10 +305,11 @@ gpu.func @store_to_subview(%vec: vector<8xf16>, // STORE-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[COLLAPSED]] -// STORE-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, -// STORE-ND-SAME: boundary_check = false +// STORE-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]], %[[STRIDES:.*]] = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 +// STORE-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %0[%[[OFFSET]]], shape : [256], strides : [1] : i64 -> +// STORE-ND-SAME: !xegpu.tensor_desc<8xf16, #xegpu.block_tdesc_attr> // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF2]]] : vector<8xf16> // STORE-SCATTER-LABEL: @store_to_subview( From bc2f04a478c85ed70ba74cf14127b93c2b8cf744 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Tue, 9 Dec 2025 22:42:54 +0000 Subject: [PATCH 2/6] Remove dead code. --- mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index b8606b261b781..76b8ba59fde83 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -135,7 +135,6 @@ static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, rewriter, loc, rewriter.getI64Type(), baseAddrIndex.getResult()); // Strided metadata only provides 1D offset but create_nd_desc op expect // offset match the rank of source memref. Add leading zeros if rank > 1. 
- srcTy.getRank(); SmallVector fullOffsets; for (unsigned i = 0; i < srcTy.getRank() - 1; ++i) { fullOffsets.push_back(rewriter.getI64IntegerAttr(0)); From cfd5087efcf1a8b4382a7237fff24399f7bb9f54 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 10 Dec 2025 19:40:12 +0000 Subject: [PATCH 3/6] Combine base addr and byte offset as adjusted base addr. --- .../VectorToXeGPU/VectorToXeGPU.cpp | 20 +++++------ .../VectorToXeGPU/load-to-xegpu.mlir | 21 ++++++++---- .../VectorToXeGPU/store-to-xegpu.mlir | 21 ++++++++---- .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 34 +++++++++++++------ .../transfer-write-to-xegpu.mlir | 28 ++++++++++----- 5 files changed, 83 insertions(+), 41 deletions(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 76b8ba59fde83..0313e7d937041 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -131,17 +131,17 @@ static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src); auto baseAddrIndex = memref::ExtractAlignedPointerAsIndexOp::create( rewriter, loc, meta.getBaseBuffer()); - auto baseAddrI64 = arith::IndexCastOp::create( - rewriter, loc, rewriter.getI64Type(), baseAddrIndex.getResult()); - // Strided metadata only provides 1D offset but create_nd_desc op expect - // offset match the rank of source memref. Add leading zeros if rank > 1. - SmallVector fullOffsets; - for (unsigned i = 0; i < srcTy.getRank() - 1; ++i) { - fullOffsets.push_back(rewriter.getI64IntegerAttr(0)); - } - fullOffsets.push_back(meta.getConstifiedMixedOffset()); + auto offset = meta.getOffset(); + auto elemByteSize = srcTy.getElementTypeBitWidth() / 8; + auto offsetInBytes = arith::MulIOp::create( + rewriter, loc, offset, + arith::ConstantIndexOp::create(rewriter, loc, elemByteSize)); + auto adjustedBaseAddr = arith::AddIOp::create( + rewriter, loc, baseAddrIndex.getResult(), offsetInBytes); + auto adjustedAddrI64 = arith::IndexCastOp::create( + rewriter, loc, rewriter.getI64Type(), adjustedBaseAddr); ndDesc = xegpu::CreateNdDescOp::create( - rewriter, loc, descType, baseAddrI64, fullOffsets, + rewriter, loc, descType, adjustedAddrI64, meta.getConstifiedMixedSizes(), meta.getConstifiedMixedStrides()); } diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 867d1f20fb707..c77efa03f3483 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -9,13 +9,16 @@ func.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vecto // CHECK-LABEL: @load_1D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[BASE_BUFFER:.+]], %[[OFFSET1:.+]], %[[SIZES:.+]], %[[STRIDES:.+]] = memref.extract_strided_metadata %[[COLLAPSED]] // CHECK-SAME: : memref<32xf32, strided<[1], offset: ?>> -> memref, index, index, index // CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] // CHECK-SAME: : memref -> index -// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][%[[OFFSET1]]], shape : [32], +// CHECK: %[[MUL:.+]] = arith.muli 
%[[OFFSET1]], %[[ELEM_BYTES]] : index +// CHECK: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [32], // CHECK-SAME: strides : [1] : i64 -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> @@ -33,12 +36,15 @@ func.func @load_2D_vector(%source: memref<8x16x32xf32>, // CHECK-LABEL: @load_2D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] // CHECK-SAME: : memref -> index -// CHECK: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// CHECK: %[[MUL:.+]] = arith.muli %[[OFF1]], %[[ELEM_BYTES]] : index +// CHECK: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [16, 32], // CHECK-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] @@ -55,11 +61,14 @@ func.func @load_dynamic_source(%source: memref, // CHECK-LABEL: @load_dynamic_source( // CHECK-SAME: %[[SRC:.+]]: memref, // CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] // CHECK: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index -// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// CHECK: %[[MUL:.+]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// CHECK: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], // CHECK-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index 09bd571951a6b..3c11313d05536 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -11,13 +11,16 @@ func.func @store_1D_vector(%vec: vector<8xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[ELEM_BYTES:.*]] = arith.constant 4 : index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] 
// CHECK: %[[BASE_BUFFER:.+]], %[[OFFSET1:.+]], %[[SIZES:.+]], %[[STRIDES:.+]] = memref.extract_strided_metadata %[[COLLAPSED]] // CHECK-SAME: : memref<32xf32, strided<[1], offset: ?>> -> memref, index, index, index // CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] // CHECK-SAME: : memref -> index -// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][%[[OFFSET1]]], shape : [32], +// CHECK: %[[MUL:.+]] = arith.muli %[[OFFSET1]], %[[ELEM_BYTES]] : index +// CHECK: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [32], // CHECK-SAME: strides : [1] : i64 -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> @@ -35,12 +38,15 @@ func.func @store_2D_vector(%vec: vector<8x16xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[ELEM_BYTES:.*]] = arith.constant 4 : index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] // CHECK-SAME: : memref -> index -// CHECK: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// CHECK: %[[MUL:.+]] = arith.muli %[[OFF1]], %[[ELEM_BYTES]] : index +// CHECK: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [16, 32], // CHECK-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> @@ -57,11 +63,14 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref, // CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[ELEM_BYTES:.*]] = arith.constant 4 : index // CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] // CHECK: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // CHECK: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index -// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// CHECK: %[[MUL:.+]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// CHECK: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// CHECK: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], // CHECK-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index 
af330dced143e..b58f9b30ed726 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -48,12 +48,15 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, // LOAD-ND-LABEL: @load_2D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] -// LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] -// LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] -// LOAD-ND-SAME: : memref -> index -// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] +// LOAD-ND-SAME: : memref -> index +// LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFF1]], %[[ELEM_BYTES]] : index +// LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [16, 32], // LOAD-ND-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: boundary_check = false // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> @@ -150,11 +153,14 @@ gpu.func @load_dynamic_source(%source: memref, // LOAD-ND-LABEL: @load_dynamic_source( // LOAD-ND-SAME: %[[SRC:.+]]: memref, // LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index -// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], // LOAD-ND-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: #xegpu.block_tdesc_attr> // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> @@ -191,11 +197,14 @@ gpu.func @load_dynamic_source2(%source: memref, // LOAD-ND-LABEL: @load_dynamic_source2( // LOAD-ND-SAME: %[[SRC:.+]]: memref, // LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // LOAD-ND: %[[INTPTR:.*]] = 
memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] -// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [8, 16], strides : [16, 1] : +// LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [8, 16], strides : [16, 1] : // LOAD-ND-SAME: i64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] : vector<8x16xf32> @@ -470,11 +479,14 @@ gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %of // LOAD-ND-LABEL: @load_from_subview_2D( // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[ELEM_BYTES:.+]] = arith.constant 2 : index // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> // LOAD-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[SUBVIEW]] // LOAD-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] -// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [256, 256], strides : [4096, 1] : +// LOAD-ND: %[[MUL:.*]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// LOAD-ND: %[[ADD:.*]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// LOAD-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// LOAD-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [256, 256], strides : [4096, 1] : // LOAD-ND-SAME: i64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> // LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16> // LOAD-ND: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index 6185f8537d8e0..66da64225678e 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -15,13 +15,16 @@ gpu.func @store_1D_vector(%vec: vector<8xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // STORE-ND: %[[BASE_BUFFER:.+]], %[[OFFSET1:.+]], %[[SIZES:.+]], %[[STRIDES:.+]] = memref.extract_strided_metadata %[[COLLAPSED]] // STORE-ND-SAME: : memref<32xf32, strided<[1], offset: ?>> -> memref, index, index, index // STORE-ND: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] // STORE-ND-SAME: : memref -> index -// STORE-ND: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][%[[OFFSET1]]], shape : [32], +// STORE-ND: %[[MUL:.+]] = arith.muli %[[OFFSET1]], %[[ELEM_BYTES]] : index +// STORE-ND: %[[ADD:.+]] = arith.addi 
%[[INTPTR]], %[[MUL]] : index +// STORE-ND: %[[I64PTR:.+]] = arith.index_cast %[[ADD]] : index to i64 +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [32], // STORE-ND-SAME: strides : [1] : i64 -> !xegpu.tensor_desc<8xf32, // STORE-ND-SAME: boundary_check = false // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> @@ -54,12 +57,15 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // STORE-ND: %[[BASE_BUFFER:.*]], %[[OFF1:.*]], %[[SIZES:.*]]:2, %[[STRIDES:.*]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // STORE-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] // STORE-ND-SAME: : memref -> index -// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFF1]]], shape : [16, 32], +// STORE-ND: %[[MUL:.+]] = arith.muli %[[OFF1]], %[[ELEM_BYTES]] : index +// STORE-ND: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [16, 32], // STORE-ND-SAME: strides : [32, 1] : i64 -> !xegpu.tensor_desc<8x16xf32, // STORE-ND-SAME: boundary_check = false // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> @@ -93,11 +99,14 @@ gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref, // STORE-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// STORE-ND: %[[ELEM_BYTES:.+]] = arith.constant 4 : index // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] // STORE-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] // STORE-ND: %[[INTPTR:.+]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] : memref -> index -// STORE-ND: %[[I64PTR:.+]] = arith.index_cast %[[INTPTR]] : index to i64 -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]][0, %[[OFFSET]]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], +// STORE-ND: %[[MUL:.+]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// STORE-ND: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [%[[SIZES]]#0, %[[SIZES]]#1], // STORE-ND-SAME: strides : [%[[STRIDES]]#0, 1] : i64 -> !xegpu.tensor // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> @@ -303,12 +312,15 @@ gpu.func @store_to_subview(%vec: vector<8xf16>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf16>, // STORE-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // STORE-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// STORE-ND: %[[ELEM_BYTES:.+]] = arith.constant 2 : index // STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> // STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // STORE-ND: %[[BASE_BUFFER:.*]], %[[OFFSET:.*]], %[[SIZES:.*]], %[[STRIDES:.*]] = 
memref.extract_strided_metadata %[[COLLAPSED]] // STORE-ND: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE_BUFFER]] -// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[INTPTR]] : index to i64 -// STORE-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %0[%[[OFFSET]]], shape : [256], strides : [1] : i64 -> +// STORE-ND: %[[MUL:.+]] = arith.muli %[[OFFSET]], %[[ELEM_BYTES]] : index +// STORE-ND: %[[ADD:.+]] = arith.addi %[[INTPTR]], %[[MUL]] : index +// STORE-ND: %[[I64PTR:.*]] = arith.index_cast %[[ADD]] : index to i64 +// STORE-ND: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[I64PTR]], shape : [256], strides : [1] : i64 -> // STORE-ND-SAME: !xegpu.tensor_desc<8xf16, #xegpu.block_tdesc_attr> // STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF2]]] : vector<8xf16> From 6837ee48aa2a60a7377c7d7e1d85d944cd648a07 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 10 Dec 2025 19:44:20 +0000 Subject: [PATCH 4/6] Address reviewer comments. --- mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 0313e7d937041..55ade0ae8eeec 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -107,15 +107,14 @@ static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, bool isStatic = true; // Memref is dynamic if any of its shape, offset or strides is dynamic. - if (!srcTy.hasStaticShape()) { + if (!srcTy.hasStaticShape()) isStatic = false; - } - if (offset == ShapedType::kDynamic) + if (!ShapedType::isStatic(offset)) isStatic = false; for (auto stride : strides) { - if (stride == ShapedType::kDynamic) { + if (!ShapedType::isStatic(stride)) { isStatic = false; break; } From f43d5c8c981baabdb497ce1ca6b429c6f836a9eb Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 10 Dec 2025 22:10:43 +0000 Subject: [PATCH 5/6] [MLIR] Fix broken GPU integration tests targetting SYCL and LevelZero runtime. SPIR-V kernel lowering no longer supports i1 store. Fuse GPU kernels to remove usage of i1 stores. 
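
For reference, a minimal sketch of the fused kernel body (mirroring the updated LevelZero and SYCL tests below): the relu is computed in a single kernel, so the i1 comparison result feeds arith.select directly instead of being stored to a memref<4x5xi1>:

    %zero = arith.constant 0.000000e+00 : f32
    %0 = gpu.block_id x
    %1 = gpu.block_id y
    %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
    %3 = arith.cmpf ogt, %2, %zero : f32
    %4 = arith.select %3, %2, %zero : f32
    memref.store %4, %arg1[%0, %1] : memref<4x5xf32>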
--- .../GPU/LevelZero/gpu-reluf32-to-spirv.mlir | 57 ++++++------------- .../GPU/SYCL/gpu-reluf32-to-spirv.mlir | 48 +++++----------- 2 files changed, 31 insertions(+), 74 deletions(-) diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir index 8d022ac1cf277..d0f21873e6e2c 100644 --- a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_levelzero_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ @@ -35,51 +35,26 @@ module @relu attributes {gpu.container_module} { func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> { %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<4x5xf32> - memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32> - %memref_0 = gpu.alloc host_shared () : memref<4x5xi1> - %2 = gpu.wait async - %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) - args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>) - gpu.wait [%3] - %memref_1 = gpu.alloc host_shared () : memref<4x5xf32> - %4 = gpu.wait async - %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) - args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, - %memref_1 : memref<4x5xf32>) - gpu.wait [%5] - %alloc = memref.alloc() : memref<4x5xf32> - memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32> - %6 = gpu.wait async - %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32> - %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1> - %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32> - return %alloc : memref<4x5xf32> + %host_result = memref.alloc() : memref<4x5xf32> + %gpu_input = gpu.alloc() : memref<4x5xf32> + gpu.memcpy %gpu_input, %arg0 : memref<4x5xf32>, memref<4x5xf32> + %gpu_result = gpu.alloc() : memref<4x5xf32> + gpu.launch_func @test_kernel::@test_relu blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%gpu_input : memref<4x5xf32>, %gpu_result : memref<4x5xf32>) + gpu.memcpy %host_result, %gpu_result : memref<4x5xf32>, memref<4x5xf32> + gpu.dealloc %gpu_input : memref<4x5xf32> + gpu.dealloc %gpu_result : memref<4x5xf32> + return %host_result : memref<4x5xf32> } - gpu.module @test_kernel - attributes {spirv.target_env 
= #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel - attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_relu(%arg0: memref<4x5xf32>, %arg1: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %zero = arith.constant 0.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_id y %2 = memref.load %arg0[%0, %1] : memref<4x5xf32> - %3 = arith.cmpf olt, %2, %arg1 : f32 - memref.store %3, %arg2[%0, %1] : memref<4x5xi1> - gpu.return - } - } - gpu.module @test_kernel_0 - attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel - attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = memref.load %arg0[%0, %1] : memref<4x5xi1> - %3 = memref.load %arg1[%0, %1] : memref<4x5xf32> - %4 = arith.select %2, %arg2, %3 : f32 - memref.store %4, %arg3[%0, %1] : memref<4x5xf32> + %3 = arith.cmpf ogt, %2, %zero : f32 + %4 = arith.select %3, %2, %zero : f32 + memref.store %4, %arg1[%0, %1] : memref<4x5xf32> gpu.return } } diff --git a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir index e385daefcb9b5..9d45f405e9f0f 100644 --- a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_sycl_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ @@ -35,44 +35,26 @@ module @relu attributes {gpu.container_module} { func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> { %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<4x5xf32> - memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32> - %memref_0 = gpu.alloc host_shared () : memref<4x5xi1> - %2 = gpu.wait 
async - %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>) - gpu.wait [%3] - %memref_1 = gpu.alloc host_shared () : memref<4x5xf32> - %4 = gpu.wait async - %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>) - gpu.wait [%5] - %alloc = memref.alloc() : memref<4x5xf32> - memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32> - %6 = gpu.wait async - %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32> - %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1> - %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32> - return %alloc : memref<4x5xf32> + %host_result = memref.alloc() : memref<4x5xf32> + %gpu_input = gpu.alloc() : memref<4x5xf32> + gpu.memcpy %gpu_input, %arg0 : memref<4x5xf32>, memref<4x5xf32> + %gpu_result = gpu.alloc() : memref<4x5xf32> + gpu.launch_func @test_kernel::@test_relu blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%gpu_input : memref<4x5xf32>, %gpu_result : memref<4x5xf32>) + gpu.memcpy %host_result, %gpu_result : memref<4x5xf32>, memref<4x5xf32> + gpu.dealloc %gpu_input : memref<4x5xf32> + gpu.dealloc %gpu_result : memref<4x5xf32> + return %host_result : memref<4x5xf32> } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_relu(%arg0: memref<4x5xf32>, %arg1: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %zero = arith.constant 0.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_id y %2 = memref.load %arg0[%0, %1] : memref<4x5xf32> - %3 = arith.cmpf olt, %2, %arg1 : f32 - memref.store %3, %arg2[%0, %1] : memref<4x5xi1> - gpu.return - } - } - gpu.module @test_kernel_0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = memref.load %arg0[%0, %1] : memref<4x5xi1> - %3 = memref.load %arg1[%0, %1] : memref<4x5xf32> - %4 = arith.select %2, %arg2, %3 : f32 - memref.store %4, %arg3[%0, %1] : memref<4x5xf32> + %3 = arith.cmpf ogt, %2, %zero : f32 + %4 = arith.select %3, %2, %zero : f32 + memref.store %4, %arg1[%0, %1] : memref<4x5xf32> gpu.return } } From 1f55241af791f49fcfefbdba4fec1cead0f396c1 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Wed, 10 Dec 2025 22:14:15 +0000 Subject: [PATCH 6/6] Revert "[MLIR] Fix broken GPU integration tests targetting SYCL and LevelZero runtime." This reverts commit f43d5c8c981baabdb497ce1ca6b429c6f836a9eb. 
--- .../GPU/LevelZero/gpu-reluf32-to-spirv.mlir | 57 +++++++++++++------ .../GPU/SYCL/gpu-reluf32-to-spirv.mlir | 48 +++++++++++----- 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir index d0f21873e6e2c..8d022ac1cf277 100644 --- a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_levelzero_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ @@ -35,26 +35,51 @@ module @relu attributes {gpu.container_module} { func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> { %c5 = arith.constant 5 : index %c4 = arith.constant 4 : index + %cst = arith.constant 0.000000e+00 : f32 %c1 = arith.constant 1 : index - %host_result = memref.alloc() : memref<4x5xf32> - %gpu_input = gpu.alloc() : memref<4x5xf32> - gpu.memcpy %gpu_input, %arg0 : memref<4x5xf32>, memref<4x5xf32> - %gpu_result = gpu.alloc() : memref<4x5xf32> - gpu.launch_func @test_kernel::@test_relu blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%gpu_input : memref<4x5xf32>, %gpu_result : memref<4x5xf32>) - gpu.memcpy %host_result, %gpu_result : memref<4x5xf32>, memref<4x5xf32> - gpu.dealloc %gpu_input : memref<4x5xf32> - gpu.dealloc %gpu_result : memref<4x5xf32> - return %host_result : memref<4x5xf32> + %memref = gpu.alloc host_shared () : memref<4x5xf32> + memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32> + %memref_0 = gpu.alloc host_shared () : memref<4x5xi1> + %2 = gpu.wait async + %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) + args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>) + gpu.wait [%3] + %memref_1 = gpu.alloc host_shared () : memref<4x5xf32> + %4 = gpu.wait async + %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) + args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, + %memref_1 : memref<4x5xf32>) + gpu.wait [%5] + %alloc = memref.alloc() : memref<4x5xf32> + memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32> + %6 = gpu.wait async + %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32> + %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1> + %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32> + return %alloc : memref<4x5xf32> } - gpu.module @test_kernel attributes {spirv.target_env = 
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_relu(%arg0: memref<4x5xf32>, %arg1: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
-      %zero = arith.constant 0.000000e+00 : f32
+  gpu.module @test_kernel
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel
+      attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id x
       %1 = gpu.block_id y
       %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
-      %3 = arith.cmpf ogt, %2, %zero : f32
-      %4 = arith.select %3, %2, %zero : f32
-      memref.store %4, %arg1[%0, %1] : memref<4x5xf32>
+      %3 = arith.cmpf olt, %2, %arg1 : f32
+      memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
+      gpu.return
+    }
+  }
+  gpu.module @test_kernel_0
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel
+      attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
+      %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
+      %4 = arith.select %2, %arg2, %3 : f32
+      memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
       gpu.return
     }
   }
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
index 9d45f405e9f0f..e385daefcb9b5 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -35,26 +35,44 @@ module @relu attributes {gpu.container_module} {
   func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
     %c5 = arith.constant 5 : index
     %c4 = arith.constant 4 : index
+    %cst = arith.constant 0.000000e+00 : f32
     %c1 = arith.constant 1 : index
-    %host_result = memref.alloc() : memref<4x5xf32>
-    %gpu_input = gpu.alloc() : memref<4x5xf32>
-    gpu.memcpy %gpu_input, %arg0 : memref<4x5xf32>, memref<4x5xf32>
-    %gpu_result = gpu.alloc() : memref<4x5xf32>
-    gpu.launch_func @test_kernel::@test_relu blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%gpu_input : memref<4x5xf32>, %gpu_result : memref<4x5xf32>)
-    gpu.memcpy %host_result, %gpu_result : memref<4x5xf32>, memref<4x5xf32>
-    gpu.dealloc %gpu_input : memref<4x5xf32>
-    gpu.dealloc %gpu_result : memref<4x5xf32>
-    return %host_result : memref<4x5xf32>
+    %memref = gpu.alloc host_shared () : memref<4x5xf32>
+    memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
+    %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
+    gpu.wait [%3]
+    %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
+    %4 = gpu.wait async
+    %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
+    gpu.wait [%5]
+    %alloc = memref.alloc() : memref<4x5xf32>
+    memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
+    %6 = gpu.wait async
+    %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
+    %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
+    %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+    return %alloc : memref<4x5xf32>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_relu(%arg0: memref<4x5xf32>, %arg1: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
-      %zero = arith.constant 0.000000e+00 : f32
+    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id x
       %1 = gpu.block_id y
       %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
-      %3 = arith.cmpf ogt, %2, %zero : f32
-      %4 = arith.select %3, %2, %zero : f32
-      memref.store %4, %arg1[%0, %1] : memref<4x5xf32>
+      %3 = arith.cmpf olt, %2, %arg1 : f32
+      memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
+      gpu.return
+    }
+  }
+  gpu.module @test_kernel_0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
+      %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
+      %4 = arith.select %2, %arg2, %3 : f32
+      memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
       gpu.return
     }
   }