Update tests, document helper functions, and simplify emulatedVectorLoad

lialan · lialan · commit 2580b469e4a1 · 2024-10-30T17:28:39.000Z
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -130,6 +130,7 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
   return newMask;
 }
 
+/// A wrapper function for emitting `vector.extract_strided_slice`.
 static Value extractSubvectorFrom(RewriterBase &rewriter, Location loc,
                                   VectorType extractType, Value vector,
                                   int64_t frontOffset, int64_t subvecSize) {
@@ -142,6 +143,7 @@ static Value extractSubvectorFrom(RewriterBase &rewriter, Location loc,
       ->getResult(0);
 }
 
+/// A wrapper function for emitting `vector.insert_strided_slice`.
 static Value insertSubvectorInto(RewriterBase &rewriter, Location loc,
                                  Value src, Value dest, int64_t offset) {
   auto offsets = rewriter.getI64ArrayAttr({offset});
@@ -150,36 +152,14 @@ static Value insertSubvectorInto(RewriterBase &rewriter, Location loc,
                                                        dest, offsets, strides);
 }
 
+/// Extracts `lengthSubvec` elements from `srcVec` into `destVec`, starting at
+/// the offset specified by `srcOffsetVar`. Use this function when
+/// `srcOffsetVar` is not a constant, since `vector.extract_strided_slice`
+/// requires constant offsets and therefore cannot be used.
 static void dynamicallyExtractElementsToVector(
     RewriterBase &rewriter, Location loc, TypedValue<VectorType> srcVec,
-    Value destVec, OpFoldResult srcOffsetVar, int64_t loopSize) {
-  /*
-  // Create affine maps for the lower and upper bounds
-  AffineMap lowerBoundMap = AffineMap::getConstantMap(0, rewriter.getContext());
-  AffineMap upperBoundMap =
-      AffineMap::getConstantMap(loopSize, rewriter.getContext());
-
-  auto forLoop = rewriter.create<affine::AffineForOp>(
-      loc, ValueRange{}, lowerBoundMap, ValueRange{}, upperBoundMap, 1,
-      ArrayRef<Value>(destVec));
-
-  OpBuilder builder =
-      OpBuilder::atBlockEnd(forLoop.getBody(), rewriter.getListener());
-
-  auto iv = forLoop.getInductionVar();
-
-  auto loopDestVec = forLoop.getRegionIterArgs()[0];
-  auto extractLoc = builder.create<arith::AddIOp>(
-      loc, rewriter.getIndexType(), srcOffsetVar.dyn_cast<Value>(), iv);
-  auto extractElemOp = builder.create<vector::ExtractElementOp>(
-      loc, elemType, srcVec, extractLoc);
-  auto insertElemOp = builder.create<vector::InsertElementOp>(
-      loc, extractElemOp, loopDestVec, iv);
-  builder.create<affine::AffineYieldOp>(loc,
-                                        ValueRange{insertElemOp->getResult(0)});
-  return forLoop->getResult(0);
-  */
-  for (int i = 0; i < loopSize; ++i) {
+    Value destVec, OpFoldResult srcOffsetVar, int64_t lengthSubvec) {
+  for (int i = 0; i < lengthSubvec; ++i) {
     Value extractLoc;
     if (i == 0) {
       extractLoc = srcOffsetVar.dyn_cast<Value>();
@@ -194,15 +174,21 @@ static void dynamicallyExtractElementsToVector(
   }
 }
 
+/// Loads `numLoadedElements` elements of `newElementType` from `base` at
+/// `linearizedIndices`, then bitcasts the result into a vector of
+/// `oldElementType`.
 static TypedValue<VectorType>
 emulatedVectorLoad(ConversionPatternRewriter &rewriter, Location loc,
-                   Value base, OpFoldResult linearizedIndices, int64_t numBytes,
-                   int64_t scale, Type oldElememtType, Type newElementType) {
+                   Value base, OpFoldResult linearizedIndices,
+                   int64_t numLoadedElements, Type oldElememtType,
+                   Type newElementType) {
+  auto scale = newElementType.getIntOrFloatBitWidth() /
+               oldElememtType.getIntOrFloatBitWidth();
   auto newLoad = rewriter.create<vector::LoadOp>(
-      loc, VectorType::get(numBytes, newElementType), base,
+      loc, VectorType::get(numLoadedElements, newElementType), base,
       getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
   return rewriter.create<vector::BitCastOp>(
-      loc, VectorType::get(numBytes * scale, oldElememtType), newLoad);
+      loc, VectorType::get(numLoadedElements * scale, oldElememtType), newLoad);
 };
 
 namespace {
@@ -443,7 +429,7 @@ struct ConvertVectorLoad final : OpConversionPattern<vector::LoadOp> {
         llvm::divideCeil(maxintraDataOffset + origElements, scale);
     Value result =
         emulatedVectorLoad(rewriter, loc, adaptor.getBase(), linearizedIndices,
-                           numElements, scale, oldElementType, newElementType);
+                           numElements, oldElementType, newElementType);
 
     if (foldedIntraVectorOffset) {
       if (isUnalignedEmulation) {
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-dynamic.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-dynamic.mlir
@@ -2,14 +2,13 @@
 
 // CHECK: #map = affine_map<()[s0, s1] -> ((s0 * 3 + s1) floordiv 4)>
 // CHECK: #map1 = affine_map<()[s0, s1] -> ((s0 * 3 + s1) mod 4)>
-func.func @vector_load_i2(%arg1: index, %arg2: index) -> vector<3x3xi2> {
+func.func @vector_load_i2(%arg1: index, %arg2: index) -> vector<3xi2> {
     %0 = memref.alloc() : memref<3x3xi2>
     %c0 = arith.constant 0 : index
     %c2 = arith.constant 2 : index
     %cst = arith.constant dense<0> : vector<3x3xi2>
     %1 = vector.load %0[%arg1, %arg2] : memref<3x3xi2>, vector<3xi2>
-    %2 = vector.insert %1, %cst [0] : vector<3xi2> into vector<3x3xi2>
-    return %2 : vector<3x3xi2>
+    return %1 : vector<3xi2>
 }
 
 // CHECK: func @vector_load_i2
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
@@ -19,6 +19,25 @@ func.func @vector_load_i2(%arg1: index, %arg2: index) -> vector<3x3xi2> {
 
 //-----
 
+func.func @vector_load_i2_unaligned(%arg1: index, %arg2: index) -> vector<3x3xi2> {
+    %0 = memref.alloc() : memref<3x3xi2>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %cst = arith.constant dense<0> : vector<3x3xi2>
+    %1 = vector.load %0[%c0, %c1] : memref<3x3xi2>, vector<3xi2>
+    %2 = vector.insert %1, %cst [0] : vector<3xi2> into vector<3x3xi2>
+    return %2 : vector<3x3xi2>
+}
+
+// CHECK: func @vector_load_i2_unaligned
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[INDEX:.+]] = arith.constant 0 : index
+// CHECK: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi8>, vector<1xi8>
+// CHECK: %[[VEC_I2:.+]] = vector.bitcast %[[VEC]] : vector<1xi8> to vector<4xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[VEC_I2]] {offsets = [1], sizes = [3], strides = [1]} : vector<4xi2> to vector<3xi2>
+
+//-----
+
 func.func @vector_transfer_read_i2() -> vector<3xi2> {
  %0 = memref.alloc() : memref<3x3xi2>
  %c0i2 = arith.constant 0 : i2
@@ -37,6 +56,26 @@ func.func @vector_transfer_read_i2() -> vector<3xi2> {
 
 //-----
 
+func.func @vector_transfer_read_i2_unaligned() -> vector<3xi2> {
+ %0 = memref.alloc() : memref<3x3xi2>
+ %c0i2 = arith.constant 0 : i2
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = vector.transfer_read %0[%c0, %c1], %c0i2 {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
+ return %1 : vector<3xi2>
+}
+
+// CHECK: func @vector_transfer_read_i2_unaligned
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[PAD:.+]] = arith.constant 0 : i2
+// CHECK: %[[EXT:.+]] = arith.extui %[[PAD]] : i2 to i8
+// CHECK: %[[INDEX:.+]] = arith.constant 0 : index
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[EXT]] : memref<3xi8>, vector<1xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[READ]] : vector<1xi8> to vector<4xi2>
+// CHECK: vector.extract_strided_slice %[[BITCAST]] {offsets = [1], sizes = [3], strides = [1]} : vector<4xi2> to vector<3xi2>
+
+//-----
+
 func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
     %0 = memref.alloc() : memref<3x5xi2>
     %cst = arith.constant dense<0> : vector<3x5xi2>
@@ -64,4 +103,36 @@ func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
 // CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[ORIGINMASK]], %[[CST2]]
 // CHECK-SAME: {offsets = [2], strides = [1]} : vector<5xi1> into vector<8xi1>
 // CHECK: %[[SELECT:.+]] = arith.select %[[INSERT2]], %[[BITCAST2]], %[[INSERT1]] : vector<8xi1>, vector<8xi2>
-// CHECK: vector.extract_strided_slice %[[SELECT]] {offsets = [2], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2> 
+// CHECK: vector.extract_strided_slice %[[SELECT]] {offsets = [2], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2>
+
+//-----
+
+func.func @vector_cst_maskedload_i2_unaligned(%passthru: vector<5xi2>) -> vector<3x5xi2> {
+    %0 = memref.alloc() : memref<3x5xi2>
+    %cst = arith.constant dense<0> : vector<3x5xi2>
+    %mask = vector.constant_mask [3] : vector<5xi1>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = vector.maskedload %0[%c0, %c1], %mask, %passthru :
+      memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
+    %2 = vector.insert %1, %cst [0] : vector<5xi2> into vector<3x5xi2>
+    return %2 : vector<3x5xi2>
+}
+
+
+// CHECK: func @vector_cst_maskedload_i2_unaligned
+// CHECK: %[[ORIGINMASK:.+]] = vector.constant_mask [3] : vector<5xi1>
+// CHECK: %[[NEWMASK:.+]] = arith.constant dense<[true, false]> : vector<2xi1>
+// CHECK: %[[VESSEL:.+]] = arith.constant dense<0> : vector<8xi2>
+// CHECK: %[[INSERT1:.+]] = vector.insert_strided_slice %arg0, %[[VESSEL]]
+// CHECK-SAME: {offsets = [1], strides = [1]} : vector<5xi2> into vector<8xi2>
+// CHECK: %[[BITCAST1:.+]] = vector.bitcast %[[INSERT1]] : vector<8xi2> to vector<2xi8>
+// CHECK: %[[C0:.+]] = arith.constant 0 : index
+// CHECK: %[[MASKEDLOAD:.+]] = vector.maskedload %alloc[%[[C0]]], %[[NEWMASK:.+]], %[[BITCAST1]]
+// CHECK-SAME: : memref<4xi8>, vector<2xi1>, vector<2xi8> into vector<2xi8>
+// CHECK: %[[BITCAST2:.+]] = vector.bitcast %[[MASKEDLOAD]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[CST2:.+]] = arith.constant dense<false> : vector<8xi1>
+// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[ORIGINMASK]], %[[CST2]]
+// CHECK-SAME: {offsets = [1], strides = [1]} : vector<5xi1> into vector<8xi1>
+// CHECK: %[[SELECT:.+]] = arith.select %[[INSERT2]], %[[BITCAST2]], %[[INSERT1]] : vector<8xi1>, vector<8xi2>
+// CHECK: vector.extract_strided_slice %[[SELECT]] {offsets = [1], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2>