Refactor and fixes

lialan · lialan · commit a9d72602a9b5 · 2024-11-01T02:14:52.000Z
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -38,16 +38,17 @@ using namespace mlir;
 
 /// Returns a compressed mask. The mask value is set only if any mask is present
 /// in the scale range. E.g., if `scale` equals to 2, and `intraDataOffset`
-/// equals to 2, the following mask:
+/// equals to 1 (intraDataOffset strictly smaller than scale), the following
+/// mask:
 ///
-///   %mask = [1, 1, 1, 0, 0, 0]
+///   %mask = [1, 1, 0, 0, 0, 0]
 ///
 /// will first be padded with number of `intraDataOffset` zeros:
-///   %mask = [0, 0, 1, 1, 1, 0, 0, 0]
+///   %mask = [0, 1, 1, 0, 0, 0, 0, 0]
 ///
 /// then it will return the following new compressed mask:
 ///
-///   %mask = [0, 1, 1, 0]
+///   %mask = [1, 1, 0, 0]
 static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
                                                   Location loc, Value mask,
                                                   int origElements, int scale,
@@ -76,9 +77,6 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
   shape.back() = numElements;
   auto newMaskType = VectorType::get(shape, rewriter.getI1Type());
   if (createMaskOp) {
-    // TODO: handle the case with non-zero intraDataOffset for CreateMaskOp.
-    if (intraDataOffset != 0)
-      return failure();
     OperandRange maskOperands = createMaskOp.getOperands();
     size_t numMaskOperands = maskOperands.size();
     AffineExpr s0;
@@ -130,10 +128,18 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
   return newMask;
 }
 
-/// A wrapper function for emitting `vector.extract_strided_slice`.
+/// A wrapper function for emitting `vector.extract_strided_slice`. The vector
+/// has to be of 1-D shape.
 static Value extractSubvectorFrom(RewriterBase &rewriter, Location loc,
                                   VectorType extractType, Value vector,
                                   int64_t frontOffset, int64_t subvecSize) {
+  // Both `vector` and `extractType` must be 1-D vector types.
+  auto vectorType = dyn_cast<VectorType>(vector.getType());
+  assert(vectorType && "expected vector type");
+  assert(vectorType.getShape().size() == 1 && "expected 1-D vector type");
+  assert(extractType.getShape().size() == 1 &&
+         "extractType must be 1-D vector type");
+
   auto offsets = rewriter.getI64ArrayAttr({frontOffset});
   auto sizes = rewriter.getI64ArrayAttr({subvecSize});
   auto strides = rewriter.getI64ArrayAttr({1});
@@ -143,9 +149,17 @@ static Value extractSubvectorFrom(RewriterBase &rewriter, Location loc,
       ->getResult(0);
 }
 
-/// A wrapper function for emitting `vector.insert_strided_slice`.
+/// A wrapper function for emitting `vector.insert_strided_slice`. The source
+/// and dest vectors must be of 1-D shape.
 static Value insertSubvectorInto(RewriterBase &rewriter, Location loc,
                                  Value src, Value dest, int64_t offset) {
+  auto srcType = dyn_cast<VectorType>(src.getType());
+  assert(srcType && "expected vector type");
+  assert(srcType.getShape().size() == 1 && "expected 1-D vector type");
+  auto destType = dyn_cast<VectorType>(dest.getType());
+  assert(destType && "expected vector type");
+  assert(destType.getShape().size() == 1 && "expected 1-D vector type");
+
   auto offsets = rewriter.getI64ArrayAttr({offset});
   auto strides = rewriter.getI64ArrayAttr({1});
   return rewriter.create<vector::InsertStridedSliceOp>(loc, dest.getType(), src,
@@ -157,24 +171,20 @@ static Value insertSubvectorInto(RewriterBase &rewriter, Location loc,
 /// `srcOffsetVar` is not a constant, making it impossible to use
 /// vector.extract_strided_slice, as it requires constant offsets.
 static Value dynamicallyExtractSubVector(RewriterBase &rewriter, Location loc,
-                                         TypedValue<VectorType> srcVec,
-                                         Value destVec,
-                                         OpFoldResult srcOffsetVar,
-                                         int64_t lengthSubvec) {
-  for (int i = 0; i < lengthSubvec; ++i) {
-    Value extractLoc;
-    if (i == 0) {
-      extractLoc = srcOffsetVar.dyn_cast<Value>();
-    } else {
-      extractLoc = rewriter.create<arith::AddIOp>(
-          loc, rewriter.getIndexType(), srcOffsetVar.dyn_cast<Value>(),
-          rewriter.create<arith::ConstantIndexOp>(loc, i));
-    }
+                                         TypedValue<VectorType> source,
+                                         Value dest, OpFoldResult offset,
+                                         int64_t numElementsToExtract) {
+  for (int i = 0; i < numElementsToExtract; ++i) {
+    Value extractLoc =
+        (i == 0) ? offset.dyn_cast<Value>()
+                 : rewriter.create<arith::AddIOp>(
+                       loc, rewriter.getIndexType(), offset.dyn_cast<Value>(),
+                       rewriter.create<arith::ConstantIndexOp>(loc, i));
     auto extractOp =
-        rewriter.create<vector::ExtractOp>(loc, srcVec, extractLoc);
-    destVec = rewriter.create<vector::InsertOp>(loc, extractOp, destVec, i);
+        rewriter.create<vector::ExtractOp>(loc, source, extractLoc);
+    dest = rewriter.create<vector::InsertOp>(loc, extractOp, dest, i);
   }
-  return destVec;
+  return dest;
 }
 
 /// Load `numLoadedElements` of `newElementType` from `base` at
@@ -183,15 +193,15 @@ static Value dynamicallyExtractSubVector(RewriterBase &rewriter, Location loc,
 static TypedValue<VectorType>
 emulatedVectorLoad(ConversionPatternRewriter &rewriter, Location loc,
                    Value base, OpFoldResult linearizedIndices,
-                   int64_t numLoadedElements, Type oldElememtType,
+                   int64_t numElementsToLoad, Type oldElememtType,
                    Type newElementType) {
   auto scale = newElementType.getIntOrFloatBitWidth() /
                oldElememtType.getIntOrFloatBitWidth();
   auto newLoad = rewriter.create<vector::LoadOp>(
-      loc, VectorType::get(numLoadedElements, newElementType), base,
+      loc, VectorType::get(numElementsToLoad, newElementType), base,
       getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
   return rewriter.create<vector::BitCastOp>(
-      loc, VectorType::get(numLoadedElements * scale, oldElememtType), newLoad);
+      loc, VectorType::get(numElementsToLoad * scale, oldElememtType), newLoad);
 };
 
 namespace {
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-dynamic.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-dynamic.mlir
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
@@ -1,5 +1,8 @@
 // RUN: mlir-opt --test-emulate-narrow-int="arith-compute-bitwidth=1 memref-load-bitwidth=8" --cse --split-input-file %s | FileCheck %s
 
+// CHECK: #map = affine_map<()[s0, s1] -> ((s0 * 3 + s1) floordiv 4)>
+// CHECK: #map1 = affine_map<()[s0, s1] -> ((s0 * 3 + s1) mod 4)>
+
 func.func @vector_load_i2(%arg1: index, %arg2: index) -> vector<3x3xi2> {
     %0 = memref.alloc() : memref<3x3xi2>
     %c0 = arith.constant 0 : index
@@ -19,25 +22,6 @@ func.func @vector_load_i2(%arg1: index, %arg2: index) -> vector<3x3xi2> {
 
 //-----
 
-func.func @vector_load_i2_unaligned(%arg1: index, %arg2: index) -> vector<3x3xi2> {
-    %0 = memref.alloc() : memref<3x3xi2>
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %cst = arith.constant dense<0> : vector<3x3xi2>
-    %1 = vector.load %0[%c0, %c1] : memref<3x3xi2>, vector<3xi2>
-    %2 = vector.insert %1, %cst [0] : vector<3xi2> into vector<3x3xi2>
-    return %2 : vector<3x3xi2>
-}
-
-// CHECK: func @vector_load_i2_unaligned
-// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
-// CHECK: %[[INDEX:.+]] = arith.constant 0 : index
-// CHECK: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi8>, vector<1xi8>
-// CHECK: %[[VEC_I2:.+]] = vector.bitcast %[[VEC]] : vector<1xi8> to vector<4xi2>
-// CHECK: %[[EXCTRACT:.+]] = vector.extract_strided_slice %[[VEC_I2]] {offsets = [1], sizes = [3], strides = [1]} : vector<4xi2> to vector<3xi2>
-
-//-----
-
 func.func @vector_transfer_read_i2() -> vector<3xi2> {
  %0 = memref.alloc() : memref<3x3xi2>
  %c0i2 = arith.constant 0 : i2
@@ -56,26 +40,6 @@ func.func @vector_transfer_read_i2() -> vector<3xi2> {
 
 //-----
 
-func.func @vector_transfer_read_i2_unaligned() -> vector<3xi2> {
- %0 = memref.alloc() : memref<3x3xi2>
- %c0i2 = arith.constant 0 : i2
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %1 = vector.transfer_read %0[%c0, %c1], %c0i2 {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
- return %1 : vector<3xi2>
-}
-
-// CHECK: func @vector_transfer_read_i2_unaligned
-// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
-// CHECK: %[[PAD:.+]] = arith.constant 0 : i2
-// CHECK: %[[EXT:.+]] = arith.extui %[[PAD]] : i2 to i8
-// CHECK: %[[INDEX:.+]] = arith.constant 0 : index
-// CHECK: %[[READ:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[EXT]] : memref<3xi8>, vector<1xi8>
-// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[READ]] : vector<1xi8> to vector<4xi2>
-// CHECK: vector.extract_strided_slice %[[BITCAST]] {offsets = [1], sizes = [3], strides = [1]} : vector<4xi2> to vector<3xi2>
-
-//-----
-
 func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
     %0 = memref.alloc() : memref<3x5xi2>
     %cst = arith.constant dense<0> : vector<3x5xi2>
@@ -107,32 +71,49 @@ func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
 
 //-----
 
-func.func @vector_cst_maskedload_i2_unaligned(%passthru: vector<5xi2>) -> vector<3x5xi2> {
-    %0 = memref.alloc() : memref<3x5xi2>
-    %cst = arith.constant dense<0> : vector<3x5xi2>
-    %mask = vector.constant_mask [3] : vector<5xi1>
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %1 = vector.maskedload %0[%c0, %c1], %mask, %passthru :
-      memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
-    %2 = vector.insert %1, %cst [0] : vector<5xi2> into vector<3x5xi2>
-    return %2 : vector<3x5xi2>
+func.func @vector_load_i2_dynamic_indexing(%arg1: index, %arg2: index) -> vector<3xi2> {
+  %0 = memref.alloc() : memref<3x3xi2>
+  %cst = arith.constant dense<0> : vector<3x3xi2>
+  %1 = vector.load %0[%arg1, %arg2] : memref<3x3xi2>, vector<3xi2>
+  return %1 : vector<3xi2>
 }
 
+// CHECK: func @vector_load_i2_dynamic_indexing
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[LOADADDR1:.+]] = affine.apply #map()[%arg0, %arg1]
+// CHECK: %[[LOADADDR2:.+]] = affine.apply #map1()[%arg0, %arg1]
+// CHECK: %[[EMULATED_LOAD:.+]] = vector.load %[[ALLOC]][%[[LOADADDR1]]] : memref<3xi8>, vector<2xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[EMULATED_LOAD]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[ZERO:.+]] = arith.constant dense<0> : vector<3xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BITCAST]][%[[LOADADDR2]]] : i2 from vector<8xi2>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[OFFSET:.+]] = arith.addi %[[LOADADDR2]], %[[C1]] : index
+// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST]][%[[OFFSET]]] : i2 from vector<8xi2>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[OFFSET2:.+]] = arith.addi %[[LOADADDR2]], %[[C2]] : index
+// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[BITCAST]][%[[OFFSET2]]] : i2 from vector<8xi2>
 
-// CHECK: func @vector_cst_maskedload_i2_unaligned
-// CHECK: %[[ORIGINMASK:.+]] = vector.constant_mask [3] : vector<5xi1>
-// CHECK: %[[NEWMASK:.+]] = arith.constant dense<[true, false]> : vector<2xi1>
-// CHECK: %[[VESSEL:.+]] = arith.constant dense<0> : vector<8xi2>
-// CHECK: %[[INSERT1:.+]] = vector.insert_strided_slice %arg0, %[[VESSEL]]
-// CHECK-SAME: {offsets = [1], strides = [1]} : vector<5xi2> into vector<8xi2>
-// CHECK: %[[BITCAST1:.+]] = vector.bitcast %[[INSERT1]] : vector<8xi2> to vector<2xi8>
-// CHECK: %[[C0:.+]] = arith.constant 0 : index
-// CHECK: %[[MASKEDLOAD:.+]] = vector.maskedload %alloc[%[[C0]]], %[[NEWMASK:.+]], %[[BITCAST1]]
-// CHECK-SAME: : memref<4xi8>, vector<2xi1>, vector<2xi8> into vector<2xi8>
-// CHECK: %[[BITCAST2:.+]] = vector.bitcast %[[MASKEDLOAD]] : vector<2xi8> to vector<8xi2>
-// CHECK: %[[CST2:.+]] = arith.constant dense<false> : vector<8xi1>
-// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[ORIGINMASK]], %[[CST2]]
-// CHECK-SAME: {offsets = [1], strides = [1]} : vector<5xi1> into vector<8xi1>
-// CHECK: %[[SELECT:.+]] = arith.select %[[INSERT2]], %[[BITCAST2]], %[[INSERT1]] : vector<8xi1>, vector<8xi2>
-// CHECK: vector.extract_strided_slice %[[SELECT]] {offsets = [1], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2>
+//-----
+
+func.func @vector_transfer_read_i2_dynamic_indexing(%arg1: index, %arg2: index) -> vector<3xi2> {
+  %0 = memref.alloc() : memref<3x3xi2>
+  %pad = arith.constant 0 : i2
+  %1 = vector.transfer_read %0[%arg1, %arg2], %pad {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
+  return %1 : vector<3xi2>
+}
+
+// CHECK: func @vector_transfer_read_i2_dynamic_indexing
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[C0:.+]] = arith.extui %{{.+}} : i2 to i8
+// CHECK: %[[LOADADDR1:.+]] = affine.apply #map()[%arg0, %arg1]
+// CHECK: %[[LOADADDR2:.+]] = affine.apply #map1()[%arg0, %arg1]
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[ALLOC]][%[[LOADADDR1]]], %[[C0]] : memref<3xi8>, vector<2xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[READ]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[CST:.+]] = arith.constant dense<0> : vector<3xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BITCAST]][%[[LOADADDR2]]] : i2 from vector<8xi2>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[ADDI:.+]] = arith.addi %[[LOADADDR2]], %[[C1]] : index
+// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST]][%[[ADDI]]] : i2 from vector<8xi2>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[ADDI2:.+]] = arith.addi %[[LOADADDR2]], %[[C2]] : index
+// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[BITCAST]][%[[ADDI2]]] : i2 from vector<8xi2>