Update according comments

lialan · lialan · commit 70de874277eb · 2025-01-24T10:54:07.000Z
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -296,38 +296,49 @@ static VectorValue emulatedVectorLoad(OpBuilder &rewriter, Location loc,
       newLoad);
 }
 
-/// Selects values from two sources based on a mask, and casts the result to a
-/// new type.
-static Value selectAndCast(OpBuilder &builder, Location loc,
-                           VectorType castIntoType, Value mask, Value trueValue,
-                           Value falseValue) {
-  Value maskedValue =
+/// Downcast two values to `downcastType`, then select values
+/// based on `mask`, and casts the result to `upcastType`.
+static Value downcastSelectAndUpcast(OpBuilder &builder, Location loc,
+                                     VectorType downcastType,
+                                     VectorType upcastType, Value mask,
+                                     Value trueValue, Value falseValue) {
+  assert(
+      downcastType.getNumElements() * downcastType.getElementTypeBitWidth() ==
+          upcastType.getNumElements() * upcastType.getElementTypeBitWidth() &&
+      "expected upcastType size to be twice the size of downcastType");
+  if (trueValue.getType() != downcastType)
+    trueValue = builder.create<vector::BitCastOp>(loc, downcastType, trueValue);
+  if (falseValue.getType() != downcastType)
+    falseValue =
+        builder.create<vector::BitCastOp>(loc, downcastType, falseValue);
+  Value selectedType =
       builder.create<arith::SelectOp>(loc, mask, trueValue, falseValue);
-  return builder.create<vector::BitCastOp>(loc, castIntoType, maskedValue);
+  // Upcast the selected value to the new type.
+  return builder.create<vector::BitCastOp>(loc, upcastType, selectedType);
 }
 
 /// Emits `memref.generic_atomic_rmw` op to store a subbyte-sized value to a
-/// byte in memory, with a mask. The `valueToStore` is a vector of subbyte-sized
-/// elements, with size of 8 bits, and the mask is used to select which elements
-/// to store.
+/// byte in `linearizedMemref`, with a mask. The `valueToStore` is a vector of
+/// subbyte-sized elements, with size of 8 bits, and the mask is used to select
+/// which elements to store.
 ///
 /// Inputs:
 ///   linearizedMemref = |2|2|2|2| : <4xi2> (<1xi8>)
-///   linearizedIndex = 2
+///   storeIdx = 2
 ///   valueToStore = |3|3|3|3| : vector<4xi2>
 ///   mask = |0|0|1|1| : vector<4xi1>
 ///
 /// Result:
 ///   linearizedMemref = |2|2|3|3| : <4xi2> (<1xi8>)
 static void atomicStore(OpBuilder &builder, Location loc,
-                        MemRefValue linearizedMemref, Value linearizedIndex,
+                        MemRefValue linearizedMemref, Value storeIdx,
                         VectorValue valueToStore, Value mask) {
   assert(valueToStore.getType().getRank() == 1 && "expected 1-D vector");
 
   // Create an atomic load-modify-write region using
   // `memref.generic_atomic_rmw`.
   auto atomicOp = builder.create<memref::GenericAtomicRMWOp>(
-      loc, linearizedMemref, ValueRange{linearizedIndex});
+      loc, linearizedMemref, ValueRange{storeIdx});
   Value origValue = atomicOp.getCurrentValue();
 
   OpBuilder::InsertionGuard guard(builder);
@@ -338,30 +349,30 @@ static void atomicStore(OpBuilder &builder, Location loc,
   auto oneElemVecType = VectorType::get({1}, origValue.getType());
   Value origVecValue = builder.create<vector::FromElementsOp>(
       loc, oneElemVecType, ValueRange{origValue});
-  origVecValue = builder.create<vector::BitCastOp>(loc, valueToStore.getType(),
-                                                   origVecValue);
 
   // Construct the final masked value and yield it.
-  Value maskedValue = selectAndCast(builder, loc, oneElemVecType, mask,
-                                    valueToStore, origVecValue);
+  Value maskedValue =
+      downcastSelectAndUpcast(builder, loc, valueToStore.getType(),
+                              oneElemVecType, mask, valueToStore, origVecValue);
   auto scalarMaskedValue =
       builder.create<vector::ExtractOp>(loc, maskedValue, 0);
   builder.create<memref::AtomicYieldOp>(loc, scalarMaskedValue);
 }
 
-/// Extract `sliceNumElements` from source `vector` at `sliceOffset`,
-/// and insert it into an empty vector at offset `byteOffset`.
+/// Extract `sliceNumElements` from source `vector` at `extractOffset`,
+/// and insert it into an empty vector at `insertOffset`.
 /// Inputs:
-///   vector = |1|2|3|4| : vector<4xi2>
-///   sliceOffset = 1
+///   vec_in  = |0|1|2|3| : vector<4xi2>
+///   extractOffset = 1
 ///   sliceNumElements = 2
-///   byteOffset = 2
+///   insertOffset = 2
 /// Output:
-///   vector = |0|0|2|3| : vector<4xi2>
+///   vec_out = |0|0|1|2| : vector<4xi2>
 static Value extractSliceIntoByte(ConversionPatternRewriter &rewriter,
                                   Location loc, VectorValue vector,
-                                  int64_t sliceOffset, int64_t sliceNumElements,
-                                  int64_t byteOffset) {
+                                  int64_t extractOffset,
+                                  int64_t sliceNumElements,
+                                  int64_t insertOffset) {
   assert(vector.getType().getRank() == 1 && "expected 1-D vector");
   auto vectorElementType = vector.getType().getElementType();
   assert(
@@ -374,9 +385,9 @@ static Value extractSliceIntoByte(ConversionPatternRewriter &rewriter,
       loc, VectorType::get({scale}, vectorElementType),
       rewriter.getZeroAttr(VectorType::get({scale}, vectorElementType)));
   auto extracted = staticallyExtractSubvector(rewriter, loc, vector,
-                                              sliceOffset, sliceNumElements);
+                                              extractOffset, sliceNumElements);
   return staticallyInsertSubvector(rewriter, loc, extracted, emptyByteVector,
-                                   byteOffset);
+                                   insertOffset);
 }
 
 namespace {
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
@@ -361,25 +361,27 @@ func.func @vector_maskedload_i2_constant_mask_unaligned(%passthru: vector<5xi2>)
 /// vector.store
 ///----------------------------------------------------------------------------------------
 
-func.func @vector_store_i2_const_index_two_atomic(%arg0: vector<3xi2>) {
-    %0 = memref.alloc() : memref<3x3xi2>
+func.func @vector_store_i2_const_index_two_atomic_rmw(%arg0: vector<3xi2>) {
+    %src = memref.alloc() : memref<3x3xi2>
     %c0 = arith.constant 0 : index
     %c2 = arith.constant 2 : index
-    vector.store %arg0, %0[%c2, %c0] :memref<3x3xi2>, vector<3xi2>
+    vector.store %arg0, %src[%c2, %c0] :memref<3x3xi2>, vector<3xi2>
     return
 }
 
 // In this example, emit 2 atomic RMWs.
-// Load from bit [12:18), byte [1:2] of total 3 bytes, both bytes needs rmw.
+//
+// Note, sizeof(%src) = 18 bits. This is modelled as %src_as_bytes:
+// <3xi8> (bits [0, 18) with the input values from %src, and [18, 24) are masked out)
 
-// CHECK-LABEL: func @vector_store_i2_const_index_two_atomic(
+// CHECK-LABEL: func @vector_store_i2_const_index_two_atomic_rmw(
 // CHECK-SAME: %[[ARG0:.+]]: vector<3xi2>)
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
 // CHECK: %[[C1:.+]] = arith.constant 1 : index
 // CHECK: %[[CST:.+]] = arith.constant dense<[false, false, true, true]> : vector<4xi1>
 // CHECK: %[[CST_0:.+]] = arith.constant dense<0> : vector<4xi2>
 
-// Part 1 atomic RMW sequence
+// Part 1 atomic RMW sequence (load bits [12, 16) from %src_as_bytes[1])
 // CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[ARG0]]
 // CHECK-SAME: {offsets = [0], sizes = [2], strides = [1]} : vector<3xi2> to vector<2xi2>
 // CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[EXTRACT]], %[[CST_0]]
@@ -393,7 +395,7 @@ func.func @vector_store_i2_const_index_two_atomic(%arg0: vector<3xi2>) {
 // CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST2]][0] : i8 from vector<1xi8>
 // CHECK: memref.atomic_yield %[[EXTRACT2]] : i8
 
-// Part 2 atomic RMW sequence
+// Part 2 atomic RMW sequence (load bits [16, 18) from %src_as_bytes[2])
 // CHECK: %[[ADDR2:.+]] = arith.addi %[[C1]], %[[C1]] : index
 // CHECK: %[[EXTRACT3:.+]] = vector.extract_strided_slice %[[ARG0]]
 // CHECK-SAME: {offsets = [2], sizes = [1], strides = [1]} : vector<3xi2> to vector<1xi2>
@@ -411,7 +413,7 @@ func.func @vector_store_i2_const_index_two_atomic(%arg0: vector<3xi2>) {
 
 // -----
 
-func.func @vector_store_i2_atomic(%arg0: vector<7xi2>) {
+func.func @vector_store_i2_atomic_rmw(%arg0: vector<7xi2>) {
     %0 = memref.alloc() : memref<3x7xi2>
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -420,7 +422,7 @@ func.func @vector_store_i2_atomic(%arg0: vector<7xi2>) {
 }
 
 // In this example, emit 2 atomic RMWs and 1 non-atomic store:
-// CHECK-LABEL: func @vector_store_i2_atomic(
+// CHECK-LABEL: func @vector_store_i2_atomic_rmw(
 // CHECK-SAME: %[[ARG0:.+]]: vector<7xi2>)
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<6xi8>
 // CHECK: %[[C1:.+]] = arith.constant 1 : index
@@ -467,7 +469,7 @@ func.func @vector_store_i2_atomic(%arg0: vector<7xi2>) {
 
 // -----
 
-func.func @vector_store_i2_single_atomic(%arg0: vector<1xi2>) {
+func.func @vector_store_i2_const_index_one_atomic_rmw(%arg0: vector<1xi2>) {
     %0 = memref.alloc() : memref<4x1xi2>
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -476,7 +478,7 @@ func.func @vector_store_i2_single_atomic(%arg0: vector<1xi2>) {
 }
 
 // In this example, only emit 1 atomic store
-// CHECK-LABEL: func @vector_store_i2_single_atomic(
+// CHECK-LABEL: func @vector_store_i2_const_index_one_atomic_rmw(
 // CHECK-SAME: %[[ARG0:.+]]: vector<1xi2>)
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1xi8>
 // CHECK: %[[C0:.+]] = arith.constant 0 : index