diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 98d98f067de14..8b70a6b60a1ec 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5099,6 +5099,10 @@ LogicalResult vector::LoadOp::verify() {
   if (failed(verifyLoadStoreMemRefLayout(*this, resVecTy, memRefTy)))
     return failure();
 
+  if (memRefTy.getRank() < resVecTy.getRank())
+    return emitOpError(
+        "destination memref has lower rank than the result vector");
+
   // Checks for vector memrefs.
   Type memElemTy = memRefTy.getElementType();
   if (auto memVecTy = llvm::dyn_cast<VectorType>(memElemTy)) {
@@ -5131,6 +5135,9 @@ LogicalResult vector::StoreOp::verify() {
   if (failed(verifyLoadStoreMemRefLayout(*this, valueVecTy, memRefTy)))
     return failure();
 
+  if (memRefTy.getRank() < valueVecTy.getRank())
+    return emitOpError("source memref has lower rank than the vector to store");
+
   // Checks for vector memrefs.
   Type memElemTy = memRefTy.getElementType();
   if (auto memVecTy = llvm::dyn_cast<VectorType>(memElemTy)) {
diff --git a/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir b/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
index 0f973af799634..c8a434bb8f5de 100644
--- a/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
+++ b/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
@@ -718,18 +718,6 @@ func.func @vector_load_i8_with_offset(%arg0 : memref<?x?xi8>) -> vector<[16]x[16
 
 // -----
 
-// CHECK-LABEL: @vector_load_i8_from_rank_1_memref(
-// CHECK-SAME: %[[MEMREF:.*]]: memref<?xi8>)
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: arm_sme.tile_load %[[MEMREF]][%[[C0]]] : memref<?xi8>, vector<[16]x[16]xi8>
-func.func @vector_load_i8_from_rank_1_memref(%arg0 : memref<?xi8>) -> vector<[16]x[16]xi8> {
-  %c0 = arith.constant 0 : index
-  %tile = vector.load %arg0[%c0] : memref<?xi8>, vector<[16]x[16]xi8>
-  return %tile : vector<[16]x[16]xi8>
-}
-
-// -----
-
 // CHECK-LABEL: @vector_load_i16(
 // CHECK: arm_sme.tile_load {{.*}} : memref<?x?xi16>, vector<[8]x[8]xi16>
 func.func @vector_load_i16(%arg0 : memref<?x?xi16>) -> vector<[8]x[8]xi16> {
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 067cdb5c5fd20..3160fd9c65c04 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -819,18 +819,29 @@ func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: ind
 
 // -----
 
-func.func @fold_vector_load_subview(
-    %arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index) -> vector<12x32xf32> {
-  %0 = memref.subview %arg0[%arg1, %arg2][1, 1][1, 1] : memref<12x32xf32> to memref<f32, strided<[], offset: ?>>
-  %1 = vector.load %0[] : memref<f32, strided<[], offset: ?>>, vector<12x32xf32>
-  return %1 : vector<12x32xf32>
+func.func @fold_vector_load_subview(%src : memref<24x64xf32>,
+                                    %off1 : index,
+                                    %off2 : index,
+                                    %dim1 : index,
+                                    %dim2 : index,
+                                    %idx : index) -> vector<12x32xf32> {
+
+  %0 = memref.subview %src[%off1, %off2][%dim1, %dim2][1, 1] : memref<24x64xf32> to memref<?x?xf32, strided<[64, 1], offset: ?>>
+  %1 = vector.load %0[%idx, %idx] : memref<?x?xf32, strided<[64, 1], offset: ?>>, vector<12x32xf32>
+  return %1 : vector<12x32xf32>
 }
 
-// CHECK:      func @fold_vector_load_subview
-// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32>
-// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
-// CHECK:      vector.load %[[ARG0]][%[[ARG1]], %[[ARG2]]] : memref<12x32xf32>, vector<12x32xf32>
+// CHECK: #[[$ATTR_46:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-LABEL:   func.func @fold_vector_load_subview(
+// CHECK-SAME:      %[[SRC:[a-zA-Z0-9$._-]*]]: memref<24x64xf32>,
+// CHECK-SAME:      %[[OFF_1:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[OFF_2:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[DIM_1:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[DIM_2:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[IDX:[a-zA-Z0-9$._-]*]]: index) -> vector<12x32xf32> {
+// CHECK:           %[[VAL_6:.*]] = affine.apply #[[$ATTR_46]](){{\[}}%[[OFF_1]], %[[IDX]]]
+// CHECK:           %[[VAL_7:.*]] = affine.apply #[[$ATTR_46]](){{\[}}%[[OFF_2]], %[[IDX]]]
+// CHECK:           %[[VAL_8:.*]] = vector.load %[[SRC]]{{\[}}%[[VAL_6]], %[[VAL_7]]] : memref<24x64xf32>, vector<12x32xf32>
 
 // -----
 
@@ -851,20 +862,32 @@ func.func @fold_vector_maskedload_subview(
 
 // -----
 
-func.func @fold_vector_store_subview(
-    %arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3: vector<2x32xf32>) -> () {
-  %0 = memref.subview %arg0[%arg1, %arg2][1, 1][1, 1] : memref<12x32xf32> to memref<f32, strided<[], offset: ?>>
-  vector.store %arg3, %0[] : memref<f32, strided<[], offset: ?>>, vector<2x32xf32>
-  return
+func.func @fold_vector_store_subview(%src : memref<24x64xf32>,
+                                     %off1 : index,
+                                     %off2 : index,
+                                     %vec: vector<2x32xf32>,
+                                     %idx : index,
+                                     %dim1 : index,
+                                     %dim2 : index) -> () {
+
+  %0 = memref.subview %src[%off1, %off2][%dim1, %dim2][1, 1] : memref<24x64xf32> to memref<?x?xf32, strided<[64, 1], offset: ?>>
+  vector.store %vec, %0[%idx, %idx] : memref<?x?xf32, strided<[64, 1], offset: ?>>, vector<2x32xf32>
+  return
 }
 
-// CHECK:      func @fold_vector_store_subview
-// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<12x32xf32>
-// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: vector<2x32xf32>
-// CHECK:      vector.store %[[ARG3]], %[[ARG0]][%[[ARG1]], %[[ARG2]]] : memref<12x32xf32>, vector<2x32xf32>
-// CHECK:      return
+// CHECK: #[[$ATTR_47:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+
+// CHECK-LABEL:   func.func @fold_vector_store_subview(
+// CHECK-SAME:      %[[SRC:[a-zA-Z0-9$._-]*]]: memref<24x64xf32>,
+// CHECK-SAME:      %[[OFF1:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[OFF_2:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[VEC:[a-zA-Z0-9$._-]*]]: vector<2x32xf32>,
+// CHECK-SAME:      %[[IDX:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[VAL_5:[a-zA-Z0-9$._-]*]]: index,
+// CHECK-SAME:      %[[VAL_6:[a-zA-Z0-9$._-]*]]: index) {
+// CHECK:           %[[VAL_7:.*]] = affine.apply #[[$ATTR_47]](){{\[}}%[[OFF1]], %[[IDX]]]
+// CHECK:           %[[VAL_8:.*]] = affine.apply #[[$ATTR_47]](){{\[}}%[[OFF_2]], %[[IDX]]]
+// CHECK:           vector.store %[[VEC]], %[[SRC]]{{\[}}%[[VAL_7]], %[[VAL_8]]] : memref<24x64xf32>, vector<2x32xf32>
 
 // -----
 
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index ea6d0021391fb..f7192fbf68b4e 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1743,13 +1743,11 @@ func.func @invalid_outerproduct(%src : memref<?xf32>) {
 
 // -----
 
-func.func @invalid_outerproduct1(%src : memref<?xf32>) {
+func.func @invalid_outerproduct1(%src : memref<?xf32>, %lhs : vector<[4]x[4]xf32>, %rhs : vector<[4]xf32>) {
   %idx = arith.constant 0 : index
-  %0 = vector.load %src[%idx] : memref<?xf32>, vector<[4]x[4]xf32>
-  %1 = vector.load %src[%idx] : memref<?xf32>, vector<[4]xf32>
 
   // expected-error @+1 {{'vector.outerproduct' op expected 1-d vector for operand #1}}
-  %op = vector.outerproduct %0, %1 : vector<[4]x[4]xf32>, vector<[4]xf32>
+  %op = vector.outerproduct %lhs, %rhs : vector<[4]x[4]xf32>, vector<[4]xf32>
 }
 
 // -----
@@ -1870,3 +1868,29 @@ func.func @flat_transpose_scalable(%arg0: vector<[16]xf32>) -> vector<[16]xf32>
      : vector<[16]xf32> -> vector<[16]xf32>
   return %0 : vector<[16]xf32>
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// vector.load
+//===----------------------------------------------------------------------===//
+
+func.func @vector_load(%src : memref<?xi8>) {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{'vector.load' op destination memref has lower rank than the result vector}}
+  %0 = vector.load %src[%c0] : memref<?xi8>, vector<16x16xi8>
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// vector.store
+//===----------------------------------------------------------------------===//
+
+func.func @vector_store(%dest : memref<?xi8>, %vec : vector<16x16xi8>) {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{'vector.store' op source memref has lower rank than the vector to store}}
+  vector.store %vec, %dest[%c0] : memref<?xi8>, vector<16x16xi8>
+  return
+}
diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
index fd50acf03e79b..511ab70f35086 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
@@ -2,8 +2,8 @@
 
 // CHECK-LABEL: func @vector_transfer_ops_0d_memref(
 // CHECK-SAME:    %[[MEM:.*]]: memref<f32>
-// CHECK-SAME:    %[[VEC:.*]]: vector<1x1x1xf32>
-func.func @vector_transfer_ops_0d_memref(%mem: memref<f32>, %vec: vector<1x1x1xf32>) {
+// CHECK-SAME:    %[[VEC:.*]]: vector<f32>
+func.func @vector_transfer_ops_0d_memref(%mem: memref<f32>, %vec: vector<f32>) {
   %f0 = arith.constant 0.0 : f32
 
 // CHECK-NEXT:   %[[S:.*]] = vector.load %[[MEM]][] : memref<f32>, vector<f32>
@@ -12,8 +12,8 @@ func.func @vector_transfer_ops_0d_memref(%mem: memref<f32>, %vec: vector<1x1x1xf32>
 // CHECK-NEXT:   vector.store %[[S]], %[[MEM]][] : memref<f32>, vector<f32>
   vector.transfer_write %0, %mem[] : vector<f32>, memref<f32>
 
-// CHECK-NEXT:   vector.store %[[VEC]], %[[MEM]][] : memref<f32>, vector<1x1x1xf32>
-  vector.store %vec, %mem[] : memref<f32>, vector<1x1x1xf32>
+// CHECK-NEXT:   vector.store %[[VEC]], %[[MEM]][] : memref<f32>, vector<f32>
+  vector.store %vec, %mem[] : memref<f32>, vector<f32>
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transpose.mlir
index ff20f99b63cd1..8188e66ce0662 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transpose.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/transpose.mlir
@@ -14,10 +14,9 @@ func.func @entry() {
 
   // Calculate the size of a 32-bit tile, e.g. ZA{n}.s.
   %svl_s = arm_sme.streaming_vl
-  %za_s_size = arith.muli %svl_s, %svl_s : index
 
   // Allocate memory.
-  %mem1 = memref.alloca(%za_s_size) : memref<?xi32>
+  %mem1 = memref.alloca(%svl_s, %svl_s) : memref<?x?xi32>
 
   // Fill each "row" of "mem1" with row number.
   //
@@ -29,15 +28,15 @@ func.func @entry() {
   //  3, 3, 3, 3
   //
   %init_0 = arith.constant 0 : i32
-  scf.for %i = %c0 to %za_s_size step %svl_s iter_args(%val = %init_0) -> (i32) {
+  scf.for %i = %c0 to %svl_s step %c1 iter_args(%val = %init_0) -> (i32) {
     %splat_val = vector.broadcast %val : i32 to vector<[4]xi32>
-    vector.store %splat_val, %mem1[%i] : memref<?xi32>, vector<[4]xi32>
+    vector.store %splat_val, %mem1[%i, %c0] : memref<?x?xi32>, vector<[4]xi32>
     %val_next = arith.addi %val, %c1_i32 : i32
     scf.yield %val_next : i32
   }
 
   // Load tile from "mem1".
-  %tile = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+  %tile = vector.load %mem1[%c0, %c0] : memref<?x?xi32>, vector<[4]x[4]xi32>
 
   // Transpose tile.
   %transposed_tile = vector.transpose %tile, [1, 0] : vector<[4]x[4]xi32> to vector<[4]x[4]xi32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
index 6e25bee65f095..b69a200b2a49a 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
@@ -33,12 +33,11 @@ func.func @za0_d_f64() -> i32 {
   //  2.1, 2.1, 2.1, 2.1
   //  3.1, 3.1, 3.1, 3.1
   //
-  %tilesize = arith.muli %svl_d, %svl_d : index
-  %mem1 = memref.alloca(%tilesize) : memref<?xf64>
+  %mem1 = memref.alloca(%svl_d, %svl_d) : memref<?x?xf64>
   %init_0 = arith.constant 0.1 : f64
-  scf.for %i = %c0 to %tilesize step %svl_d iter_args(%val = %init_0) -> (f64) {
+  scf.for %i = %c0 to %svl_d step %c1_index iter_args(%val = %init_0) -> (f64) {
     %splat_val = vector.broadcast %val : f64 to vector<[2]xf64>
-    vector.store %splat_val, %mem1[%i] : memref<?xf64>, vector<[2]xf64>
+    vector.store %splat_val, %mem1[%i, %c0] : memref<?x?xf64>, vector<[2]xf64>
     %val_next = arith.addf %val, %c1_f64 : f64
     scf.yield %val_next : f64
   }
@@ -48,27 +47,29 @@ func.func @za0_d_f64() -> i32 {
   //
   // CHECK-ZA0_D:      ( 0.1, 0.1
   // CHECK-ZA0_D-NEXT: ( 1.1, 1.1
-  scf.for %i = %c0 to %tilesize step %svl_d {
-    %tileslice = vector.load %mem1[%i] : memref<?xf64>, vector<[2]xf64>
+  scf.for %i = %c0 to %svl_d step %c1_index {
+    %tileslice = vector.load %mem1[%i, %c0] : memref<?x?xf64>, vector<[2]xf64>
     vector.print %tileslice : vector<[2]xf64>
   }
 
   // Load ZA0.D from "mem1"
-  %za0_d = vector.load %mem1[%c0] : memref<?xf64>, vector<[2]x[2]xf64>
+  %za0_d = vector.load %mem1[%c0, %c0] : memref<?x?xf64>, vector<[2]x[2]xf64>
 
   // Allocate "mem2" to store ZA0.D to
-  %mem2 = memref.alloca(%tilesize) : memref<?xf64>
+  %mem2 = memref.alloca(%svl_d, %svl_d) : memref<?x?xf64>
 
   // Zero "mem2"
-  scf.for %i = %c0 to %tilesize step %c1_index {
-    memref.store %c0_f64, %mem2[%i] : memref<?xf64>
+  scf.for %i = %c0 to %svl_d step %c1_index {
+    scf.for %j = %c0 to %svl_d step %c1_index {
+      memref.store %c0_f64, %mem2[%i, %j] : memref<?x?xf64>
+    }
   }
 
   // Verify "mem2" is zeroed by doing an add reduction with initial value of
   // zero
   %init_0_f64 = arith.constant 0.0 : f64
-  %add_reduce = scf.for %vnum = %c0 to %tilesize step %svl_d iter_args(%iter = %init_0_f64) -> (f64) {
-    %row = vector.load %mem2[%vnum] : memref<?xf64>, vector<[2]xf64>
+  %add_reduce = scf.for %vnum = %c0 to %svl_d step %c1_index iter_args(%iter = %init_0_f64) -> (f64) {
+    %row = vector.load %mem2[%vnum, %c0] : memref<?x?xf64>, vector<[2]xf64>
 
     %inner_add_reduce = scf.for %offset = %c0 to %svl_d step %c1_index iter_args(%inner_iter = %init_0_f64) -> (f64) {
       %t = vector.extractelement %row[%offset : index] : vector<[2]xf64>
@@ -88,16 +89,16 @@ func.func @za0_d_f64() -> i32 {
   //
   // CHECK-ZA0_D-NEXT: ( 0, 0
   // CHECK-ZA0_D-NEXT: ( 0, 0
-  scf.for %i = %c0 to %tilesize step %svl_d {
-    %tileslice = vector.load %mem2[%i] : memref<?xf64>, vector<[2]xf64>
+  scf.for %i = %c0 to %svl_d step %c1_index {
+    %tileslice = vector.load %mem2[%i, %c0] : memref<?x?xf64>, vector<[2]xf64>
     vector.print %tileslice : vector<[2]xf64>
   }
 
   // Verify "mem1" != "mem2"
   %init_1 = arith.constant 1 : i64
-  %mul_reduce_0 = scf.for %vnum = %c0 to %tilesize step %svl_d iter_args(%iter = %init_1) -> (i64) {
-    %row_1 = vector.load %mem1[%vnum] : memref<?xf64>, vector<[2]xf64>
-    %row_2 = vector.load %mem2[%vnum] : memref<?xf64>, vector<[2]xf64>
+  %mul_reduce_0 = scf.for %vnum = %c0 to %svl_d step %c1_index iter_args(%iter = %init_1) -> (i64) {
+    %row_1 = vector.load %mem1[%vnum, %c0] : memref<?x?xf64>, vector<[2]xf64>
+    %row_2 = vector.load %mem2[%vnum, %c0] : memref<?x?xf64>, vector<[2]xf64>
     %cmp = arith.cmpf one, %row_1, %row_2 : vector<[2]xf64>
 
     %inner_mul_reduce = scf.for %i = %c0 to %svl_d step %c1_index iter_args(%inner_iter = %init_1) -> (i64) {
@@ -115,12 +116,12 @@ func.func @za0_d_f64() -> i32 {
   vector.print %mul_reduce_0 : i64
 
   // Store ZA0.D to "mem2"
-  vector.store %za0_d, %mem2[%c0] : memref<?xf64>, vector<[2]x[2]xf64>
+  vector.store %za0_d, %mem2[%c0, %c0] : memref<?x?xf64>, vector<[2]x[2]xf64>
 
   // Verify "mem1" == "mem2"
-  %mul_reduce_1 = scf.for %vnum = %c0 to %tilesize step %svl_d iter_args(%iter = %init_1) -> (i64) {
-    %row_1 = vector.load %mem1[%vnum] : memref<?xf64>, vector<[2]xf64>
-    %row_2 = vector.load %mem2[%vnum] : memref<?xf64>, vector<[2]xf64>
+  %mul_reduce_1 = scf.for %vnum = %c0 to %svl_d step %c1_index iter_args(%iter = %init_1) -> (i64) {
+    %row_1 = vector.load %mem1[%vnum, %c0] : memref<?x?xf64>, vector<[2]xf64>
+    %row_2 = vector.load %mem2[%vnum, %c0] : memref<?x?xf64>, vector<[2]xf64>
     %cmp = arith.cmpf oeq, %row_1, %row_2 : vector<[2]xf64>
 
     %inner_mul_reduce = scf.for %i = %c0 to %svl_d step %c1_index iter_args(%inner_iter = %init_1) -> (i64) {
@@ -142,8 +143,8 @@ func.func @za0_d_f64() -> i32 {
   //
   // CHECK-ZA0_D-NEXT: ( 0.1, 0.1
   // CHECK-ZA0_D-NEXT: ( 1.1, 1.1
-  scf.for %i = %c0 to %tilesize step %svl_d {
-    %tileslice = vector.load %mem2[%i] : memref<?xf64>, vector<[2]xf64>
+  scf.for %i = %c0 to %svl_d step %c1_index {
+    %tileslice = vector.load %mem2[%i, %c0] : memref<?x?xf64>, vector<[2]xf64>
     vector.print %tileslice : vector<[2]xf64>
   }
 
@@ -169,9 +170,8 @@ func.func @load_store_two_za_s_tiles() -> i32 {
   %svl_s = arm_sme.streaming_vl
 
   // Allocate memory for two 32-bit element tiles.
-  %size_of_tile = arith.muli %svl_s, %svl_s : index
-  %size_of_two_tiles = arith.muli %size_of_tile, %c2_index : index
-  %mem1 = memref.alloca(%size_of_two_tiles) : memref<?xi32>
+  %svl_s_x_2 = arith.muli %svl_s, %c2_index : index
+  %mem1 = memref.alloca(%svl_s_x_2, %svl_s) : memref<?x?xi32>
 
   // Fill memory that tile 1 will be loaded from with '1' and '2' for tile 2.
   //
@@ -191,15 +191,15 @@ func.func @load_store_two_za_s_tiles() -> i32 {
   //  2, 2, 2, 2
   //  2, 2, 2, 2
   //
-  scf.for %i = %c0 to %size_of_two_tiles step %svl_s {
-    %isFirstTile = arith.cmpi ult, %i, %size_of_tile : index
+  scf.for %i = %c0 to %svl_s_x_2 step %c1_index {
+    %isFirstTile = arith.cmpi ult, %i, %svl_s : index
     %val = scf.if %isFirstTile -> i32 {
       scf.yield %c1_i32 : i32
     } else {
       scf.yield %c2_i32 : i32
     }
     %splat_val = vector.broadcast %val : i32 to vector<[4]xi32>
-    vector.store %splat_val, %mem1[%i] : memref<?xi32>, vector<[4]xi32>
+    vector.store %splat_val, %mem1[%i, %c0] : memref<?x?xi32>, vector<[4]xi32>
   }
 
   // Dump "mem1". The smallest SVL is 128-bits so each tile will be at least
@@ -213,32 +213,32 @@ func.func @load_store_two_za_s_tiles() -> i32 {
   // CHECK-NEXT: ( 2, 2, 2, 2
   // CHECK-NEXT: ( 2, 2, 2, 2
   // CHECK-NEXT: ( 2, 2, 2, 2
-  scf.for %i = %c0 to %size_of_two_tiles step %svl_s {
-    %tileslice = vector.load %mem1[%i] : memref<?xi32>, vector<[4]xi32>
+  scf.for %i = %c0 to %svl_s_x_2 step %c1_index {
+    %tileslice = vector.load %mem1[%i, %c0] : memref<?x?xi32>, vector<[4]xi32>
     vector.print %tileslice : vector<[4]xi32>
   }
 
   // Load tile 1 from memory
-  %za0_s = vector.load %mem1[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+  %za0_s = vector.load %mem1[%c0, %c0] : memref<?x?xi32>, vector<[4]x[4]xi32>
 
   // Load tile 2 from memory
-  %za1_s = vector.load %mem1[%size_of_tile] : memref<?xi32>, vector<[4]x[4]xi32>
+  %za1_s = vector.load %mem1[%svl_s, %c0] : memref<?x?xi32>, vector<[4]x[4]xi32>
 
   // Allocate new memory to store tiles to
-  %mem2 = memref.alloca(%size_of_two_tiles) : memref<?xi32>
+  %mem2 = memref.alloca(%svl_s_x_2, %svl_s) : memref<?x?xi32>
 
   // Zero new memory
-  scf.for %i = %c0 to %size_of_two_tiles step %c1_index {
-    memref.store %c0_i32, %mem2[%i] : memref<?xi32>
+  scf.for %i = %c0 to %svl_s_x_2 step %c1_index {
+    memref.store %c0_i32, %mem2[%i, %c0] : memref<?x?xi32>
   }
 
   // Stores tiles back to (new) memory in reverse order
 
   // Store tile 2 to memory
-  vector.store %za1_s, %mem2[%c0] : memref<?xi32>, vector<[4]x[4]xi32>
+  vector.store %za1_s, %mem2[%c0, %c0] : memref<?x?xi32>, vector<[4]x[4]xi32>
 
   // Store tile 1 to memory
-  vector.store %za0_s, %mem2[%size_of_tile] : memref<?xi32>, vector<[4]x[4]xi32>
+  vector.store %za0_s, %mem2[%svl_s, %c0] : memref<?x?xi32>, vector<[4]x[4]xi32>
 
   // Dump "mem2" and check the tiles were stored in reverse order. The smallest
   // SVL is 128-bits so the tiles will be at least 4x4xi32.
@@ -256,12 +256,12 @@ func.func @load_store_two_za_s_tiles() -> i32 {
   // CHECK-NEXT: ( 1, 1, 1, 1
   // CHECK: TILE END
   vector.print str "TILE BEGIN\n"
-  scf.for %i = %c0 to %size_of_two_tiles step %svl_s {
-    %av = vector.load %mem2[%i] : memref<?xi32>, vector<[4]xi32>
+  scf.for %i = %c0 to %svl_s_x_2 step %c1_index {
+    %av = vector.load %mem2[%i, %c0] : memref<?x?xi32>, vector<[4]xi32>
     vector.print %av : vector<[4]xi32>
 
-    %tileSizeMinusStep = arith.subi %size_of_tile, %svl_s : index
-    %isNextTile = arith.cmpi eq, %i, %tileSizeMinusStep : index
+    %tileSizeMinusStep = arith.subi %svl_s, %c1_index : index
+    %isNextTile = arith.cmpi eq, %i, %tileSizeMinusStep : index
     scf.if %isNextTile {
       vector.print str "TILE END\n"
      vector.print str "TILE BEGIN\n"