[mlir][vector] Implement speculation for vector.transfer_read/transfer_write ops (#111533)

Groverkss · web-flow · commit 32db6fbdb9a8 · 2024-10-09T13:50:33.000+01:00
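This patch implements the ConditionallySpeculatable interface for
vector.transfer_read and vector.transfer_write. Both ops report themselves as
speculatable only when they have pure tensor semantics; otherwise (e.g. a
memref source) they remain non-speculatable. This allows loop-invariant code
motion (LICM) to hoist the tensor forms of these ops out of loops.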
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1240,6 +1240,7 @@ def Vector_TransferReadOp :
       DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
       DeclareOpInterfaceMethods<MaskableOpInterface>,
       DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+      DeclareOpInterfaceMethods<ConditionallySpeculatable>,
       AttrSizedOperandSegments,
       DestinationStyleOpInterface
     ]>,
@@ -1487,6 +1488,7 @@ def Vector_TransferWriteOp :
       DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
       DeclareOpInterfaceMethods<MaskableOpInterface>,
       DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+      DeclareOpInterfaceMethods<ConditionallySpeculatable>,
       AttrSizedOperandSegments,
       DestinationStyleOpInterface
   ]>,
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4245,6 +4245,12 @@ void TransferReadOp::getEffects(
                          SideEffects::DefaultResource::get());
 }
 
+Speculation::Speculatability TransferReadOp::getSpeculatability() {
+  // Only the pure-tensor form is reported as speculatable (e.g. for LICM).
+  return hasPureTensorSemantics() ? Speculation::Speculatable
+                                  : Speculation::NotSpeculatable;
+}
+
 namespace {
 /// Store to load forwarding for transfer operations with permuation maps.
 /// Even if the permutation maps are different we can still propagate the store
@@ -4627,6 +4633,12 @@ void TransferWriteOp::getEffects(
                          SideEffects::DefaultResource::get());
 }
 
+Speculation::Speculatability TransferWriteOp::getSpeculatability() {
+  // Only the pure-tensor form is reported as speculatable (e.g. for LICM).
+  return hasPureTensorSemantics() ? Speculation::Speculatable
+                                  : Speculation::NotSpeculatable;
+}
+
 namespace {
 /// Remove dead transfer write from the SSA chain so that it an be eliminated by
 /// DCE
diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir
@@ -1209,3 +1209,110 @@ func.func @hoist_linalg_ops_div_by_zero(%a : tensor<128x128xi32>,
 
   func.return %final : tensor<?x128xi32>
 }
+
+// -----
+
+// A tensor transfer_read whose operands are all defined outside the loop is
+// expected to be hoisted above the scf.for.
+// CHECK-LABEL: func @hoist_vector_transfer_ops
+// CHECK: vector.transfer_read
+// CHECK: scf.for
+// CHECK-NOT: vector.transfer_read
+// CHECK: arith.addf
+// CHECK: scf.yield
+func.func @hoist_vector_transfer_ops(
+    %a : tensor<128x128xf32>, %lb : index, %ub : index, %step : index,
+    %ida : index, %idb : index) -> vector<4x4xf32> {
+  %pad = arith.constant 0.0 : f32
+  %zero = arith.constant dense<0.0> : vector<4x4xf32>
+  %final = scf.for %i = %lb to %ub step %step
+      iter_args(%acc = %zero) -> vector<4x4xf32> {
+    // Loop-invariant: neither the tensor nor the indices depend on %i.
+    %vec = vector.transfer_read %a[%ida, %idb], %pad
+        : tensor<128x128xf32>, vector<4x4xf32>
+    %sum = arith.addf %vec, %acc : vector<4x4xf32>
+    scf.yield %sum : vector<4x4xf32>
+  }
+  func.return %final : vector<4x4xf32>
+}
+
+// -----
+
+// A tensor transfer_write and the transfer_read of its result are both
+// loop-invariant, so both are expected to be hoisted above the scf.for.
+// CHECK-LABEL: func @hoist_vector_transfer_write_ops
+// CHECK: vector.transfer_write
+// CHECK: vector.transfer_read
+// CHECK: scf.for
+// CHECK-NOT: vector.transfer_write
+// CHECK-NOT: vector.transfer_read
+// CHECK: arith.addf
+// CHECK: scf.yield
+func.func @hoist_vector_transfer_write_ops(
+    %lb : index, %ub : index, %step : index,
+    %ida : index, %idb : index) -> vector<4x4xf32> {
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 0.0 : f32
+  %cst = arith.constant dense<0.0> : vector<4x4xf32>
+  %empty = tensor.empty() : tensor<4x4xf32>
+  %final = scf.for %i = %lb to %ub step %step
+      iter_args(%acc = %cst) -> vector<4x4xf32> {
+    // Neither transfer op uses %i or %acc; only the addf depends on the loop.
+    %a = vector.transfer_write %cst, %empty[%c0, %c0] : vector<4x4xf32>, tensor<4x4xf32>
+    %read = vector.transfer_read %a[%c0, %c0], %cst_0 : tensor<4x4xf32>, vector<4x4xf32>
+    %out = arith.addf %read, %acc : vector<4x4xf32>
+    scf.yield %out : vector<4x4xf32>
+  }
+  func.return %final : vector<4x4xf32>
+}
+
+// -----
+
+// The read is indexed by the induction variable %i, so it is expected to stay
+// inside the loop.
+// CHECK-LABEL: func @do_not_hoist_vector_transfer_ops_loop_dep
+// CHECK-NOT: vector.transfer_read
+// CHECK: scf.for
+// CHECK: vector.transfer_read
+// CHECK: arith.addf
+// CHECK: scf.yield
+func.func @do_not_hoist_vector_transfer_ops_loop_dep(
+    %a : tensor<128x128xf32>, %lb : index, %ub : index, %step : index,
+    %ida : index) -> vector<4x4xf32> {
+  %pad = arith.constant 0.0 : f32
+  %zero = arith.constant dense<0.0> : vector<4x4xf32>
+  %final = scf.for %i = %lb to %ub step %step
+      iter_args(%acc = %zero) -> vector<4x4xf32> {
+    %vec = vector.transfer_read %a[%ida, %i], %pad
+        : tensor<128x128xf32>, vector<4x4xf32>
+    %sum = arith.addf %vec, %acc : vector<4x4xf32>
+    scf.yield %sum : vector<4x4xf32>
+  }
+  func.return %final : vector<4x4xf32>
+}
+
+// -----
+
+// The operands are defined outside the loop, but the source is a memref rather
+// than a tensor; the read is expected to stay inside the loop.
+// CHECK-LABEL: func @do_not_hoist_vector_transfer_ops_memref
+// CHECK-NOT: vector.transfer_read
+// CHECK: scf.for
+// CHECK: vector.transfer_read
+// CHECK: arith.addf
+// CHECK: scf.yield
+func.func @do_not_hoist_vector_transfer_ops_memref(
+    %a : memref<128x128xf32>, %lb : index, %ub : index, %step : index,
+    %ida : index, %idb : index) -> vector<4x4xf32> {
+  %pad = arith.constant 0.0 : f32
+  %zero = arith.constant dense<0.0> : vector<4x4xf32>
+  %final = scf.for %i = %lb to %ub step %step
+      iter_args(%acc = %zero) -> vector<4x4xf32> {
+    // Memref source: expected to remain in the loop.
+    %vec = vector.transfer_read %a[%ida, %idb], %pad
+        : memref<128x128xf32>, vector<4x4xf32>
+    %sum = arith.addf %vec, %acc : vector<4x4xf32>
+    scf.yield %sum : vector<4x4xf32>
+  }
+  func.return %final : vector<4x4xf32>
+}