Commit 8623f73

Optimize struct RMW ops in OptimizeInstructions (#7225)
When the RMW operation can be proven not to change the accessed value, optimize it to a simple atomic get instead. This is valid because a write that does not change an in-memory value does not synchronize with any subsequent reads of that value: those reads can equally be considered to be reading from the previous write. Also optimize RMW operations on unshared structs to their non-atomic equivalents. This can increase code size, but it can also enable follow-on optimizations of the simpler operations and can be cheaper at runtime.
1 parent 42faa40 commit 8623f73
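To make the first optimization concrete: the modify-and-write parts can be dropped exactly when the RMW operand is an identity value for the operation, so the combined value provably equals the old value. A minimal standalone C++ sketch of those identities (illustrative only; these names are not Binaryen APIs):

#include <cassert>
#include <cstdint>

// Identity operands under which an RMW provably leaves memory unchanged,
// mirroring the switch in visitStructRMW in the diff below: 0 for
// add/sub/or/xor and all-ones for and. xchg always writes its operand,
// so it never qualifies.
int main() {
  uint32_t old = 0x12345678; // an arbitrary in-memory value
  assert(old + 0 == old);              // RMWAdd
  assert(old - 0 == old);              // RMWSub
  assert((old | 0) == old);            // RMWOr
  assert((old ^ 0) == old);            // RMWXor
  assert((old & ~uint32_t(0)) == old); // RMWAnd with -1 (all bits set)
}

In each of these cases the operation degenerates to an atomic get of the field, which is what the pass emits.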

File tree

3 files changed: +1459, -0 lines changed

scripts/test/fuzzing.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@
     'gc-atomics-null-refs.wast',
     'shared-structs.wast',
     'heap2local-rmw.wast',
+    'optimize-instructions-struct-rmw.wast',
     # contains too many segments to run in a wasm VM
     'limit-segments_disable-bulk-memory.wast',
     # https://github.com/WebAssembly/binaryen/issues/7176

src/passes/OptimizeInstructions.cpp

Lines changed: 188 additions & 0 deletions
@@ -1862,6 +1862,194 @@ struct OptimizeInstructions
     }
   }

+  void visitStructRMW(StructRMW* curr) {
+    skipNonNullCast(curr->ref, curr);
+    if (trapOnNull(curr, curr->ref)) {
+      return;
+    }
+
+    if (!curr->ref->type.isStruct()) {
+      return;
+    }
+
+    Builder builder(*getModule());
+
+    // Even when the RMW access is to shared memory, we can optimize out the
+    // modify and write parts if we know that the modified value is the same as
+    // the original value. This is valid because reads from writes that don't
+    // change the in-memory value can be considered to be reads from the
+    // previous write to the same location instead. That means there is no read
+    // that necessarily synchronizes with the write.
+    auto* value =
+      Properties::getFallthrough(curr->value, getPassOptions(), *getModule());
+    if (Properties::isSingleConstantExpression(value)) {
+      auto val = Properties::getLiteral(value);
+      bool canOptimize = false;
+      switch (curr->op) {
+        case RMWAdd:
+        case RMWSub:
+        case RMWOr:
+        case RMWXor:
+          canOptimize = val.getInteger() == 0;
+          break;
+        case RMWAnd:
+          canOptimize = val == Literal::makeNegOne(val.type);
+          break;
+        case RMWXchg:
+          canOptimize = false;
+          break;
+      }
+      if (canOptimize) {
+        replaceCurrent(builder.makeStructGet(
+          curr->index,
+          getResultOfFirst(curr->ref, builder.makeDrop(curr->value)),
+          curr->order,
+          curr->type));
+        return;
+      }
+    }
+
+    if (curr->ref->type.getHeapType().isShared()) {
+      return;
+    }
+
+    // Lower the RMW to its more basic operations. Breaking the atomic
+    // operation into several non-atomic operations is safe because no other
+    // thread can observe an intermediate state in the unshared memory. This
+    // initially increases code size, but the more basic operations may be
+    // more optimizable than the original RMW.
+    // TODO: Experiment to determine whether this is worthwhile on real code.
+    // Maybe we should do this optimization only when optimizing for speed over
+    // size.
+    auto ref = builder.addVar(getFunction(), curr->ref->type);
+    auto val = builder.addVar(getFunction(), curr->type);
+    auto result = builder.addVar(getFunction(), curr->type);
+    auto* block = builder.makeBlock(
+      {builder.makeLocalSet(ref, curr->ref),
+       builder.makeLocalSet(val, curr->value),
+       builder.makeLocalSet(
+         result,
+         builder.makeStructGet(curr->index,
+                               builder.makeLocalGet(ref, curr->ref->type),
+                               MemoryOrder::Unordered,
+                               curr->type))});
+    Expression* newVal = nullptr;
+    if (curr->op == RMWXchg) {
+      newVal = builder.makeLocalGet(val, curr->type);
+    } else {
+      Abstract::Op binop = Abstract::Add;
+      switch (curr->op) {
+        case RMWAdd:
+          binop = Abstract::Add;
+          break;
+        case RMWSub:
+          binop = Abstract::Sub;
+          break;
+        case RMWAnd:
+          binop = Abstract::And;
+          break;
+        case RMWOr:
+          binop = Abstract::Or;
+          break;
+        case RMWXor:
+          binop = Abstract::Xor;
+          break;
+        case RMWXchg:
+          WASM_UNREACHABLE("unexpected op");
+      }
+      newVal = builder.makeBinary(Abstract::getBinary(curr->type, binop),
+                                  builder.makeLocalGet(result, curr->type),
+                                  builder.makeLocalGet(val, curr->type));
+    }
+    block->list.push_back(
+      builder.makeStructSet(curr->index,
+                            builder.makeLocalGet(ref, curr->ref->type),
+                            newVal,
+                            MemoryOrder::Unordered));
+
+    // We must maintain this operation's effect on the global order of seqcst
+    // operations.
+    if (curr->order == MemoryOrder::SeqCst) {
+      block->list.push_back(builder.makeAtomicFence());
+    }
+
+    block->list.push_back(builder.makeLocalGet(result, curr->type));
+    block->type = curr->type;
+    replaceCurrent(block);
+  }
+
+  void visitStructCmpxchg(StructCmpxchg* curr) {
+    skipNonNullCast(curr->ref, curr);
+    if (trapOnNull(curr, curr->ref)) {
+      return;
+    }
+
+    if (!curr->ref->type.isStruct()) {
+      return;
+    }
+
+    Builder builder(*getModule());
+
+    // Just like other RMW operations, cmpxchg can be optimized to just a read
+    // if it is known not to change the in-memory value. This is the case when
+    // `expected` and `replacement` are known to be the same.
+    if (areConsecutiveInputsEqual(curr->expected, curr->replacement)) {
+      auto* ref = getResultOfFirst(
+        curr->ref,
+        builder.makeSequence(builder.makeDrop(curr->expected),
+                             builder.makeDrop(curr->replacement)));
+      replaceCurrent(
+        builder.makeStructGet(curr->index, ref, curr->order, curr->type));
+      return;
+    }
+
+    if (curr->ref->type.getHeapType().isShared()) {
+      return;
+    }
+
+    // Just like other RMW operations, lower to basic operations when operating
+    // on unshared memory.
+    auto ref = builder.addVar(getFunction(), curr->ref->type);
+    auto expected = builder.addVar(getFunction(), curr->type);
+    auto replacement = builder.addVar(getFunction(), curr->type);
+    auto result = builder.addVar(getFunction(), curr->type);
+    auto* block =
+      builder.makeBlock({builder.makeLocalSet(ref, curr->ref),
+                         builder.makeLocalSet(expected, curr->expected),
+                         builder.makeLocalSet(replacement, curr->replacement)});
+    auto* lhs = builder.makeLocalTee(
+      result,
+      builder.makeStructGet(curr->index,
+                            builder.makeLocalGet(ref, curr->ref->type),
+                            MemoryOrder::Unordered,
+                            curr->type),
+      curr->type);
+    auto* rhs = builder.makeLocalGet(expected, curr->type);
+    Expression* pred = nullptr;
+    if (curr->type.isRef()) {
+      pred = builder.makeRefEq(lhs, rhs);
+    } else {
+      pred = builder.makeBinary(
+        Abstract::getBinary(curr->type, Abstract::Eq), lhs, rhs);
+    }
+    block->list.push_back(builder.makeIf(
+      pred,
+      builder.makeStructSet(curr->index,
+                            builder.makeLocalGet(ref, curr->ref->type),
+                            builder.makeLocalGet(replacement, curr->type),
+                            MemoryOrder::Unordered)));
+
+    // We must maintain this operation's effect on the global order of seqcst
+    // operations.
+    if (curr->order == MemoryOrder::SeqCst) {
+      block->list.push_back(builder.makeAtomicFence());
+    }
+
+    block->list.push_back(builder.makeLocalGet(result, curr->type));
+    block->type = curr->type;
+    replaceCurrent(block);
+  }
+
   void visitArrayNew(ArrayNew* curr) {
     // If a value is provided, we can optimize in some cases.
     if (curr->type == Type::unreachable || curr->isWithDefault()) {
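For the unshared lowering in visitStructRMW above, the replacement block computes the same result as the original RMW: read the field, write back the combined value, and yield the old value. A hedged sketch of that shape in plain C++ (non-atomic, with illustrative names; the real pass builds Binaryen IR, not C++):

#include <cstdint>

// Sketch of the lowered non-atomic RMW for, e.g., RMWAdd on an i32 field.
int32_t loweredRmwAdd(int32_t* field, int32_t operand) {
  int32_t result = *field;   // struct.get (unordered)
  *field = result + operand; // struct.set (unordered) of the combined value
  // For a seqcst RMW the pass additionally appends an atomic fence here to
  // preserve the operation's effect on the global order of seqcst operations.
  return result;             // the RMW yields the pre-modification value
}

This is safe only because the struct is unshared: no other thread can observe the intermediate state between the get and the set.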

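visitStructCmpxchg lowers analogously: read the field, compare against `expected`, and store `replacement` only on a match. Again a hedged plain-C++ sketch with illustrative names:

#include <cstdint>

// Sketch of the lowered non-atomic cmpxchg on an i32 field. (When `expected`
// and `replacement` are provably equal, the earlier branch instead reduces
// the whole operation to a bare get, since the write cannot change memory.)
int32_t loweredCmpxchg(int32_t* field, int32_t expected, int32_t replacement) {
  int32_t result = *field;  // struct.get (unordered), tee'd into `result`
  if (result == expected) { // ref.eq for reference fields, integer eq otherwise
    *field = replacement;   // struct.set (unordered) only on a match
  }
  // As with the other RMW ops, a trailing atomic fence is appended for
  // seqcst orderings.
  return result;            // cmpxchg yields the old value
}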