Skip to content

Conversation

@gandhi56
Copy link
Contributor

@gandhi56 gandhi56 commented Nov 3, 2024

Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.

Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.

Change-Id: I8c3b7840ca5029b3c840359c44654ebec133fd2c
@llvmbot
Copy link
Member

llvmbot commented Nov 3, 2024

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-backend-amdgpu

Author: Anshil Gandhi (gandhi56)

Changes

Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.


Patch is 45.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114702.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+91)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+238-306)
  • (added) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll (+50)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..8767141c5d764c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -86,6 +86,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -338,6 +339,10 @@ class Vectorizer {
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  /// Attempts to prepare atomic read-modify-write instructions for
+  /// vectorization.
+  bool prepareAtomicInstOps(AtomicRMWInst *);
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -419,8 +424,94 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
   return Changed ? PA : PreservedAnalyses::all();
 }
 
+/**
+ * @brief Prepare operands of atomicrmw instructions for vectorization.
+ *
+ * Ensures the given AtomicRMWInst's pointer and value operands meet type
+ * requirements and are load instructions. Inserts necessary bitcast and
+ * inttoptr instructions.
+ *
+ * @param AI Pointer to the AtomicRMWInst in question.
+ * @return true if the operands were successfully prepared, false otherwise.
+ */
+bool Vectorizer::prepareAtomicInstOps(AtomicRMWInst *AI) {
+  if (AI->isVolatile() || AI->hasMetadata("amdgpu.no.fine.grained.memory"))
+    return false;
+
+  auto *Ptr = AI->getPointerOperand();
+  auto *PtrTy = Ptr->getType();
+  auto *Val = AI->getValOperand();
+  auto *ValTy = Val->getType();
+
+  if (!PtrTy->isPointerTy())
+    return false;
+
+  // Only cast if the value operand type is
+  // <2 x half>, <2 x i16>, <4 x i8>, f32, or i32
+  bool ValTyIsOkay = false;
+  if (auto *VTy = dyn_cast<FixedVectorType>(ValTy)) {
+    if (VTy->getNumElements() == 2) {
+      if (VTy->getElementType()->isHalfTy())
+        ValTyIsOkay = true;
+      if (VTy->getElementType()->isIntegerTy(16))
+        ValTyIsOkay = true;
+    } else if (VTy->getNumElements() == 4 &&
+               VTy->getElementType()->isIntegerTy(8)) {
+      ValTyIsOkay = true;
+    }
+  } else {
+    if (ValTy->isFloatTy())
+      ValTyIsOkay = true;
+    if (ValTy->isIntegerTy(32))
+      ValTyIsOkay = true;
+  }
+  if (!ValTyIsOkay)
+    return false;
+
+  // Walk up the chain of instructions to find the load instruction
+  auto GetLoadInst = [](Value *Ptr) -> LoadInst * {
+    while (Ptr) {
+      if (!isa<Instruction>(Ptr))
+        return nullptr;
+      if (auto *LI = dyn_cast<LoadInst>(Ptr))
+        return LI;
+      if (isa<GetElementPtrInst>(Ptr))
+        return nullptr;
+      if (auto *PtrInst = dyn_cast<Instruction>(Ptr))
+        Ptr = PtrInst->getOperand(0);
+      else
+        return nullptr;
+    }
+    return nullptr;
+  };
+
+  // Pointer and value operands must be load instructions to be vectorized
+  auto *ValLoadInst = GetLoadInst(Val);
+  auto *PtrLoadInst = GetLoadInst(Ptr);
+  if (!ValLoadInst || !PtrLoadInst)
+    return false;
+
+  // Insert bitcast to replace atomicrmw value operand
+  IRBuilder<> Builder(AI->getParent(), AI->getIterator());
+  ValLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+  AI->setOperand(1, Builder.CreateBitCast(ValLoadInst, ValTy,
+                                          ValLoadInst->getName() + ".bitcast"));
+
+  // Insert inttoptr to replace atomicrmw pointer operand
+  PtrLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+  AI->setOperand(0,
+                 Builder.CreateIntToPtr(PtrLoadInst, PtrTy,
+                                        PtrLoadInst->getName() + ".inttoptr"));
+  return true;
+}
+
 bool Vectorizer::run() {
   bool Changed = false;
+
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (auto *AI = dyn_cast<AtomicRMWInst>(&*I))
+      Changed |= prepareAtomicInstOps(AI);
+
   // Break up the BB if there are any instrs which aren't guaranteed to transfer
   // execution to their successor.
   //
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 495bfec5454ee8..4c6d0559593820 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -337,245 +337,222 @@ entry:
 define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
 ; GFX7LESS-LABEL: add_i32_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
-; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT:    s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT:    s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT:    s_mov_b32 s9, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
 ; GFX7LESS-NEXT:    s_mov_b32 s10, -1
-; GFX7LESS-NEXT:    s_mov_b32 s8, s6
-; GFX7LESS-NEXT:    s_mov_b32 s9, s7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  .LBB1_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: add_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT:    s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT:    s_mov_b64 s[0:1], exec
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT:    s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT:    s_mov_b32 s9, 0
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s0, s8, s0
-; GFX8-NEXT:    s_mov_b32 s15, 0xf000
-; GFX8-NEXT:    s_mov_b32 s14, -1
-; GFX8-NEXT:    s_mov_b32 s12, s6
-; GFX8-NEXT:    s_mov_b32 s13, s7
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  .LBB1_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT:    s_mov_b32 s9, 0
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s0, s8, s0
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB1_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT:    s_load_dword s8, s[2:3], 0x2c
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s2, s10, s2
-; GFX1064-NEXT:    s_mov_b32 s14, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b32 s12, s6
-; GFX1064-NEXT:    s_mov_b32 s13, s7
-; GFX1064-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:    s_mov_b32 s9, 0
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB1_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b32 null, 0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1]
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s4, v0, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
-; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    s_mov_b32 s5, 0
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB1_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 null, 0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_uniform:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT:    s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT:    s_load_b32 s8, s[2:3], 0x2c
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mul_i32 s3, s2, s3
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1164-NEXT:    s_mov_b32 s9, 0
+; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1164-NEXT:    s_mov_b32 s8, s6
-; GFX1164-NEXT:    s_mov_b32 s9, s7
 ; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl1_inv
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB1_2:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
-; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_uniform:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX1132-NEXT:    s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s8, s6
-; GFX1132-NEXT:    s_mov_b32 s9, s7
-; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1132-NEXT:    s_mov_b32 s5, 0
+; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    buffer_gl1_inv
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:  .LBB1_2:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
 ; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
@@ -583,72 +560,62 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1264-LABEL: add_i32_uniform:
 ; GFX1264:       ; %bb.0: ; %entry
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT:    s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1264-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr1
-; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1264-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT:    ...
[truncated]

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no reason for this pass to consider atomicrmw. They cannot be vectorized and should be left alone

@gandhi56
Copy link
Contributor Author

I will close this PR and create a new one.

@gandhi56 gandhi56 closed this Dec 29, 2024
@gandhi56 gandhi56 deleted the issue-97715/lsv-insert-bitcasts branch December 29, 2024 22:41
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants