diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66..e4d6e4b2f5459 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
 def int_amdgcn_s_wqm :
   DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
 
-class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
     [data_ty],
     [
       LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
@@ -2119,8 +2119,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceGenerator<list<string> Operations>{
+  foreach Opcode = Operations in
+    def Opcode : AMDGPUWaveReduce;
+}
+
+defvar Operations = ["umin", "min", "fmin", "umax", "max", "fmax", "add", "fadd", "sub", "fsub", "and", "or", "xor"];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
 
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f408a013d7a37..76c1feb0d5fe0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -751,53 +751,52 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
-  if (ValDivergent) {
-    if (ScanImpl == ScanOptions::DPP) {
-      // First we need to set all inactive invocations to the identity value, so
-      // that they can correctly contribute to the final result.
-      NewV =
-          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-      if (!NeedResult && ST->hasPermLaneX16()) {
-        // On GFX10 the permlanex16 instruction helps us build a reduction
-        // without too many readlanes and writelanes, which are generally bad
-        // for performance.
-        NewV = buildReduction(B, ScanOp, NewV, Identity);
-      } else {
-        NewV = buildScan(B, ScanOp, NewV, Identity);
-        if (NeedResult)
-          ExclScan = buildShiftRight(B, NewV, Identity);
-        // Read the value from the last lane, which has accumulated the values
-        // of each active lane in the wavefront. This will be our new value
-        // which we will provide to the atomic operation.
-        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
-                                 {NewV, LastLaneIdx});
-      }
-      // Finally mark the readlanes in the WWM section.
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
-    } else if (ScanImpl == ScanOptions::Iterative) {
-      // Alternative implementation for scan
-      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
-      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
-      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
-                                                      ComputeLoop, ComputeEnd);
-    } else {
-      llvm_unreachable("Atomic Optimzer is disabled for None strategy");
-    }
-  } else {
+  // if (ValDivergent) {
+  //   if (ScanImpl == ScanOptions::DPP) {
+  //     // First we need to set all inactive invocations to the identity value, so
+  //     // that they can correctly contribute to the final result.
+ // NewV = + // B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + // if (!NeedResult && ST->hasPermLaneX16()) { + // // On GFX10 the permlanex16 instruction helps us build a reduction + // // without too many readlanes and writelanes, which are generally bad + // // for performance. + // NewV = buildReduction(B, ScanOp, NewV, Identity); + // } else { + // NewV = buildScan(B, ScanOp, NewV, Identity); + // if (NeedResult) + // ExclScan = buildShiftRight(B, NewV, Identity); + // // Read the value from the last lane, which has accumulated the values + // // of each active lane in the wavefront. This will be our new value + // // which we will provide to the atomic operation. + // Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + // NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, + // {NewV, LastLaneIdx}); + // } + // // Finally mark the readlanes in the WWM section. + // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); + // } else if (ScanImpl == ScanOptions::Iterative) { + // // Alternative implementation for scan + // ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F); + // ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F); + // std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I, + // ComputeLoop, ComputeEnd); + // } else { + // llvm_unreachable("Atomic Optimzer is disabled for None strategy"); + // } + // } else { + // **************************************** Implement from here switch (Op) { + // TODO --implement for floats default: llvm_unreachable("Unhandled atomic op"); case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: { - // The new value we will be contributing to the atomic operation is the - // old value times the number of active lanes. - Value *const Ctpop = B.CreateIntCast( - B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); - NewV = buildMul(B, V, Ctpop); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_add, Int32Ty, {V, B.getInt32(0)}); + break; + case AtomicRMWInst::Sub: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_sub, Int32Ty, {V, B.getInt32(0)}); break; - } case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: { Value *const Ctpop = B.CreateIntCast( @@ -807,28 +806,39 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, break; } case AtomicRMWInst::And: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_and, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::Or: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_or, Int32Ty, {V, B.getInt32(0)}); + break; + case AtomicRMWInst::Xor: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_xor, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::Max: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_max, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::Min: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_min, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::UMax: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::UMin: + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)}); + break; case AtomicRMWInst::FMin: case AtomicRMWInst::FMax: // These operations with a uniform value are idempotent: doing the atomic // operation multiple times has the same effect as doing it once. 
- NewV = V; + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)}); break; - case AtomicRMWInst::Xor: - // The new value we will be contributing to the atomic operation is the - // old value times the parity of the number of active lanes. - Value *const Ctpop = B.CreateIntCast( - B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); - NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1)); - break; } - } + + // **************************************** Implement to here + + // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)}); // We only want a single lane to enter our new control flow, and we do this // by checking if there are any active lanes below us. Only one lane will // have 0 active lanes below us, so that will be the only one to progress. @@ -854,39 +864,40 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // ComputeEnd block. We also need to set up predecessor to next block when // single lane done updating the final reduced value. BasicBlock *Predecessor = nullptr; - if (ValDivergent && ScanImpl == ScanOptions::Iterative) { - // Move terminator from I's block to ComputeEnd block. - // - // OriginalBB is known to have a branch as terminator because - // SplitBlockAndInsertIfThen will have inserted one. - BranchInst *Terminator = cast(OriginalBB->getTerminator()); - B.SetInsertPoint(ComputeEnd); - Terminator->removeFromParent(); - B.Insert(Terminator); - - // Branch to ComputeLoop Block unconditionally from the I's block for - // iterative approach. - B.SetInsertPoint(OriginalBB); - B.CreateBr(ComputeLoop); - - // Update the dominator tree for new control flow. - SmallVector DomTreeUpdates( - {{DominatorTree::Insert, OriginalBB, ComputeLoop}, - {DominatorTree::Insert, ComputeLoop, ComputeEnd}}); - - // We're moving the terminator from EntryBB to ComputeEnd, make sure we move - // the DT edges as well. - for (auto *Succ : Terminator->successors()) { - DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ}); - DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ}); - } - - DTU.applyUpdates(DomTreeUpdates); - - Predecessor = ComputeEnd; - } else { - Predecessor = OriginalBB; - } + // if (ValDivergent && ScanImpl == ScanOptions::Iterative) { + // // Move terminator from I's block to ComputeEnd block. + // // + // // OriginalBB is known to have a branch as terminator because + // // SplitBlockAndInsertIfThen will have inserted one. + // BranchInst *Terminator = cast(OriginalBB->getTerminator()); + // B.SetInsertPoint(ComputeEnd); + // Terminator->removeFromParent(); + // B.Insert(Terminator); + + // // Branch to ComputeLoop Block unconditionally from the I's block for + // // iterative approach. + // B.SetInsertPoint(OriginalBB); + // B.CreateBr(ComputeLoop); + + // // Update the dominator tree for new control flow. + // SmallVector DomTreeUpdates( + // {{DominatorTree::Insert, OriginalBB, ComputeLoop}, + // {DominatorTree::Insert, ComputeLoop, ComputeEnd}}); + + // // We're moving the terminator from EntryBB to ComputeEnd, make sure we move + // // the DT edges as well. + // for (auto *Succ : Terminator->successors()) { + // DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ}); + // DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ}); + // } + + // DTU.applyUpdates(DomTreeUpdates); + + // Predecessor = ComputeEnd; + // } else { + // Predecessor = OriginalBB; + // } + Predecessor = OriginalBB; // Move the IR builder into single_lane next. 
B.SetInsertPoint(SingleLaneTerminator); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bc771d4ef6c08..24c6dc0afbce5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4846,8 +4846,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } - case Intrinsic::amdgcn_wave_reduce_umin: - case Intrinsic::amdgcn_wave_reduce_umax: { + case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: + case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: + case Intrinsic::amdgcn_wave_reduce_min: + case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_fmin: + case Intrinsic::amdgcn_wave_reduce_max: + case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_fmax: + case Intrinsic::amdgcn_wave_reduce_and: + case Intrinsic::amdgcn_wave_reduce_or: + case Intrinsic::amdgcn_wave_reduce_xor: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 885ecab891b1f..4ffcee15225cd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4861,10 +4861,141 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); MachineBasicBlock *RetBB = nullptr; if (isSGPR) { - // These operations with a uniform value i.e. SGPR are idempotent. - // Reduced value will be same as given sgpr. - BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); - RetBB = &BB; + switch(Opc){ + case AMDGPU::S_MIN_U32: + case AMDGPU::S_MIN_I32: + case AMDGPU::S_MAX_U32: + case AMDGPU::S_MAX_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32:{ + // These operations with a uniform value i.e. SGPR are idempotent. + // Reduced value will be same as given sgpr. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); + RetBB = &BB; + break; + } + case AMDGPU::S_XOR_B32: + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32:{ + // MachineBasicBlock::iterator I = BB.end(); + // Register SrcReg = MI.getOperand(1).getReg(); + + // // Create Control flow for loop + // // Split MI's Machine Basic block into For loop + // auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); + + // // Create virtual registers required for lowering. + const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); + const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); + Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); + // Register InitalValReg = MRI.createVirtualRegister(DstRegClass); + + // Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); + // Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + // Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + + // Register FF1Reg = MRI.createVirtualRegister(DstRegClass); + Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass); + + bool IsWave32 = ST.isWave32(); + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned ExecReg = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      // Create initial values of induction variable from Exec, Accumulator and
+      // insert branch instr to newly created ComputeBlock
+      // uint32_t InitalValue = 0;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
+              .addReg(Exec->getOperand(0).getReg());
+
+      // BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+      //     .addImm(InitalValue);
+      // BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+      // // Start constructing ComputeLoop
+      // I = ComputeLoop->end();
+      // auto Accumulator =
+      //     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+      //         .addReg(InitalValReg)
+      //         .addMBB(&BB);
+      // auto ActiveBits =
+      //     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+      //         .addReg(TmpSReg->getOperand(0).getReg())
+      //         .addMBB(&BB);
+
+      // // Perform the computations
+      // unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+      // auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+      //                .addReg(ActiveBits->getOperand(0).getReg());
+
+      // // Manipulate the iterator to get the next active lane
+      // unsigned BITSETOpc =
+      //     IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+      // auto NewActiveBits =
+      //     BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+      //         .addReg(FF1->getOperand(0).getReg())
+      //         .addReg(ActiveBits->getOperand(0).getReg());
+
+      // // Add phi nodes
+      // Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+      //     .addMBB(ComputeLoop);
+      // ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+      //     .addMBB(ComputeLoop);
+
+      // // Creating branching
+      // unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+      // BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+      //     .addReg(NewActiveBits->getOperand(0).getReg())
+      //     .addImm(0);
+      // BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+      //     .addMBB(ComputeLoop);
+
+      // I = ComputeEnd->begin();
+      switch(Opc){
+        case AMDGPU::S_XOR_B32:{
+          // Performing an XOR operation on a uniform value
+          // depends on the number of active lanes. If there
+          // are an even number of active lanes, then the XOR
+          // will result in 0. And if there are an odd number
+          // of active lanes then the XOR will result in the
+          // same value as that in the SGPR. This comes from
+          // the fact that A^A = 0 and A^0 = A.
+
+          Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+          auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+              .addReg(NewAccumulator->getOperand(0).getReg())
+              .addImm(1);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityReg->getOperand(0).getReg());
+          break;
+        }
+        case AMDGPU::S_SUB_I32:{
+          // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
+          Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+          // Take the negation of the source operand.
+          auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(InvertedValReg->getOperand(0).getReg())
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          break;
+        }
+        case AMDGPU::S_ADD_I32:{
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(NewAccumulator->getOperand(0).getReg());
+          break;
+        }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4900,9 +5031,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // insert branch instr to newly created ComputeBlock
+    uint32_t InitalValue;
+    switch(Opc){
+      case AMDGPU::S_MIN_U32:
+        InitalValue = std::numeric_limits<uint32_t>::max();
+        break;
+      case AMDGPU::S_MIN_I32:
+        InitalValue = std::numeric_limits<int32_t>::max();
+        break;
+      case AMDGPU::S_MAX_U32:
+        InitalValue = 0;
+        break;
+      case AMDGPU::S_MAX_I32:
+        InitalValue = std::numeric_limits<int32_t>::min();
+        break;
+      case AMDGPU::S_ADD_I32:
+      case AMDGPU::S_SUB_I32:
+      case AMDGPU::S_OR_B32:
+      case AMDGPU::S_XOR_B32:
+        InitalValue = 0x00000000;
+        break;
+      case AMDGPU::S_AND_B32:
+        InitalValue = 0xFFFFFFFF;
+    }
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
         .addImm(InitalValue);
@@ -4968,10 +5120,44 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
   switch (MI.getOpcode()) {
-  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
-  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_F32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_F32);
+  // case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_F32);
+  // case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_F32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
+  //   return lowerWaveReduce(MI, *BB, 
*getSubtarget(), AMDGPU::S_AND_B32); + // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32: + // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); + case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32: + // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32: + // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32); + case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); @@ -6859,7 +7045,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64 + // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 // TODO --> `..are zero-extended to 32-bits, then we ..` , should this be zero-extended from 32 bits? // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9afb29d95abd7..c5883ff783903 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -255,15 +255,100 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { - def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> { } - def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_fmin f32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> { } + + def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_MAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_fmax f32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_ADD_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_add f32:$src, 
i32:$strategy))]> { + } + + //def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> { + } + + //def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_SUB_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set f32:$sdst, (int_amdgcn_wave_reduce_fsub f32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> { + } + + //def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> { + //} + + //def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> { + } + + //def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> { + //} + + //def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst), + // (ins VSrc_b32: $src, VSrc_b32:$strategy), + // [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> { + //} + + def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> { + } } let usesCustomInserter = 1, Defs = [VCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index d1e50bd560cb2..02942254cc555 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -156,7 +156,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) # ; IR-DPP: 14: ; IR-DPP-NEXT: ret void ; - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst ret void }
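
Note on the uniform (SGPR) lowering above: when the input is the same in every active lane, the reduction has a closed form, which is what the SGPR path of lowerWaveReduce emits with S_BCNT1 and S_MUL_I32. The sketch below is illustrative only and is not part of the patch; `WaveOp` and `reduceUniform` are made-up names used to model those identities in plain C++ and cross-check them against a per-lane accumulation (the shape of the iterative lowering).

// Standalone model of the uniform-input identities, assuming a value V that is
// identical in all N active lanes:
//   min/max/and/or -> V (idempotent), add -> V*N, sub -> -(V*N), xor -> V*(N&1).
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class WaveOp { Min, Max, And, Or, Add, Sub, Xor };

// Hypothetical helper, named here only for illustration.
int32_t reduceUniform(WaveOp Op, int32_t V, uint32_t NumActiveLanes) {
  switch (Op) {
  case WaveOp::Min:
  case WaveOp::Max:
  case WaveOp::And:
  case WaveOp::Or:
    return V;                                             // idempotent ops
  case WaveOp::Add:
    return V * static_cast<int32_t>(NumActiveLanes);      // value times lane count
  case WaveOp::Sub:
    return -V * static_cast<int32_t>(NumActiveLanes);     // negated value times lane count
  case WaveOp::Xor:
    return V * static_cast<int32_t>(NumActiveLanes & 1);  // parity of lane count
  }
  return V;
}

int main() {
  // Cross-check against a per-lane accumulation, i.e. what an iterative
  // (divergent-input) lowering computes one active lane at a time.
  const int32_t V = 7;
  for (uint32_t N = 1; N <= 64; ++N) {
    int32_t Add = 0, Sub = 0, Xor = 0;
    for (uint32_t Lane = 0; Lane < N; ++Lane) {
      Add += V;
      Sub -= V;
      Xor ^= V;
    }
    assert(reduceUniform(WaveOp::Add, V, N) == Add);
    assert(reduceUniform(WaveOp::Sub, V, N) == Sub);
    assert(reduceUniform(WaveOp::Xor, V, N) == Xor);
    assert(reduceUniform(WaveOp::And, V, N) == V);
  }
  std::puts("uniform wave-reduce identities hold for 1..64 active lanes");
  return 0;
}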