@@ -730,14 +730,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
     }
   }
 
-  // Rework once the VS_16 register class is updated to include proper
-  // 16-bit SGPRs instead of 32-bit ones.
-  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
-    Old.setSubReg(AMDGPU::NoSubRegister);
+  Old.setSubReg(New->getSubReg());
   if (New->getReg().isPhysical()) {
     Old.substPhysReg(New->getReg(), *TRI);
   } else {
-    Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+    Old.substVirtReg(New->getReg(), 0, *TRI);
     Old.setIsUndef(New->isUndef());
   }
   return true;
@@ -1150,10 +1147,14 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseOp->isReg() && OpToFold.isReg()) {
     if (UseOp->isImplicit())
       return;
-    // Allow folding from SGPRs to 16-bit VGPRs.
+
+    MachineInstr *SourceInstruction = MRI->getVRegDef(UseOp->getReg());
+    // Allow folding from SGPRs to 16-bit VGPRs
+    // or folding of non-subregs through REG_SEQUENCES.
     if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
         (UseOp->getSubReg() != AMDGPU::lo16 ||
-         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+         !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+        !SourceInstruction->isRegSequence())
       return;
   }
 
@@ -1452,6 +1453,33 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
 
+  if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+    unsigned Opc = UseMI->getOpcode();
+    // Special case for DS_GWS instructions that only use 32 bits but hardware
+    // treats it as a 64 bit read.
+    if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+        Opc == AMDGPU::DS_GWS_BARRIER) {
+      const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+      assert(RC);
+
+      const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+                                 this](AMDGPU::OpName OpName) -> bool {
+        const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+        if (Op != UseOp)
+          return true;
+        Register Reg = OpToFold.getReg();
+        assert(!Reg.isPhysical());
+        return TRI->getRegSizeInBits(*RC) > 32 &&
+               !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+               TRI->isProperlyAlignedRC(*RC);
+      };
+
+      if (!isAlignedReg(AMDGPU::OpName::data0))
+        return;
+    }
+  }
+
   // FIXME: We could try to change the instruction from 64-bit to 32-bit
   // to enable more folding opportunities. The shrink operands pass
   // already does this.