llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Lines changed: 11 additions & 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
Lines changed: 20 additions & 3 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
Lines changed: 16460 additions & 28699 deletions
@@ -1088,7 +1088,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
       assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
              "We do not expect to see 16-bit copies from VGPR to SGPR unless "
              "we have 16-bit VGPRs");
-      assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+      assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_32RegClass ||
              MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
              MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
       // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
 
@@ -7245,7 +7245,8 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
   MachineBasicBlock *MBB = MI.getParent();
   // Legalize operands and check for size mismatch
   if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
-      OpIdx >= get(Opcode).getNumOperands())
+      OpIdx >= get(Opcode).getNumOperands() ||
+      get(Opcode).operands()[OpIdx].RegClass == -1)
     return;
 
   MachineOperand &Op = MI.getOperand(OpIdx);
@@ -7803,15 +7804,22 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       // that copies will end up as machine instructions and not be
       // eliminated.
       addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
-      MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
-      MRI.clearKillFlags(Inst.getOperand(1).getReg());
+      Register NewDstReg = Inst.getOperand(1).getReg();
+      MRI.replaceRegWith(DstReg, NewDstReg);
+      MRI.clearKillFlags(NewDstReg);
       Inst.getOperand(0).setReg(DstReg);
       // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
       // these are deleted later, but at -O0 it would leave a suspicious
       // looking illegal copy of an undef register.
       for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
         Inst.removeOperand(I);
       Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+      // Legalize t16 operand since replaceReg is called after addUsersToVALU
+      for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+                                             E = MRI.use_end();
+           I != E; ++I) {
+        legalizeOperandsVALUt16(*I->getParent(), MRI);
+      }
       return;
     }
 
 
@@ -3553,9 +3553,7 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
 
 const TargetRegisterClass *
 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
-  if (BitWidth == 16)
-    return &AMDGPU::SGPR_LO16RegClass;
-  if (BitWidth == 32)
+  if (BitWidth == 16 || BitWidth == 32)
     return &AMDGPU::SReg_32RegClass;
   if (BitWidth == 64)
     return &AMDGPU::SReg_64RegClass;
 
@@ -1470,17 +1470,34 @@ def : GCNPat<
 >;
 
 def : GCNPat<
-  (i64 (anyext i16:$src)),
+  (i64 (UniformUnaryFrag<anyext> i16:$src)),
+  (REG_SEQUENCE VReg_64,
+     (i32 (COPY $src)), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<anyext> i16:$src)),
   (REG_SEQUENCE VReg_64, $src, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
 >;
 
 def : GCNPat<
-  (i16 (trunc i32:$src)),
+  (i16 (UniformUnaryFrag<trunc> i32:$src)),
+  (COPY $src)
+>;
+
+def : GCNPat<
+  (i16 (DivergentUnaryFrag<trunc> i32:$src)),
   (EXTRACT_SUBREG $src, lo16)
 >;
 
 def : GCNPat <
-  (i16 (trunc i64:$src)),
+  (i16 (UniformUnaryFrag<trunc> i64:$src)),
+  (EXTRACT_SUBREG $src, sub0)
+>;
+
+def : GCNPat <
+  (i16 (DivergentUnaryFrag<trunc> i64:$src)),
   (EXTRACT_SUBREG $src, lo16)
 >;