Sisyph · Sisyph · Nov 22, 2022
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -345,7 +345,7 @@ def int_amdgcn_rsq_clamp : Intrinsic<
 
 // For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will used.
 def int_amdgcn_ldexp : Intrinsic<
-  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
+  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty],
   [IntrNoMem, IntrSpeculatable, IntrWillReturn]
 >;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1167,8 +1167,8 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
   // TODO: Should be able to support other operand types like globals.
   const MachineOperand &MO = MI->getOperand(OpNo);
   if (MO.isReg()) {
-    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
-                                       *MF->getSubtarget().getRegisterInfo());
+    AMDGPUInstPrinter::printRegOperand(
+        MO.getReg(), O, *MF->getSubtarget().getRegisterInfo(), *getGlobalSTI());
     return false;
   } else if (MO.isImm()) {
     int64_t Val = MO.getImm();

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
 >;
 
 def VGPRRegBank : RegisterBank<"VGPR",
-  [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
+  [VGPR_LO16, VGPR_HI16, VGPR_16_Lo128, VGPR_16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
 >;
 
 // It is helpful to distinguish conditions from ordinary SGPRs.

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -281,6 +281,10 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16);
   }
 
+  bool isRegOrImmWithTInt16InputMods() const {
+    return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::i16);
+  }
+
   bool isRegOrImmWithInt32InputMods() const {
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32);
   }
@@ -289,6 +293,10 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i16);
   }
 
+  bool isRegOrInlineImmWithTInt16InputMods() const {
+    return isRegOrInline(AMDGPU::VS_16RegClassID, MVT::i16);
+  }
+
   bool isRegOrInlineImmWithInt32InputMods() const {
     return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i32);
   }
@@ -297,6 +305,10 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64);
   }
 
+  bool isRegOrImmWithTFP16InputMods() const {
+    return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::f16);
+  }
+
   bool isRegOrImmWithFP16InputMods() const {
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16);
   }
@@ -309,6 +321,10 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
   }
 
+  bool isRegOrInlineImmWithTFP16InputMods() const {
+    return isRegOrInline(AMDGPU::VS_16RegClassID, MVT::f16);
+  }
+
   bool isRegOrInlineImmWithFP16InputMods() const {
     return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16);
   }
@@ -343,6 +359,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
   }
 
   bool isVRegWithInputMods() const;
+  bool isT16_Lo128VRegWithInputMods() const;
   bool isT16VRegWithInputMods() const;
 
   bool isSDWAOperand(MVT type) const;
@@ -523,16 +540,20 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
   }
 
+  bool isVCSrcTB16() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::i16);
+  }
+
   bool isVCSrcTB16_Lo128() const {
-    return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::i16);
+    return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::i16);
   }
 
   bool isVCSrcB16() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
   }
 
   bool isVCSrcV2B16() const {
-    return isVCSrcB16();
+    return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
   }
 
   bool isVCSrcF32() const {
@@ -543,8 +564,12 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
   }
 
+  bool isVCSrcTF16() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16);
+  }
+
   bool isVCSrcTF16_Lo128() const {
-    return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::f16);
+    return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16);
   }
 
   bool isVCSrcF16() const {
@@ -563,6 +588,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isVCSrcF64() || isLiteralImm(MVT::i64);
   }
 
+  bool isVSrcTB16() const { return isVCSrcTB16() || isLiteralImm(MVT::i16); }
+
   bool isVSrcTB16_Lo128() const {
     return isVCSrcTB16_Lo128() || isLiteralImm(MVT::i16);
   }
@@ -599,6 +626,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isVCSrcF64() || isLiteralImm(MVT::f64);
   }
 
+  bool isVSrcTF16() const { return isVCSrcTF16() || isLiteralImm(MVT::f16); }
+
   bool isVSrcTF16_Lo128() const {
     return isVCSrcTF16_Lo128() || isLiteralImm(MVT::f16);
   }
@@ -1352,11 +1381,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
                            SmallVectorImpl<AsmToken> &Tokens);
   unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
                         unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens);
-  bool ParseRegRange(unsigned& Num, unsigned& Width);
-  unsigned getRegularReg(RegisterKind RegKind,
-                         unsigned RegNum,
-                         unsigned RegWidth,
-                         SMLoc Loc);
+  bool ParseRegRange(unsigned &Num, unsigned &RegWidth);
+  unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum,
+                         unsigned RegWidth, SMLoc Loc, bool IsHigh);
 
   bool isRegister();
   bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -2065,8 +2092,12 @@ bool AMDGPUOperand::isVRegWithInputMods() const {
           AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]);
 }
 
+bool AMDGPUOperand::isT16_Lo128VRegWithInputMods() const {
+  return isRegClass(AMDGPU::VGPR_16_Lo128RegClassID);
+}
+
 bool AMDGPUOperand::isT16VRegWithInputMods() const {
-  return isRegClass(AMDGPU::VGPR_32_Lo128RegClassID);
+  return isRegClass(AMDGPU::VGPR_16RegClassID);
 }
 
 bool AMDGPUOperand::isSDWAOperand(MVT type) const {
@@ -2371,6 +2402,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
   if (Is == IS_VGPR) {
     switch (RegWidth) {
       default: return -1;
+      case 16:
+        return AMDGPU::VGPR_16RegClassID;
       case 32:
         return AMDGPU::VGPR_32RegClassID;
       case 64:
@@ -2632,6 +2665,8 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token,
     StringRef RegName = Reg->Name;
     StringRef RegSuffix = Str.substr(RegName.size());
     if (!RegSuffix.empty()) {
+      RegSuffix.consume_back(".l");
+      RegSuffix.consume_back(".h");
       unsigned Num;
       // A single register with an index: rXX
       if (getRegNum(RegSuffix, Num))
@@ -2652,11 +2687,9 @@ AMDGPUAsmParser::isRegister()
   return isRegister(getToken(), peekToken());
 }
 
-unsigned
-AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
-                               unsigned RegNum,
-                               unsigned RegWidth,
-                               SMLoc Loc) {
+unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
+                                        unsigned RegWidth, SMLoc Loc,
+                                        bool IsHigh) {
 
   assert(isRegularReg(RegKind));
 
@@ -2681,6 +2714,11 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
 
   const MCRegisterInfo *TRI = getContext().getRegisterInfo();
   const MCRegisterClass RC = TRI->getRegClass(RCID);
+  if (RCID == VGPR_16RegClassID || RCID == VGPR_16_Lo128RegClassID)
+    // RegIdx depends on register class definition in SIRegisterInfo.td
+    // Expected register order is alternating lo and hi :
+    // VGPR0_LO16, VGPR0_HI16, VGPR1_LO16, VGPR1_HI16, ...
+    RegIdx = RegIdx * 2 + IsHigh;
   if (RegIdx >= RC.getNumRegs()) {
     Error(Loc, "register index is out of range");
     return AMDGPU::NoRegister;
@@ -2764,20 +2802,31 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
 
   RegKind = RI->Kind;
   StringRef RegSuffix = RegName.substr(RI->Name.size());
+  bool IsHigh = false;
   if (!RegSuffix.empty()) {
+    RegWidth = 32;
+    // We don't know the opcode till we are done parsing, so we don't know if
+    // registers should be 16 or 32 bit. It is therefore mandatory to put .l or
+    // .h to correctly specify 16 bit registers. We also can't determine class
+    // VGPR_16_Lo128 or VGPR_16, so always parse them as VGPR_16.
+    if (RegSuffix.consume_back(".h")) {
+      RegWidth = 16;
+      IsHigh = 1;
+    } else if (RegSuffix.consume_back(".l")) {
+      RegWidth = 16;
+    }
     // Single 32-bit register: vXX.
     if (!getRegNum(RegSuffix, RegNum)) {
       Error(Loc, "invalid register index");
       return AMDGPU::NoRegister;
     }
-    RegWidth = 32;
   } else {
     // Range of registers: v[XX:YY]. ":YY" is optional.
     if (!ParseRegRange(RegNum, RegWidth))
       return AMDGPU::NoRegister;
   }
 
-  return getRegularReg(RegKind, RegNum, RegWidth, Loc);
+  return getRegularReg(RegKind, RegNum, RegWidth, Loc, IsHigh);
 }
 
 unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
@@ -2829,7 +2878,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
   }
 
   if (isRegularReg(RegKind))
-    Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc);
+    Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc, false /*Low*/);
 
   return Reg;
 }

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -493,8 +493,8 @@ class MUBUF_Load_Pseudo <string opName,
                          list<dag> pattern=[],
                          // Workaround bug bz30254
                          int addrKindCopy = addrKind,
-                         RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret,
-                         RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret>
+                         RegisterOperand vdata_rc = getVregSrcForVT<vdata_vt>.ret,
+                         RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc.RegClass, isTFE>.ret>
   : MUBUF_Pseudo<opName,
                  !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
                  !con(getMUBUFIns<addrKindCopy, [], isTFE>.ret,
@@ -594,7 +594,7 @@ class MUBUF_Store_Pseudo <string opName,
                           int addrKindCopy = addrKind>
   : MUBUF_Pseudo<opName,
                  (outs),
-                 getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE>.ret,
+                 getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret.RegClass], isTFE>.ret,
                  " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" #
                    !if(isTFE, " tfe", "") # "$swz",
                  pattern>,