AMDGPU: Add agpr variants of multi-data DS instructions

arsenm · arsenm · commit 76a69476c07c · 2025-09-02T16:53:56.000+09:00
The instruction definitions for loads and stores do not accurately model the operand constraints of loads and stores with AGPRs. They use AV register classes, plus a hack in getRegClass/getOpRegClass to avoid using AGPRs or AV classes with the multiple operand cases, but it did not consider the 3 operand case. Model this correctly by using separate all-VGPR and all-AGPR variants for the cases with multiple data operands. This does regress the assembler errors on gfx908 for the multi-operand cases. It now reports a generic operand-invalid-for-GPU error instead of the specific message that AGPR loads and stores aren't supported. In the future, AMDGPURewriteAGPRCopyMFMA should be taught to replace the VGPR forms with the AGPR ones. Most of the diff is fighting the DS pseudo structure. The mnemonic was being used as the key to SIMCInstr, which is a collision in the AGPR case. We also need to go out of our way to make sure we are using the gfx9+ variants of the pseudos without the m0 use. The DS multiclasses could use a lot of cleanup. Fixes #155777
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2598,6 +2598,17 @@ class getLdStRegisterOperand<RegisterClass RC> {
           !eq(RC.Size, 1024) : AVLdSt_1024);
 }
 
+class getEquivalentAGPRClass<RegisterClass RC> {
+  RegisterClass ret =
+    !cond(!eq(RC.Size, 32)   : AGPR_32,
+          !eq(RC.Size, 64)   : AReg_64,
+          !eq(RC.Size, 96)   : AReg_96,
+          !eq(RC.Size, 128)  : AReg_128,
+          !eq(RC.Size, 160)  : AReg_160,
+          !eq(RC.Size, 1024) : AReg_1024);
+}
+
+
 class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
                  ValueType Src1VT = i32, ValueType Src2VT = i32> {
   bit ret =    !if(!eq(DstVT.Size, 64),
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1395,3 +1395,35 @@ def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">;
 def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">;
 def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">;
 def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">;
+
+//===----------------------------------------------------------------------===//
+//  Tablegen programming utilities
+//===----------------------------------------------------------------------===//
+
+/// Helper function to extract the register class from an
+/// instruction's operand list, which may be a RegisterOperand or a
+/// direct RegisterClass reference.
+class getRegClassFromOp<DAGOperand Op> {
+  SIRegisterClass ret = !if(
+    !isa<RegisterOperand>(Op),
+    !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass),
+    !cast<SIRegisterClass>(Op));
+}
+
+/// Check if the operand will use an AV_* class.
+class OperandIsAV<DAGOperand Op> {
+  defvar reg_class = getRegClassFromOp<Op>.ret;
+  bit ret = !and(reg_class.HasAGPR, reg_class.HasVGPR);
+}
+
+/// Check if the operand will use an AGPR class.
+class OperandIsAGPR<DAGOperand Op> {
+  defvar reg_class = getRegClassFromOp<Op>.ret;
+  bit ret = !and(reg_class.HasAGPR, !not(reg_class.HasVGPR));
+}
+
+/// Check if the operand will use a VGPR class.
+class OperandIsVGPR<DAGOperand Op> {
+  defvar reg_class = getRegClassFromOp<Op>.ret;
+  bit ret = !and(reg_class.HasVGPR, !not(reg_class.HasAGPR));
+}
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomic-cmpxchg.ll
@@ -77,49 +77,112 @@ define void @ds_atomic_cmpxchg_i32_ret_av_av__a(ptr addrspace(3) %ptr) #0 {
   ret void
 }
 
-; FIXME: Broken
-; define void @ds_atomic_cmpxchg_i32_ret_a_a__a(ptr addrspace(3) %ptr) #0 {
-;   %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
-;   %data0 = call i32 asm "; def $0", "=a"()
-;   %data1 = call i32 asm "; def $0", "=a"()
-;   %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
-;   %result = extractvalue { i32, i1 } %pair, 0
-;   call void asm "; use $0", "a"(i32 %result)
-;   ret void
-; }
+define void @ds_atomic_cmpxchg_i32_ret_a_a__a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_a_a__a:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a1
+; CHECK-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+  %data0 = call i32 asm "; def $0", "=a"()
+  %data1 = call i32 asm "; def $0", "=a"()
+  %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  call void asm "; use $0", "a"(i32 %result)
+  ret void
+}
 
-; FIXME: Broken
-; define void @ds_atomic_cmpxchg_i32_ret_a_a__v(ptr addrspace(3) %ptr) #0 {
-;   %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
-;   %data0 = call i32 asm "; def $0", "=a"()
-;   %data1 = call i32 asm "; def $0", "=a"()
-;   %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
-;   %result = extractvalue { i32, i1 } %pair, 0
-;   call void asm "; use $0", "v"(i32 %result)
-;   ret void
-; }
+define void @ds_atomic_cmpxchg_i32_ret_a_a__v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_a_a__v:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a1
+; CHECK-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+  %data0 = call i32 asm "; def $0", "=a"()
+  %data1 = call i32 asm "; def $0", "=a"()
+  %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  call void asm "; use $0", "v"(i32 %result)
+  ret void
+}
 
-; FIXME: Broken
-; define void @ds_atomic_cmpxchg_i32_ret_v_a__v(ptr addrspace(3) %ptr) #0 {
-;   %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
-;   %data0 = call i32 asm "; def $0", "=v"()
-;   %data1 = call i32 asm "; def $0", "=a"()
-;   %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
-;   %result = extractvalue { i32, i1 } %pair, 0
-;   call void asm "; use $0", "v"(i32 %result)
-;   ret void
-; }
+define void @ds_atomic_cmpxchg_i32_ret_v_a__v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_v_a__v:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+  %data0 = call i32 asm "; def $0", "=v"()
+  %data1 = call i32 asm "; def $0", "=a"()
+  %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  call void asm "; use $0", "v"(i32 %result)
+  ret void
+}
 
-; FIXME: Broken
-; define void @ds_atomic_cmpxchg_i32_ret_a_v__v(ptr addrspace(3) %ptr) #0 {
-;   %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
-;   %data0 = call i32 asm "; def $0", "=a"()
-;   %data1 = call i32 asm "; def $0", "=v"()
-;   %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
-;   %result = extractvalue { i32, i1 } %pair, 0
-;   call void asm "; use $0", "v"(i32 %result)
-;   ret void
-; }
+define void @ds_atomic_cmpxchg_i32_ret_a_v__v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_a_v__v:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ds_cmpst_rtn_b32 v0, v0, v2, v1 offset:40
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+  %data0 = call i32 asm "; def $0", "=a"()
+  %data1 = call i32 asm "; def $0", "=v"()
+  %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  call void asm "; use $0", "v"(i32 %result)
+  ret void
+}
 
 define void @ds_atomic_cmpxchg_i32_ret_v_v__a(ptr addrspace(3) %ptr) #0 {
 ; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_v_v__a:
diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s
@@ -166,13 +166,13 @@ buffer_store_dwordx4 v[0:3], off, s[12:15], s4 offset:4095 glc tfe
 // GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 ds_write2_b64 v1, a[4:5], v[2:3] offset1:255
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: data and dst should be all VGPR or AGPR
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 ds_write2_b64 v1, v[4:5], a[2:3] offset1:255
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: data and dst should be all VGPR or AGPR
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 ds_wrxchg2st64_rtn_b32 v[6:7], v1, a2, a3 offset0:127
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: data and dst should be all VGPR or AGPR
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 image_load v[0:4], v2, s[0:7] dmask:0xf unorm tfe
 // GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s