Skip to content

Commit 6118a25

Browse files
jrbyrnes and arsenm
authored
[AMDGPU] Allocate AVRegClass last (#146606)
This changes the RC priorities such that AVRegClass is the least prioritized. These registers are less constrained than the VRegClass and ARegClass as they can be either agpr or vgpr. Thus, assigning them last removes unnecessary constraints from VRegClass and ARegClass assignments, and allows the RA to make smarter decisions about whether to use vgpr / agpr for AVRegClass. We only have 5 bits for RC priorities, and we still want to prioritize larger RCs over smaller ones. Since this new prioritization uses the 5th bit for AVRegClass vs ARegClass / VRegClass, we only have 4 bits to encode the size priorities. Previously, each RC with a distinct size had a distinct priority. However, this PR groups multiple sizes under the same priority. Currently, this will have no effect on prioritization in practice because we only have one actually defined RC per group per vector register type. For example, a register class with 15 or 16 32-bit registers will have the same size priority (14). However, we only have VReg_512 (VReg_480 doesn't exist), so only one actual RC in VRegClass has this priority. Similarly, we give register classes with 17 or more 32-bit registers a size priority of 15, but we only have VReg_1024. The effect of this PR is to prioritize first the vector register type (VReg & AReg have top priority, then AVReg), with the size of the register class having second priority. Passes PSDB. --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent 64704c6 commit 6118a25

33 files changed

+3012
-2698
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
109109
let TSFlags{2} = HasVGPR;
110110
let TSFlags{3} = HasAGPR;
111111
let TSFlags{4} = HasSGPR;
112+
113+
// RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
114+
// to decide which registers to try to assign first. Usually, this RegisterClass priority is given
115+
// very high priority, if not the highest priority, when considering which VirtReg to allocate next.
116+
//
117+
// We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
118+
// assign more constrained RegisterClasses first. As a result, we prioritize register classes with
119+
// more 32-bit registers per tuple (e.g. VReg_512) over register classes with fewer (e.g. VGPR_32).
120+
//
121+
// The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
122+
// In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
123+
// and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
124+
// RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
125+
// is used for scaling of the bit (i.e. 1 << 4).
126+
field int BaseClassPriority = 1;
127+
field int BaseClassScaleFactor = 16;
128+
112129
}
113130

114131
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
575592
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
576593
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
577594
(sequence "VGPR%u_HI16", 0, 255)))> {
578-
let AllocationPriority = 2;
595+
let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
579596
let Size = 16;
580597
let GeneratePressureSet = 0;
581598

@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
601618
// i16/f16 only on VI+
602619
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
603620
(add (sequence "VGPR%u", 0, 255))> {
604-
let AllocationPriority = 0;
621+
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
605622
let Size = 32;
606623
let Weight = 1;
607624
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
610627
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
611628
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
612629
(add (sequence "VGPR%u", 0, 127))> {
613-
let AllocationPriority = 0;
630+
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
614631
let GeneratePressureSet = 0;
615632
let Size = 32;
616633
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
668685
// AccVGPR 32-bit registers
669686
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
670687
(add (sequence "AGPR%u", 0, 255))> {
671-
let AllocationPriority = 0;
688+
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
672689
let Size = 32;
673690
let Weight = 1;
674691
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
940957

941958
// Requires n v_mov_b32 to copy
942959
let CopyCost = numRegs;
943-
let AllocationPriority = !sub(numRegs, 1);
960+
961+
// Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the
962+
// 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
963+
// of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
964+
// registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one
965+
// RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
966+
// and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
967+
defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
968+
969+
let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
944970
let Weight = numRegs;
945971
}
946972

947973
// Define a register tuple class, along with one requiring an even
948974
// aligned base register.
949975
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
950-
let HasVGPR = 1 in {
976+
let HasVGPR = 1, BaseClassPriority = 1 in {
951977
// Define the regular class.
952978
def "" : VRegClassBase<numRegs, regTypes, regList> {
953979
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
9811007
}
9821008

9831009
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
984-
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
1010+
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
9851011
// Define the regular class.
9861012
def "" : VRegClassBase<numRegs, regTypes, regList> {
9871013
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
10661092
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
10671093
let HasVGPR = 1;
10681094
let HasAGPR = 1;
1095+
let BaseClassPriority = 0;
10691096
let Size = 32;
10701097
}
10711098
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
10741101
// aligned base register.
10751102
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
10761103
dag vregList, dag aregList> {
1077-
let HasVGPR = 1, HasAGPR = 1 in {
1104+
let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
10781105
// Define the regular class.
10791106
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
10801107

0 commit comments

Comments
 (0)