Skip to content

Commit 3579c49

Browse files
jrbyrnesarsenm
andcommitted
[AMDGPU] Allocate AVRegClass last (llvm#146606)
This changes the RC priorities such that AVRegClass is the least prioritized. These registers are less constrained than the VRegClass and ARegClass as they can be either agpr or vgpr. Thus, assigning them last removes unnecessary constraints from VRegClass and ARegClass assignments, and allows the RA to make smarter decisions about whether to use vgpr / agpr for AVRegClass. We only have 5 bits for RC priorities, and we still want to prioritize larger RCs over smaller ones. Since this new prioritization uses the 5th bit for AVRegClass vs ARegClass / VRegClass, we only have 4 bits to encode the size priorities. Previously, each RC with a distinct size had a distinct priority. However, this PR groups together multiple sizes to the same priority. Currently, this will have no effect on prioritization in practice because we only have one actually defined RC per group per vector register type. For example, a register class with 15 or 16 32-bit registers will have the same size priority (14). However, we only have VReg_512 (VReg_480 doesn't exist), so only one actual RC in VRegClass has this priority. Similarly, we give register classes with 17-32+ 32-bit registers a size priority of 15, but we only have VReg_1024. The effect of this PR is to prioritize first the vector register type (VReg & AReg have top priority, then AVReg), with the size of the register class having second priority. Passes PSDB. --------- Co-authored-by: Matt Arsenault <[email protected]> Change-Id: I43e72fb7939121226427bcdc786bae4c55cb6565
1 parent de56260 commit 3579c49

30 files changed

+2115
-3333
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
109109
let TSFlags{2} = HasVGPR;
110110
let TSFlags{3} = HasAGPR;
111111
let TSFlags{4} = HasSGPR;
112+
113+
// RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
114+
// to decide which registers to try to assign first. Usually, this RegisterClass priority is given
115+
// very high priority, if not the highest priority, when considering which VirtReg to allocate next.
116+
//
117+
// We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
118+
// assign more constrained RegisterClasses first. As a result, we prioritize register classes with
119+
// more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
120+
//
121+
// The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
122+
// In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
123+
// and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
124+
// RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
125+
// is used for scaling of the bit (i.e. 1 << 4).
126+
field int BaseClassPriority = 1;
127+
field int BaseClassScaleFactor = 16;
128+
112129
}
113130

114131
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -571,7 +588,7 @@ let HasVGPR = 1 in {
571588
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
572589
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
573590
(sequence "VGPR%u_HI16", 0, 255)))> {
574-
let AllocationPriority = 2;
591+
let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
575592
let Size = 16;
576593
let GeneratePressureSet = 0;
577594

@@ -597,7 +614,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
597614
// i16/f16 only on VI+
598615
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
599616
(add (sequence "VGPR%u", 0, 255))> {
600-
let AllocationPriority = 0;
617+
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
601618
let Size = 32;
602619
let Weight = 1;
603620
let BaseClassOrder = 32;
@@ -606,7 +623,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
606623
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
607624
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
608625
(add (sequence "VGPR%u", 0, 127))> {
609-
let AllocationPriority = 0;
626+
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
610627
let GeneratePressureSet = 0;
611628
let Size = 32;
612629
let Weight = 1;
@@ -664,7 +681,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
664681
// AccVGPR 32-bit registers
665682
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
666683
(add (sequence "AGPR%u", 0, 255))> {
667-
let AllocationPriority = 0;
684+
let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
668685
let Size = 32;
669686
let Weight = 1;
670687
let BaseClassOrder = 32;
@@ -936,14 +953,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
936953

937954
// Requires n v_mov_b32 to copy
938955
let CopyCost = numRegs;
939-
let AllocationPriority = !sub(numRegs, 1);
956+
957+
// Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the
958+
// 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
959+
// of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
960+
// registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one
961+
// RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
962+
// and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
963+
defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
964+
965+
let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
940966
let Weight = numRegs;
941967
}
942968

943969
// Define a register tuple class, along with one requiring an even
944970
// aligned base register.
945971
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
946-
let HasVGPR = 1 in {
972+
let HasVGPR = 1, BaseClassPriority = 1 in {
947973
// Define the regular class.
948974
def "" : VRegClassBase<numRegs, regTypes, regList> {
949975
let BaseClassOrder = !mul(numRegs, 32);
@@ -977,7 +1003,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
9771003
}
9781004

9791005
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
980-
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
1006+
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
9811007
// Define the regular class.
9821008
def "" : VRegClassBase<numRegs, regTypes, regList> {
9831009
let BaseClassOrder = !mul(numRegs, 32);
@@ -1062,6 +1088,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
10621088
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
10631089
let HasVGPR = 1;
10641090
let HasAGPR = 1;
1091+
let BaseClassPriority = 0;
10651092
let Size = 32;
10661093
}
10671094
} // End GeneratePressureSet = 0
@@ -1070,7 +1097,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
10701097
// aligned base register.
10711098
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
10721099
dag vregList, dag aregList> {
1073-
let HasVGPR = 1, HasAGPR = 1 in {
1100+
let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
10741101
// Define the regular class.
10751102
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
10761103

0 commit comments

Comments
 (0)