llvm/lib/Target/AMDGPU/GCNRegPressure.cpp (28 changes: 24 additions & 4 deletions)
@@ -44,16 +44,19 @@ void GCNRegPressure::inc(unsigned Reg,
                          LaneBitmask PrevMask,
                          LaneBitmask NewMask,
                          const MachineRegisterInfo &MRI) {
-  if (SIRegisterInfo::getNumCoveredRegs(NewMask) ==
-      SIRegisterInfo::getNumCoveredRegs(PrevMask))
+  unsigned NewNumCoveredRegs = SIRegisterInfo::getNumCoveredRegs(NewMask);
+  unsigned PrevNumCoveredRegs = SIRegisterInfo::getNumCoveredRegs(PrevMask);
+  if (NewNumCoveredRegs == PrevNumCoveredRegs)
     return;
 
   int Sign = 1;
   if (NewMask < PrevMask) {
     std::swap(NewMask, PrevMask);
+    std::swap(NewNumCoveredRegs, PrevNumCoveredRegs);
     Sign = -1;
   }
-  assert(PrevMask < NewMask && "prev mask should always be lesser than new");
+  assert(PrevMask < NewMask && PrevNumCoveredRegs < NewNumCoveredRegs &&
+         "prev mask should always be lesser than new");
 
   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
   const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
@@ -66,7 +69,24 @@ void GCNRegPressure::inc(unsigned Reg,
       Value[TupleIdx] += Sign * TRI->getRegClassWeight(RC).RegWeight;
     }
     // Pressure scales with number of new registers covered by the new mask.
-    Sign *= SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
+    // Note that when true16 is enabled, we can no longer safely use the
+    // following approach to calculate the difference in the number of 32-bit
+    // registers between two masks:
+    //
+    //   Sign *= SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
+    //
+    // The issue is that the mask calculation `~PrevMask & NewMask` doesn't
+    // properly account for partial usage of a 32-bit register when dealing with
+    // 16-bit registers.
+    //
+    // Consider this example:
+    // Assume PrevMask = 0b0010 and NewMask = 0b1111. Here, the correct register
+    // usage difference should be 1, because even though PrevMask uses only half
+    // of a 32-bit register, it should still be counted as a full register use.
+    // However, the mask calculation yields `~PrevMask & NewMask = 0b1101`, and
+    // calling `getNumCoveredRegs` returns 2 instead of 1. This incorrect
+    // calculation can lead to integer overflow when Sign = -1.
+    Sign *= NewNumCoveredRegs - PrevNumCoveredRegs;
   }
   Value[RegKind] += Sign;
 }
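
To make the arithmetic in that comment concrete, here is a minimal standalone sketch of the old and new delta computations. It is not LLVM's implementation: the helpers numCoveredRegs and pressureDelta are hypothetical names, and the sketch assumes each 32-bit register owns an adjacent (lo16, hi16) pair of lane-mask bits, as with true16 VGPRs.

// sketch.cpp -- standalone illustration, C++20 (for std::popcount).
#include <bit>
#include <cassert>
#include <cstdint>
#include <utility>

// Hypothetical stand-in for SIRegisterInfo::getNumCoveredRegs: fold each
// odd (hi16) lane bit onto its even (lo16) neighbor, then count the even
// positions. A register with either half live counts as fully used.
static unsigned numCoveredRegs(uint64_t Mask) {
  return static_cast<unsigned>(
      std::popcount((Mask | (Mask >> 1)) & 0x5555555555555555ULL));
}

// Signed pressure delta when a register's live lanes go from Prev to New,
// mirroring the normalization in GCNRegPressure::inc.
static int pressureDelta(uint64_t Prev, uint64_t New) {
  unsigned PrevRegs = numCoveredRegs(Prev);
  unsigned NewRegs = numCoveredRegs(New);
  if (NewRegs == PrevRegs)
    return 0;
  int Sign = 1;
  if (New < Prev) { // Normalize so New is the larger mask.
    std::swap(New, Prev);
    std::swap(NewRegs, PrevRegs);
    Sign = -1;
  }
  return Sign * static_cast<int>(NewRegs - PrevRegs);
}

int main() {
  uint64_t Prev = 0b0010; // hi16 of reg0 live: one register in use.
  uint64_t New = 0b1111;  // both halves of reg0 and reg1 live: two registers.

  // Old formula: count registers covered by the newly set lanes only.
  // ~Prev & New == 0b1101 touches both registers, so it yields 2, not 1.
  assert(numCoveredRegs(~Prev & New) == 2);

  // Fixed formula: difference of the per-mask register counts.
  assert(pressureDelta(Prev, New) == 1);  // growing use: +1 register
  assert(pressureDelta(New, Prev) == -1); // shrinking use: -1 register
  return 0;
}

With the old formula and Sign = -1, the pressure counter would be decremented by 2 for a transition that only frees one register, which is presumably the overflow the comment refers to.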
New MIR test file:
@@ -0,0 +1,48 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -x mir -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -run-pass=machine-scheduler %s -o - | FileCheck %s
Review comment (Contributor), on the RUN line: Don't need -x mir
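Following that suggestion, the flag is presumably redundant, since llc already treats a file with a .mir extension as MIR input; a possible shortened RUN line (untested variant):

# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -run-pass=machine-scheduler %s -o - | FileCheck %s
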
---
name: foo
tracksRegLiveness: true
liveins:
  - { reg: '$sgpr4_sgpr5', virtual-reg: '%0' }
body: |
  bb.0.entry:
    liveins: $sgpr4_sgpr5

    ; CHECK-LABEL: name: foo
    ; CHECK: liveins: $sgpr4_sgpr5
    ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_128 = COPY [[S_MOV_B32_]]
    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]]
    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]]
    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]]
    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[COPY1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8)
    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
    ; CHECK-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64_e64 24, [[BUFFER_LOAD_DWORDX2_OFFSET]], implicit $exec
    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].lo16:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].lo16
    ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec
    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
    ; CHECK-NEXT: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 0, 8, 8, [[V_LSHLREV_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec
    ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_PK_LSHLREV_B16_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
    ; CHECK-NEXT: S_WAITCNT 0
    ; CHECK-NEXT: S_ENDPGM 0
    %0:sgpr_64(p4) = COPY killed $sgpr4_sgpr5
    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %0(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
    %2:sreg_32 = S_MOV_B32 0
    undef %3.sub0:sgpr_128 = COPY %2
    %3.sub1:sgpr_128 = COPY %2
    %3.sub2:sgpr_128 = COPY %2
    %3.sub3:sgpr_128 = COPY killed %2
    %4:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET killed %3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8)
    %5:vreg_64 = V_LSHRREV_B64_e64 24, killed %4, implicit $exec
    undef %6.lo16:vgpr_32 = COPY killed %5.lo16
    %7:vgpr_32 = V_LSHLREV_B32_e64 16, killed %6, implicit $exec
    %8:vgpr_32 = V_PK_LSHLREV_B16 0, 8, 8, killed %7, 0, 0, 0, 0, 0, implicit $exec
    %9:vreg_64 = COPY killed %1
    FLAT_STORE_DWORD killed %9, killed %8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
    S_WAITCNT 0
    S_ENDPGM 0
...