
Commit 125af56

[AMDGPU][SDAG] Only fold flat offsets if they are inbounds PTRADDs (#165427)
For flat memory instructions where the address is supplied as a base address register with an immediate offset, the memory aperture test ignores the immediate offset. Currently, SDISel does not respect that, which leads to miscompilations where valid input programs crash when the address computation relies on the immediate offset to get the base address into the proper memory aperture. Global and scratch instructions are not affected.

This patch only selects flat instructions with immediate offsets from PTRADD address computations that carry the inbounds flag: if the PTRADD does not leave the bounds of the allocated object, it cannot leave the bounds of the memory aperture and is therefore safe to handle with an immediate offset.

Affected tests:

- CodeGen/AMDGPU/fold-gep-offset.ll: Offsets are no longer wrongly folded; new positive tests were added where we still do fold them.
- CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll: Offset folding doesn't seem integral to this test, so the test is not changed to make offset folding still happen.
- CodeGen/AMDGPU/loop-prefetch-data.ll: loop-reduce rewrites the inbounds addresses of the accesses so that they are based on the potentially out-of-bounds addresses used for prefetching.
- I think the remaining ones suffer from the limited preservation of the inbounds flag in the PTRADD DAGCombines, due to the provenance problems pointed out in PR #165424 and the fact that `AMDGPUTargetLowering::SplitVector{Load|Store}` legalizes too-wide accesses by repeatedly splitting them in half. Legalizing a V32S32 memory access therefore leads to inbounds ptradd chains like (ptradd inbounds (ptradd inbounds (ptradd inbounds P, 64), 32), 16). The DAGCombines fold these into a single ptradd, but the involved transformations generally cannot preserve the inbounds flag (even though it would be valid in this case).

Similar previous PR that relied on `ISD::ADD inbounds` instead of `ISD::PTRADD inbounds` (closed): #132353
Analogous PR for GISel (merged): #153001

Fixes SWDEV-516125.
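To illustrate the distinction the patch keys on, here is a hypothetical IR example (not one of the commit's test cases; the function and value names are made up). Assuming the address computation reaches instruction selection as an `ISD::PTRADD` with the inbounds flag preserved, only the first load may be selected as a flat instruction with an immediate offset; the second must add the offset to the base register first, because `%p + 16` could land in a different memory aperture than `%p`.

```llvm
; Hypothetical IR, for illustration only. Address space 0 is the flat address
; space on AMDGPU, so both loads select to FLAT instructions.
define i32 @offset_fold_ok(ptr %p) {
  ; inbounds: the +16 stays within %p's object, so it cannot change the memory
  ; aperture; folding it into the flat instruction's offset field is safe.
  %q = getelementptr inbounds i8, ptr %p, i64 16
  %v = load i32, ptr %q
  ret i32 %v
}

define i32 @offset_no_fold(ptr %p) {
  ; not inbounds: %p + 16 may fall into a different aperture than %p, so the
  ; offset has to be added in registers before the flat load.
  %q = getelementptr i8, ptr %p, i64 16
  %v = load i32, ptr %q
  ret i32 %v
}
```

Whether the fold actually happens additionally depends on the offset being legal for the target (`SIInstrInfo::isLegalFLATOffset`); the sketch only illustrates the aperture constraint this patch adds.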
1 parent 2f6a8a7 commit 125af56

8 files changed: +6252, -5357 lines


llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 76 additions & 65 deletions
@@ -1850,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
          isFlatScratchBaseLegal(Addr))) {
       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

-      const SIInstrInfo *TII = Subtarget->getInstrInfo();
-      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
-        Addr = N0;
-        OffsetVal = COffsetVal;
-      } else {
-        // If the offset doesn't fit, put the low bits into the offset field and
-        // add the rest.
-        //
-        // For a FLAT instruction the hardware decides whether to access
-        // global/scratch/shared memory based on the high bits of vaddr,
-        // ignoring the offset field, so we have to ensure that when we add
-        // remainder to vaddr it still points into the same underlying object.
-        // The easiest way to do that is to make sure that we split the offset
-        // into two pieces that are both >= 0 or both <= 0.
-
-        SDLoc DL(N);
-        uint64_t RemainderOffset;
-
-        std::tie(OffsetVal, RemainderOffset) =
-            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
-        SDValue AddOffsetLo =
-            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
-        if (Addr.getValueType().getSizeInBits() == 32) {
-          SmallVector<SDValue, 3> Opnds;
-          Opnds.push_back(N0);
-          Opnds.push_back(AddOffsetLo);
-          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
-          if (Subtarget->hasAddNoCarry()) {
-            AddOp = AMDGPU::V_ADD_U32_e64;
-            Opnds.push_back(Clamp);
-          }
-          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+      // Adding the offset to the base address in a FLAT instruction must not
+      // change the memory aperture in which the address falls. Therefore we can
+      // only fold offsets from inbounds GEPs into FLAT instructions.
+      bool IsInBounds =
+          Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
+      if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+        const SIInstrInfo *TII = Subtarget->getInstrInfo();
+        if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+          Addr = N0;
+          OffsetVal = COffsetVal;
         } else {
-          // TODO: Should this try to use a scalar add pseudo if the base address
-          // is uniform and saddr is usable?
-          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                                DL, MVT::i32, N0, Sub0);
-          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                                DL, MVT::i32, N0, Sub1);
-
-          SDValue AddOffsetHi =
-              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
-          SDNode *Add =
-              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-          SDNode *Addc = CurDAG->getMachineNode(
-              AMDGPU::V_ADDC_U32_e64, DL, VTs,
-              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-          SDValue RegSequenceArgs[] = {
-              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                                MVT::i64, RegSequenceArgs),
-                         0);
+          // If the offset doesn't fit, put the low bits into the offset field
+          // and add the rest.
+          //
+          // For a FLAT instruction the hardware decides whether to access
+          // global/scratch/shared memory based on the high bits of vaddr,
+          // ignoring the offset field, so we have to ensure that when we add
+          // remainder to vaddr it still points into the same underlying object.
+          // The easiest way to do that is to make sure that we split the offset
+          // into two pieces that are both >= 0 or both <= 0.
+
+          SDLoc DL(N);
+          uint64_t RemainderOffset;
+
+          std::tie(OffsetVal, RemainderOffset) =
+              TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+          SDValue AddOffsetLo =
+              getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+          SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+          if (Addr.getValueType().getSizeInBits() == 32) {
+            SmallVector<SDValue, 3> Opnds;
+            Opnds.push_back(N0);
+            Opnds.push_back(AddOffsetLo);
+            unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+            if (Subtarget->hasAddNoCarry()) {
+              AddOp = AMDGPU::V_ADD_U32_e64;
+              Opnds.push_back(Clamp);
+            }
+            Addr =
+                SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+          } else {
+            // TODO: Should this try to use a scalar add pseudo if the base
+            // address is uniform and saddr is usable?
+            SDValue Sub0 =
+                CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+            SDValue Sub1 =
+                CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

+            SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                  DL, MVT::i32, N0, Sub0);
+            SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                  DL, MVT::i32, N0, Sub1);
+
+            SDValue AddOffsetHi =
+                getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+            SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+            SDNode *Add =
+                CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                       {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+            SDNode *Addc = CurDAG->getMachineNode(
+                AMDGPU::V_ADDC_U32_e64, DL, VTs,
+                {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+            SDValue RegSequenceArgs[] = {
+                CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+                                          MVT::i32),
+                SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+            Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                  MVT::i64, RegSequenceArgs),
+                           0);
+          }
         }
       }
     }
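As a minimal sketch of the remaining limitation described in the commit message (hypothetical, not one of the changed tests): a too-wide flat access is legalized by repeatedly splitting it in half, so the addresses of the later pieces are built as chains of inbounds steps. The PTRADD DAGCombines merge such a chain into a single ptradd but generally cannot keep the inbounds flag, so the offset is not folded even though it would be safe here.

```llvm
; Hypothetical wide access in the flat address space; <32 x i32> is wider than
; what a single flat load can handle, so legalization splits it repeatedly.
define <32 x i32> @wide_load(ptr %p) {
  %v = load <32 x i32>, ptr %p, align 4
  ret <32 x i32> %v
}

; The address of a late piece is then built roughly as
;   (ptradd inbounds (ptradd inbounds (ptradd inbounds %p, 64), 32), 16)
; The combines collapse this to a single (ptradd %p, 112), but without the
; inbounds flag, so the offset ends up in a register instead of the flat
; instruction's offset field.
```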
