
Commit 57dc169

[LegalizeTypes] Expand 128-bit UDIV/UREM by constant via Chunk Addition
This patch improves the lowering of 128-bit unsigned division and remainder by constant (UDIV/UREM) by avoiding the fallback to a libcall (__udivti3/__umodti3) for suitable divisors. When a divisor D satisfies (1 << ChunkWidth) % D == 1, the 128-bit value is split into fixed-width chunks (e.g., 30-bit) that are summed before a narrower UDIV/UREM is applied. The transformation is based on the "remainder by summing digits" trick described in Hacker’s Delight. This fixes PR137514 for some constants.
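For intuition, here is a minimal scalar sketch of the trick, written for this summary rather than taken from the patch (rem128 is a hypothetical helper; the patch operates on split SDValues, not __int128). The precondition holds for D = 7 with ChunkWidth = 30 because 2^30 = (2^3)^10 ≡ 1 (mod 7), and the same width works for 9 since 2^6 ≡ 1 (mod 9):

#include <cassert>
#include <cstdint>

// Sketch of "remainder by summing digits": if (1 << ChunkWidth) % D == 1,
// then x mod D equals the sum of x's ChunkWidth-bit chunks, mod D.
static uint64_t rem128(unsigned __int128 x, uint64_t D, unsigned ChunkWidth) {
  const uint64_t Mask = (1ULL << ChunkWidth) - 1;
  uint64_t Sum = 0;
  for (; x != 0; x >>= ChunkWidth)
    Sum += (uint64_t)x & Mask; // each chunk is congruent to itself mod D
  return Sum % D;              // one narrow urem instead of __umodti3
}

int main() {
  unsigned __int128 x = ((unsigned __int128)0x123456789abcdef0ULL << 64) |
                        0xfedcba9876543210ULL;
  assert(rem128(x, 7, 30) == (uint64_t)(x % 7)); // 2^30 % 7 == 1
}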
1 parent a76448c commit 57dc169

8 files changed (+758, -122 lines)

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 74 additions & 2 deletions
@@ -7981,8 +7981,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
 
   // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
   // then add in the carry.
-  // TODO: If we can't split it in half, we might be able to split into 3 or
-  // more pieces using a smaller bit width.
   if (HalfMaxPlus1.urem(Divisor).isOne()) {
     assert(!LL == !LH && "Expected both input halves or no input halves!");
     if (!LL)
@@ -8030,6 +8028,80 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                           DAG.getConstant(0, dl, HiLoVT));
       Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
     }
+  } else {
+    // If we cannot split into two halves, look for a smaller chunk width
+    // where (1 << ChunkWidth) mod Divisor == 1. This ensures that the sum
+    // of all such chunks modulo Divisor is equivalent to the original value
+    // modulo Divisor.
+    const APInt &Divisor = CN->getAPIntValue();
+    unsigned BitWidth = VT.getScalarSizeInBits();
+    unsigned BestChunkWidth = 0;
+
+    // Restrict to small chunk widths (<= 32 bits) so that all intermediate
+    // operations remain legal on most targets.
+    unsigned MaxChunk = 32;
+    for (int i = MaxChunk; i >= 1; --i) {
+      APInt ChunkMaxPlus1 = APInt::getOneBitSet(BitWidth, i);
+      if (ChunkMaxPlus1.urem(Divisor).isOne()) {
+        BestChunkWidth = i;
+        break;
+      }
+    }
+
+    // If we found a good chunk width, slice the number and sum the pieces.
+    if (BestChunkWidth > 0) {
+      EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
+
+      if (!LL)
+        std::tie(LL, LH) =
+            DAG.SplitScalar(N->getOperand(0), dl, HiLoVT, HiLoVT);
+      SDValue In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+
+      SmallVector<SDValue, 8> Parts;
+      // Split the input into fixed-width chunks.
+      for (unsigned i = 0; i < BitWidth; i += BestChunkWidth) {
+        SDValue Shift = DAG.getShiftAmountConstant(i, VT, dl);
+        SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
+        Chunk = DAG.getNode(ISD::TRUNCATE, dl, ChunkVT, Chunk);
+        Parts.push_back(Chunk);
+      }
+      if (Parts.empty())
+        return false;
+      Sum = Parts[0];
+
+      // Use uaddo_carry if we can, otherwise use a compare to detect
+      // overflow; the same logic as the half-splitting case above.
+      SDValue Carry = DAG.getConstant(0, dl, ChunkVT);
+      EVT SetCCType =
+          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ChunkVT);
+      for (unsigned i = 1; i < Parts.size(); ++i) {
+        if (isOperationLegalOrCustom(ISD::UADDO_CARRY, ChunkVT)) {
+          SDVTList VTList = DAG.getVTList(ChunkVT, SetCCType);
+          SDValue UAdd = DAG.getNode(ISD::UADDO, dl, VTList, Sum, Parts[i]);
+          Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, UAdd, Carry,
+                            UAdd.getValue(1));
+        } else {
+          SDValue Add = DAG.getNode(ISD::ADD, dl, ChunkVT, Sum, Parts[i]);
+          SDValue NewCarry = DAG.getSetCC(dl, SetCCType, Add, Sum, ISD::SETULT);
+
+          if (getBooleanContents(ChunkVT) ==
+              TargetLoweringBase::ZeroOrOneBooleanContent)
+            NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
+          else
+            NewCarry = DAG.getSelect(dl, ChunkVT, NewCarry,
+                                     DAG.getConstant(1, dl, ChunkVT),
+                                     DAG.getConstant(0, dl, ChunkVT));
+
+          Sum = DAG.getNode(ISD::ADD, dl, ChunkVT, Add, Carry);
+          Carry = NewCarry;
+        }
+      }
+
+      Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, Sum);
+    } else {
+      return false;
+    }
   }
 
   // If we didn't find a sum, we can't do the expansion.
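Two steps in the new branch are easiest to see in scalar form: the search for the widest usable chunk width, and the carry handling, which folds each addition's carry-out back into the running sum. Folding is safe because 2^ChunkWidth ≡ 1 (mod D), so a wrapped 2^ChunkWidth contributes exactly 1 to the sum modulo D. A minimal scalar sketch under those assumptions (helper names are invented, not from the patch):

#include <cstdint>
#include <vector>

// Find the widest chunk width <= 32 with 2^width % D == 1, as the loop
// above does with APInt; returns 0 when no usable width exists and the
// expansion has to bail out.
static unsigned findChunkWidth(uint64_t D) {
  for (unsigned W = 32; W >= 1; --W)
    if ((1ULL << W) % D == 1)
      return W;
  return 0;
}

// Sum the chunks, folding each carry-out back in ("end-around carry").
// Expects a non-empty Parts vector of values below 2^W.
static uint64_t foldChunks(const std::vector<uint64_t> &Parts, unsigned W) {
  const uint64_t Mask = (1ULL << W) - 1;
  uint64_t Sum = Parts[0];
  for (size_t I = 1; I < Parts.size(); ++I) {
    uint64_t T = Sum + Parts[I];    // may carry past W bits
    Sum = (T & Mask) + (T >> W);    // fold the carry back in
  }
  return Sum; // congruent to the original value modulo D
}

For D = 7 or D = 9, findChunkWidth returns 30, which matches the 30-bit masks (0x3FFFFFFF) visible in the RISC-V tests below.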

llvm/test/CodeGen/RISCV/div-by-constant.ll

Lines changed: 70 additions & 10 deletions
@@ -115,16 +115,76 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
 }
 
 define i64 @udiv64_constant_add(i64 %a) nounwind {
-; RV32-LABEL: udiv64_constant_add:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 7
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32IM-LABEL: udiv64_constant_add:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    lui a2, 262144
+; RV32IM-NEXT:    slli a3, a1, 2
+; RV32IM-NEXT:    srli a4, a0, 30
+; RV32IM-NEXT:    srli a5, a1, 28
+; RV32IM-NEXT:    lui a6, 149797
+; RV32IM-NEXT:    addi a2, a2, -1
+; RV32IM-NEXT:    or a3, a4, a3
+; RV32IM-NEXT:    and a4, a0, a2
+; RV32IM-NEXT:    add a3, a0, a3
+; RV32IM-NEXT:    add a5, a3, a5
+; RV32IM-NEXT:    and a3, a3, a2
+; RV32IM-NEXT:    sltu a3, a3, a4
+; RV32IM-NEXT:    lui a4, 449390
+; RV32IM-NEXT:    add a3, a5, a3
+; RV32IM-NEXT:    lui a5, 748983
+; RV32IM-NEXT:    addi a6, a6, -1755
+; RV32IM-NEXT:    addi a4, a4, -1171
+; RV32IM-NEXT:    addi a5, a5, -585
+; RV32IM-NEXT:    and a2, a3, a2
+; RV32IM-NEXT:    mulhu a3, a2, a6
+; RV32IM-NEXT:    slli a6, a3, 3
+; RV32IM-NEXT:    add a2, a2, a3
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    sub a3, a0, a2
+; RV32IM-NEXT:    sltu a0, a0, a2
+; RV32IM-NEXT:    mul a2, a3, a4
+; RV32IM-NEXT:    mulhu a4, a3, a5
+; RV32IM-NEXT:    sub a1, a1, a0
+; RV32IM-NEXT:    add a2, a4, a2
+; RV32IM-NEXT:    mul a1, a1, a5
+; RV32IM-NEXT:    add a1, a2, a1
+; RV32IM-NEXT:    mul a0, a3, a5
+; RV32IM-NEXT:    ret
+;
+; RV32IMZB-LABEL: udiv64_constant_add:
+; RV32IMZB:       # %bb.0:
+; RV32IMZB-NEXT:    srli a2, a0, 30
+; RV32IMZB-NEXT:    srli a3, a1, 28
+; RV32IMZB-NEXT:    lui a4, 786432
+; RV32IMZB-NEXT:    slli a5, a0, 2
+; RV32IMZB-NEXT:    lui a6, 149797
+; RV32IMZB-NEXT:    sh2add a2, a1, a2
+; RV32IMZB-NEXT:    srli a5, a5, 2
+; RV32IMZB-NEXT:    add a2, a0, a2
+; RV32IMZB-NEXT:    add a3, a2, a3
+; RV32IMZB-NEXT:    andn a2, a2, a4
+; RV32IMZB-NEXT:    sltu a2, a2, a5
+; RV32IMZB-NEXT:    lui a5, 449390
+; RV32IMZB-NEXT:    add a2, a3, a2
+; RV32IMZB-NEXT:    lui a3, 748983
+; RV32IMZB-NEXT:    addi a6, a6, -1755
+; RV32IMZB-NEXT:    addi a5, a5, -1171
+; RV32IMZB-NEXT:    addi a3, a3, -585
+; RV32IMZB-NEXT:    andn a2, a2, a4
+; RV32IMZB-NEXT:    mulhu a4, a2, a6
+; RV32IMZB-NEXT:    slli a6, a4, 3
+; RV32IMZB-NEXT:    add a2, a2, a4
+; RV32IMZB-NEXT:    sub a2, a2, a6
+; RV32IMZB-NEXT:    sub a4, a0, a2
+; RV32IMZB-NEXT:    sltu a0, a0, a2
+; RV32IMZB-NEXT:    mul a2, a4, a5
+; RV32IMZB-NEXT:    mulhu a5, a4, a3
+; RV32IMZB-NEXT:    sub a1, a1, a0
+; RV32IMZB-NEXT:    add a2, a5, a2
+; RV32IMZB-NEXT:    mul a1, a1, a3
+; RV32IMZB-NEXT:    add a1, a2, a1
+; RV32IMZB-NEXT:    mul a0, a4, a3
+; RV32IMZB-NEXT:    ret
 ;
 ; RV64-LABEL: udiv64_constant_add:
 ; RV64:       # %bb.0:
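In the RV32IM body above, `lui a6, 149797` / `addi a6, a6, -1755` materializes 0x24924925 = ceil(2^32 / 7), so the `mulhu` divides the masked 30-bit chunk sum by 7 without a division instruction; the quotient q then yields the remainder as sum + q - 8q. A small sampled check of the multiply-high step (written for this note, not part of the test):

#include <cassert>
#include <cstdint>

int main() {
  // 0x24924925 == ceil(2^32 / 7). mulhu with it is exact for all x < 2^32/3,
  // which covers the chunk sum here since it is masked to 30 bits first.
  const uint64_t Magic = 0x24924925; // lui 149797; addi -1755
  for (uint64_t x = 0; x < (1ULL << 30); x += 9973) // sampled, not exhaustive
    assert((x * Magic) >> 32 == x / 7);
}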

llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll

Lines changed: 155 additions & 28 deletions
@@ -117,24 +117,89 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
 define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_7:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 7
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    lui a2, 262144
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    srli a4, a0, 30
+; RV32-NEXT:    srli a5, a1, 28
+; RV32-NEXT:    lui a6, 149797
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    or a3, a4, a3
+; RV32-NEXT:    and a4, a0, a2
+; RV32-NEXT:    add a3, a0, a3
+; RV32-NEXT:    add a5, a3, a5
+; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    sltu a3, a3, a4
+; RV32-NEXT:    lui a4, 449390
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    lui a5, 748983
+; RV32-NEXT:    addi a6, a6, -1755
+; RV32-NEXT:    addi a4, a4, -1171
+; RV32-NEXT:    addi a5, a5, -585
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    mulhu a3, a2, a6
+; RV32-NEXT:    slli a6, a3, 3
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub a2, a2, a6
+; RV32-NEXT:    sub a3, a0, a2
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a3, a4
+; RV32-NEXT:    mulhu a4, a3, a5
+; RV32-NEXT:    sub a1, a1, a0
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a3, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_7:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 7
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __udivti3
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    srli a3, a0, 60
+; RV64-NEXT:    slli a4, a1, 34
+; RV64-NEXT:    srli a5, a0, 30
+; RV64-NEXT:    lui a6, 262144
+; RV64-NEXT:    srli a7, a1, 26
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    lui a3, 748983
+; RV64-NEXT:    or a4, a5, a4
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    addi a3, a3, -585
+; RV64-NEXT:    add a4, a0, a4
+; RV64-NEXT:    slli a5, a3, 33
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    and a5, a0, a6
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    and a4, a4, a6
+; RV64-NEXT:    sltu a5, a4, a5
+; RV64-NEXT:    add a5, a2, a5
+; RV64-NEXT:    and a2, a2, a6
+; RV64-NEXT:    sltu a2, a2, a4
+; RV64-NEXT:    srli a4, a1, 56
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    lui a4, %hi(.LCPI2_0)
+; RV64-NEXT:    add a7, a5, a7
+; RV64-NEXT:    and a5, a5, a6
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    and a7, a7, a6
+; RV64-NEXT:    sltu a5, a7, a5
+; RV64-NEXT:    lui a7, %hi(.LCPI2_1)
+; RV64-NEXT:    ld a4, %lo(.LCPI2_0)(a4)
+; RV64-NEXT:    ld a7, %lo(.LCPI2_1)(a7)
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    and a2, a2, a6
+; RV64-NEXT:    mulhu a4, a2, a4
+; RV64-NEXT:    slli a5, a4, 3
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    sub a2, a2, a5
+; RV64-NEXT:    sub a4, a0, a2
+; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a4, a7
+; RV64-NEXT:    mulhu a5, a4, a3
+; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a5, a2
+; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a4, a3
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 7
   ret iXLen2 %a
@@ -143,24 +208,86 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
 define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_9:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 9
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    lui a2, 262144
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    srli a4, a0, 30
+; RV32-NEXT:    srli a5, a1, 28
+; RV32-NEXT:    lui a6, 233017
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    or a3, a4, a3
+; RV32-NEXT:    and a4, a0, a2
+; RV32-NEXT:    add a3, a0, a3
+; RV32-NEXT:    add a5, a3, a5
+; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    sltu a3, a3, a4
+; RV32-NEXT:    lui a4, 582542
+; RV32-NEXT:    addi a6, a6, -455
+; RV32-NEXT:    addi a4, a4, 910
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    mulhu a3, a2, a6
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    slli a5, a3, 3
+; RV32-NEXT:    sub a2, a2, a3
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    sub a3, a0, a2
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a3, a4
+; RV32-NEXT:    mulhu a4, a3, a6
+; RV32-NEXT:    sub a1, a1, a0
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    mul a1, a1, a6
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a3, a6
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_9:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 9
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __udivti3
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 4
+; RV64-NEXT:    srli a3, a0, 60
+; RV64-NEXT:    slli a4, a1, 34
+; RV64-NEXT:    srli a5, a0, 30
+; RV64-NEXT:    lui a6, 262144
+; RV64-NEXT:    srli a7, a1, 26
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a3, a1, 56
+; RV64-NEXT:    or a4, a5, a4
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    add a4, a0, a4
+; RV64-NEXT:    and a5, a0, a6
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    and a4, a4, a6
+; RV64-NEXT:    sltu a5, a4, a5
+; RV64-NEXT:    add a5, a2, a5
+; RV64-NEXT:    and a2, a2, a6
+; RV64-NEXT:    sltu a2, a2, a4
+; RV64-NEXT:    lui a4, %hi(.LCPI3_0)
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    lui a3, %hi(.LCPI3_1)
+; RV64-NEXT:    add a7, a5, a7
+; RV64-NEXT:    and a5, a5, a6
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    and a7, a7, a6
+; RV64-NEXT:    sltu a5, a7, a5
+; RV64-NEXT:    lui a7, %hi(.LCPI3_2)
+; RV64-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64-NEXT:    ld a3, %lo(.LCPI3_1)(a3)
+; RV64-NEXT:    ld a7, %lo(.LCPI3_2)(a7)
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    and a2, a2, a6
+; RV64-NEXT:    mulhu a4, a2, a4
+; RV64-NEXT:    slli a5, a4, 3
+; RV64-NEXT:    sub a2, a2, a4
+; RV64-NEXT:    sub a2, a2, a5
+; RV64-NEXT:    sub a4, a0, a2
+; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a4, a3
+; RV64-NEXT:    mulhu a3, a4, a7
+; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    mul a1, a1, a7
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a4, a7
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 9
   ret iXLen2 %a
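Both test bodies end with a `mul`/`mulhu` tail rather than a division: once the remainder is known, x - rem is an exact multiple of the divisor, so the wide quotient can be recovered with a multiply by the divisor's modular inverse (0xB6DB6DB7, materialized by `lui 748983; addi -585`, is 7^-1 mod 2^32). A 64-bit sketch of that idea, with constants checked by hand rather than taken from the test:

#include <cassert>
#include <cstdint>

int main() {
  // Exact division: x - (x % 7) is a multiple of 7, so one plain multiply
  // by the inverse of 7 modulo 2^64 recovers the quotient.
  const uint64_t Inv7 = 0x6DB6DB6DB6DB6DB7ULL; // 7 * Inv7 == 1 (mod 2^64)
  uint64_t x = 0x123456789abcdef0ULL;
  uint64_t rem = x % 7; // the expansion gets this from the chunk sum
  assert((x - rem) * Inv7 == x / 7);
}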
