Skip to content

Commit e91f504

Browse files
davemgreen authored and krishna2803 committed
[AggressiveInstCombine] Make cttz fold more resilient to non-array geps (llvm#150896)
Similar to llvm#150639, this fixes the AggressiveInstCombine fold that converts tables to cttz instructions when the gep types are not array types, i.e. `gep i16 @glob, i64 %idx` instead of `gep [64 x i16] @glob, i64 0, i64 %idx`.
1 parent f067f19 commit e91f504

File tree

4 files changed

+305
-63
lines changed

4 files changed

+305
-63
lines changed

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 45 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -458,29 +458,19 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
458458
// Check if this array of constants represents a cttz table.
459459
// Iterate over the elements from \p Table by trying to find/match all
460460
// the numbers from 0 to \p InputBits that should represent cttz results.
461-
static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
462-
uint64_t Shift, uint64_t InputBits) {
463-
unsigned Length = Table.getNumElements();
464-
if (Length < InputBits || Length > InputBits * 2)
465-
return false;
466-
467-
APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
468-
unsigned Matched = 0;
469-
470-
for (unsigned i = 0; i < Length; i++) {
471-
uint64_t Element = Table.getElementAsInteger(i);
472-
if (Element >= InputBits)
473-
continue;
474-
475-
// Check if \p Element matches a concrete answer. It could fail for some
476-
// elements that are never accessed, so we keep iterating over each element
477-
// from the table. The number of matched elements should be equal to the
478-
// number of potential right answers which is \p InputBits actually.
479-
if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
480-
Matched++;
461+
static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
462+
const APInt &AndMask, Type *AccessTy,
463+
unsigned InputBits, const APInt &GEPIdxFactor,
464+
const DataLayout &DL) {
465+
for (unsigned Idx = 0; Idx < InputBits; Idx++) {
466+
APInt Index = (APInt(InputBits, 1).shl(Idx) * Mul).lshr(Shift) & AndMask;
467+
ConstantInt *C = dyn_cast_or_null<ConstantInt>(
468+
ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
469+
if (!C || C->getValue() != Idx)
470+
return false;
481471
}
482472

483-
return Matched == InputBits;
473+
return true;
484474
}
485475

486476
// Try to recognize table-based ctz implementation.
@@ -495,6 +485,11 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
495485
// this can be lowered to `cttz` instruction.
496486
// There is also a special case when the element is 0.
497487
//
488+
// The (x & -x) isolates the lowest set bit. The multiplier is a de Bruijn
489+
// sequence that contains each pattern of bits in it. The shift extracts
490+
// the top bits after the multiply, and that index into the table should
491+
// represent the number of trailing zeros in the original number.
492+
//
498493
// Here are some examples of LLVM IR for a 64-bit target:
499494
//
500495
// CASE 1:
@@ -536,8 +531,8 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
536531
// i64 %shr
537532
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
538533
//
539-
// All this can be lowered to @llvm.cttz.i32/64 intrinsic.
540-
static bool tryToRecognizeTableBasedCttz(Instruction &I) {
534+
// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
535+
static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
541536
LoadInst *LI = dyn_cast<LoadInst>(&I);
542537
if (!LI)
543538
return false;
@@ -547,53 +542,47 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
547542
return false;
548543

549544
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
550-
if (!GEP || !GEP->hasNoUnsignedSignedWrap() || GEP->getNumIndices() != 2)
551-
return false;
552-
553-
if (!GEP->getSourceElementType()->isArrayTy())
554-
return false;
555-
556-
uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
557-
if (ArraySize != 32 && ArraySize != 64)
545+
if (!GEP || !GEP->hasNoUnsignedSignedWrap())
558546
return false;
559547

560548
GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
561549
if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
562550
return false;
563551

564-
ConstantDataArray *ConstData =
565-
dyn_cast<ConstantDataArray>(GVTable->getInitializer());
566-
if (!ConstData)
567-
return false;
568-
569-
if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
552+
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
553+
APInt ModOffset(BW, 0);
554+
SmallMapVector<Value *, APInt, 4> VarOffsets;
555+
if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
556+
VarOffsets.size() != 1 || ModOffset != 0)
570557
return false;
558+
auto [GepIdx, GEPScale] = VarOffsets.front();
571559

572-
Value *Idx2 = std::next(GEP->idx_begin())->get();
573560
Value *X1;
574-
uint64_t MulConst, ShiftConst;
575-
// FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
576-
// probably fail for other (e.g. 32-bit) targets.
577-
if (!match(Idx2, m_ZExtOrSelf(
578-
m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
579-
m_ConstantInt(MulConst)),
580-
m_ConstantInt(ShiftConst)))))
561+
const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
562+
// Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
563+
// This might be extended to the pointer index type, and if the gep index type
564+
// has been replaced with an i8 then a new And (and different ShiftConst) will
565+
// be present.
566+
auto MatchInner = m_LShr(
567+
m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
568+
m_APInt(ShiftConst));
569+
if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
570+
!match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
581571
return false;
582572

583573
unsigned InputBits = X1->getType()->getScalarSizeInBits();
584-
if (InputBits != 32 && InputBits != 64)
585-
return false;
586-
587-
// Shift should extract top 5..7 bits.
588-
if (InputBits - Log2_32(InputBits) != ShiftConst &&
589-
InputBits - Log2_32(InputBits) - 1 != ShiftConst)
574+
if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
590575
return false;
591576

592-
if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
577+
if (!GEPScale.isIntN(InputBits) ||
578+
!isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
579+
AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
580+
InputBits, GEPScale.zextOrTrunc(InputBits), DL))
593581
return false;
594582

595-
auto ZeroTableElem = ConstData->getElementAsInteger(0);
596-
bool DefinedForZero = ZeroTableElem == InputBits;
583+
ConstantInt *ZeroTableElem = cast<ConstantInt>(
584+
ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
585+
bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
597586

598587
IRBuilder<> B(LI);
599588
ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
@@ -607,8 +596,7 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
607596
// If the value in elem 0 isn't the same as InputBits, we still want to
608597
// produce the value from the table.
609598
auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
610-
auto Select =
611-
B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);
599+
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
612600

613601
// NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
614602
// it should be handled as: `cttz(x) & (typeSize - 1)`.
@@ -1477,7 +1465,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
14771465
MadeChange |= foldGuardedFunnelShift(I, DT);
14781466
MadeChange |= tryToRecognizePopCount(I);
14791467
MadeChange |= tryToFPToSat(I, TTI);
1480-
MadeChange |= tryToRecognizeTableBasedCttz(I);
1468+
MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
14811469
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
14821470
MadeChange |= foldPatternedLoads(I, DL);
14831471
MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);

llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,39 @@ return: ; preds = %entry, %if.end
190190
ret i32 %retval.0
191191
}
192192

193+
define i32 @ctz3_with_i8gep(i32 %x) {
194+
; CHECK-LABEL: @ctz3_with_i8gep(
195+
; CHECK-NEXT: entry:
196+
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
197+
; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
198+
; CHECK: if.end:
199+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[X]], i1 true)
200+
; CHECK-NEXT: br label [[RETURN]]
201+
; CHECK: return:
202+
; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[TMP2]], [[IF_END]] ], [ 32, [[ENTRY:%.*]] ]
203+
; CHECK-NEXT: ret i32 [[RETVAL_0]]
204+
;
205+
entry:
206+
%cmp = icmp eq i32 %x, 0
207+
br i1 %cmp, label %return, label %if.end
208+
209+
if.end: ; preds = %entry
210+
%sub = sub i32 0, %x
211+
%and = and i32 %x, %sub
212+
%mul = mul i32 %and, 81224991
213+
%0 = lshr i32 %mul, 25
214+
%1 = and i32 %0, 124
215+
%arrayidx.idx = zext nneg i32 %1 to i64
216+
%arrayidx = getelementptr inbounds nuw i8, ptr @ctz3.table, i64 %arrayidx.idx
217+
%2 = load i32, ptr %arrayidx, align 4
218+
br label %return
219+
220+
return: ; preds = %if.end, %entry
221+
%retval.0 = phi i32 [ %2, %if.end ], [ 32, %entry ]
222+
ret i32 %retval.0
223+
}
224+
225+
193226
@table = internal unnamed_addr constant [64 x i32] [i32 0, i32 1, i32 12, i32 2, i32 13, i32 22, i32 17, i32 3, i32 14, i32 33, i32 23, i32 36, i32 18, i32 58, i32 28, i32 4, i32 62, i32 15, i32 34, i32 26, i32 24, i32 48, i32 50, i32 37, i32 19, i32 55, i32 59, i32 52, i32 29, i32 44, i32 39, i32 5, i32 63, i32 11, i32 21, i32 16, i32 32, i32 35, i32 57, i32 27, i32 61, i32 25, i32 47, i32 49, i32 54, i32 51, i32 43, i32 38, i32 10, i32 20, i32 31, i32 56, i32 60, i32 46, i32 53, i32 42, i32 9, i32 30, i32 45, i32 41, i32 8, i32 40, i32 7, i32 6], align 4
194227

195228
define i32 @ctz4(i64 %b) {
@@ -276,3 +309,192 @@ entry:
276309
%0 = load i32, ptr %arrayidx, align 4
277310
ret i32 %0
278311
}
312+
313+
;; This has a wrong table size but is otherwise fine.
314+
@ctz9.table = internal unnamed_addr constant [128 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
315+
define i32 @ctz9(i32 %x) {
316+
; CHECK-LABEL: @ctz9(
317+
; CHECK-NEXT: entry:
318+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
319+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
320+
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
321+
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
322+
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
323+
; CHECK-NEXT: ret i32 [[CONV]]
324+
;
325+
entry:
326+
%sub = sub i32 0, %x
327+
%and = and i32 %sub, %x
328+
%mul = mul i32 %and, 125613361
329+
%shr = lshr i32 %mul, 27
330+
%idxprom = zext i32 %shr to i64
331+
%arrayidx = getelementptr inbounds [128 x i8], ptr @ctz9.table, i64 0, i64 %idxprom
332+
%0 = load i8, ptr %arrayidx, align 1
333+
%conv = zext i8 %0 to i32
334+
ret i32 %conv
335+
}
336+
337+
define i32 @ctz1_with_i8_gep(i32 %x) {
338+
; CHECK-LABEL: @ctz1_with_i8_gep(
339+
; CHECK-NEXT: entry:
340+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
341+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
342+
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
343+
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
344+
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
345+
; CHECK-NEXT: ret i32 [[CONV]]
346+
;
347+
entry:
348+
%sub = sub i32 0, %x
349+
%and = and i32 %sub, %x
350+
%mul = mul i32 %and, 125613361
351+
%shr = lshr i32 %mul, 27
352+
%idxprom = zext i32 %shr to i64
353+
%arrayidx = getelementptr inbounds i8, ptr @ctz7.table, i64 %idxprom
354+
%0 = load i8, ptr %arrayidx, align 1
355+
%conv = zext i8 %0 to i32
356+
ret i32 %conv
357+
}
358+
359+
; This is the same as ctz2 (i16 table) with an i8 gep making the indices invalid
360+
define i32 @ctz2_with_i8_gep(i32 %x) {
361+
; CHECK-LABEL: @ctz2_with_i8_gep(
362+
; CHECK-NEXT: entry:
363+
; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[X:%.*]]
364+
; CHECK-NEXT: [[AND:%.*]] = and i32 [[SUB]], [[X]]
365+
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[AND]], 72416175
366+
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], 26
367+
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
368+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
369+
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
370+
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
371+
; CHECK-NEXT: ret i32 [[CONV]]
372+
;
373+
entry:
374+
%sub = sub i32 0, %x
375+
%and = and i32 %sub, %x
376+
%mul = mul i32 %and, 72416175
377+
%shr = lshr i32 %mul, 26
378+
%idxprom = zext i32 %shr to i64
379+
%arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
380+
%0 = load i16, ptr %arrayidx, align 1
381+
%conv = sext i16 %0 to i32
382+
ret i32 %conv
383+
}
384+
385+
; This is the same as ctz2_with_i8_gep but with the gep index multiplied by 2.
386+
define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
387+
; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
388+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
389+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
390+
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32
391+
; CHECK-NEXT: ret i32 [[CONV]]
392+
;
393+
%sub = sub i32 0, %x
394+
%and = and i32 %x, %sub
395+
%mul = mul i32 %and, 72416175
396+
%shr = lshr i32 %mul, 25
397+
%shr2 = and i32 %shr, 126
398+
%1 = zext nneg i32 %shr2 to i64
399+
%arrayidx = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 %1
400+
%2 = load i16, ptr %arrayidx, align 2
401+
%conv = sext i16 %2 to i32
402+
ret i32 %conv
403+
}
404+
405+
; This is an i16 input with the de Bruijn table stored in a single i128.
406+
@tablei128 = internal unnamed_addr constant i128 16018378897745984667142067713738932480, align 16
407+
define i32 @cttz_i16_via_i128(i16 noundef %x) {
408+
; CHECK-LABEL: @cttz_i16_via_i128(
409+
; CHECK-NEXT: entry:
410+
; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true)
411+
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[X]], 0
412+
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP0]]
413+
; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[TMP2]] to i8
414+
; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP1]] to i32
415+
; CHECK-NEXT: ret i32 [[CONV6]]
416+
;
417+
entry:
418+
%sub = sub i16 0, %x
419+
%and = and i16 %x, %sub
420+
%mul = mul i16 %and, 2479
421+
%0 = lshr i16 %mul, 12
422+
%idxprom = zext nneg i16 %0 to i64
423+
%arrayidx = getelementptr inbounds nuw i8, ptr @tablei128, i64 %idxprom
424+
%1 = load i8, ptr %arrayidx, align 1
425+
%conv6 = zext i8 %1 to i32
426+
ret i32 %conv6
427+
}
428+
429+
; Same as above but the table is a little off
430+
@tablei128b = internal unnamed_addr constant i128 16018378897745984667142068813250560256, align 16
431+
define i32 @cttz_i16_via_i128_incorrecttable(i16 noundef %x) {
432+
; CHECK-LABEL: @cttz_i16_via_i128_incorrecttable(
433+
; CHECK-NEXT: entry:
434+
; CHECK-NEXT: [[SUB:%.*]] = sub i16 0, [[X:%.*]]
435+
; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], [[SUB]]
436+
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[AND]], 2479
437+
; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[MUL]], 12
438+
; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i16 [[TMP0]] to i64
439+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @tablei128b, i64 [[IDXPROM]]
440+
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
441+
; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP3]] to i32
442+
; CHECK-NEXT: ret i32 [[CONV6]]
443+
;
444+
entry:
445+
%sub = sub i16 0, %x
446+
%and = and i16 %x, %sub
447+
%mul = mul i16 %and, 2479
448+
%0 = lshr i16 %mul, 12
449+
%idxprom = zext nneg i16 %0 to i64
450+
%arrayidx = getelementptr inbounds nuw i8, ptr @tablei128b, i64 %idxprom
451+
%1 = load i8, ptr %arrayidx, align 1
452+
%conv6 = zext i8 %1 to i32
453+
ret i32 %conv6
454+
}
455+
456+
; Same as ctz1 but the table and load is very large
457+
@ctz7i128.table = internal unnamed_addr constant [32 x i128] [i128 0, i128 1, i128 28, i128 2, i128 29, i128 14, i128 24, i128 3, i128 30, i128 22, i128 20, i128 15, i128 25, i128 17, i128 4, i128 8, i128 31, i128 27, i128 13, i128 23, i128 21, i128 19, i128 16, i128 7, i128 26, i128 12, i128 18, i128 6, i128 11, i128 5, i128 10, i128 9], align 16
458+
define i128 @ctz1_i128(i32 %x) {
459+
; CHECK-LABEL: @ctz1_i128(
460+
; CHECK-NEXT: entry:
461+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
462+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
463+
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
464+
; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i128
465+
; CHECK-NEXT: ret i128 [[TMP3]]
466+
;
467+
entry:
468+
%sub = sub i32 0, %x
469+
%and = and i32 %sub, %x
470+
%mul = mul i32 %and, 125613361
471+
%shr = lshr i32 %mul, 27
472+
%idxprom = zext i32 %shr to i64
473+
%arrayidx = getelementptr inbounds [32 x i128], ptr @ctz7i128.table, i64 0, i64 %idxprom
474+
%l = load i128, ptr %arrayidx, align 1
475+
ret i128 %l
476+
}
477+
478+
; This is roughly the same as ctz1 but using i128.
479+
@table.i128 = internal unnamed_addr constant [128 x i8] c"\00\01e\02tf<\03|ug^R=!\04}yvWoh_5ZSE>0\22\14\05~rzPwmX.pkiI`K6\1Ab[TBMF?'81*#\1C\15\0E\06\7Fds;{]Q xVn4YD/\13qOl-jHJ\19aAL&7)\1B\0Dc:\\\1FU3C\12N,G\18@%(\0C9\1E2\11+\17$\0B\1D\10\16\0A\0F\09\08\07", align 1
480+
define i32 @src(i128 noundef %x) {
481+
; CHECK-LABEL: @src(
482+
; CHECK-NEXT: entry:
483+
; CHECK-NEXT: [[TMP3:%.*]] = call i128 @llvm.cttz.i128(i128 [[X:%.*]], i1 true)
484+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i128 [[X]], 0
485+
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i128 0, i128 [[TMP3]]
486+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i128 [[TMP2]] to i8
487+
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
488+
; CHECK-NEXT: ret i32 [[CONV]]
489+
;
490+
entry:
491+
%sub = sub i128 0, %x
492+
%and = and i128 %x, %sub
493+
%mul = mul i128 %and, 2647824804797170443043024478319300753
494+
%shr = lshr i128 %mul, 121
495+
%idxprom = trunc i128 %shr to i64
496+
%arrayidx = getelementptr inbounds nuw i8, ptr @table.i128, i64 %idxprom
497+
%0 = load i8, ptr %arrayidx, align 1
498+
%conv = zext i8 %0 to i32
499+
ret i32 %conv
500+
}

0 commit comments

Comments
 (0)