
Commit 9430fcd

Major change to the patch: the histogram node now shares more code with the masked
gather and scatter combines, and the instruction is also removed when its mask is known to be all zero.
1 parent 7c61336 commit 9430fcd
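As a rough illustration of both points in the message (this helper is hypothetical and not part of the diff; the real logic is DAGCombiner::visitMHISTOGRAM below): once MaskedHistogramSDNode derives from MaskedGatherScatterSDNode, generic code can reach the shared gather/scatter accessors through the base class, and a histogram whose mask is a known all-zeros splat can be folded to its chain, which deletes the node. A minimal sketch, assuming the usual LLVM SelectionDAG headers:

// Hypothetical illustration only; the in-tree version is visitMHISTOGRAM below.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static SDValue foldZeroMaskHistogram(SDNode *N) {
  // classof() now also accepts EXPERIMENTAL_VECTOR_HISTOGRAM, so the shared
  // accessors (getMask, getIndex, getScale, getIndexType, ...) are available.
  auto *MGS = dyn_cast<MaskedGatherScatterSDNode>(N);
  if (!MGS || MGS->getOpcode() != ISD::EXPERIMENTAL_VECTOR_HISTOGRAM)
    return SDValue();
  // A histogram update under an all-false mask is a no-op, so returning the
  // chain lets the combiner erase the node.
  if (ISD::isConstantSplatVectorAllZeros(MGS->getMask().getNode()))
    return MGS->getChain();
  return SDValue();
}

The AArch64 combine below leans on the same base class so gathers, scatters, and histograms share one index-refinement path.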

4 files changed, +234 -58 lines changed

llvm/include/llvm/CodeGen/SelectionDAGNodes.h

Lines changed: 5 additions & 5 deletions
@@ -2935,8 +2935,8 @@ class MaskedGatherScatterSDNode : public MemSDNode {
   const SDValue &getScale() const { return getOperand(5); }

   static bool classof(const SDNode *N) {
-    return N->getOpcode() == ISD::MGATHER ||
-           N->getOpcode() == ISD::MSCATTER;
+    return N->getOpcode() == ISD::MGATHER || N->getOpcode() == ISD::MSCATTER ||
+           N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM;
   }
 };

@@ -2991,15 +2991,15 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
   }
 };

-class MaskedHistogramSDNode : public MemSDNode {
+class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
 public:
   friend class SelectionDAG;

   MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
                         EVT MemVT, MachineMemOperand *MMO,
                         ISD::MemIndexType IndexType)
-      : MemSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL, VTs, MemVT,
-                  MMO) {
+      : MaskedGatherScatterSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL,
+                                  VTs, MemVT, MMO, IndexType) {
     LSBaseSDNodeBits.AddressingMode = IndexType;
   }

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 33 additions & 0 deletions
@@ -549,6 +549,7 @@ namespace {
     SDValue visitMSTORE(SDNode *N);
     SDValue visitMGATHER(SDNode *N);
     SDValue visitMSCATTER(SDNode *N);
+    SDValue visitMHISTOGRAM(SDNode *N);
     SDValue visitVPGATHER(SDNode *N);
     SDValue visitVPSCATTER(SDNode *N);
     SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1972,6 +1973,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::MLOAD: return visitMLOAD(N);
   case ISD::MSCATTER: return visitMSCATTER(N);
   case ISD::MSTORE: return visitMSTORE(N);
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
   case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
   case ISD::LIFETIME_END: return visitLIFETIME_END(N);
   case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
@@ -12353,6 +12355,37 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
   return SDValue();
 }

+SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
+  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
+  SDValue Chain = HG->getChain();
+  SDValue Inc = HG->getInc();
+  SDValue Mask = HG->getMask();
+  SDValue BasePtr = HG->getBasePtr();
+  SDValue Index = HG->getIndex();
+  SDLoc DL(HG);
+
+  EVT MemVT = HG->getMemoryVT();
+  MachineMemOperand *MMO = HG->getMemOperand();
+  ISD::MemIndexType IndexType = HG->getIndexType();
+
+  if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) {
+    return Chain;
+  }
+  SDValue Ops[] = {Chain, Inc, Mask, BasePtr, Index,
+                   HG->getScale(), HG->getIntID()};
+  if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL)) {
+    return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
+                                  MMO, IndexType);
+  }
+  EVT DataVT = Index.getValueType();
+  DataVT.changeVectorElementType(Inc.getValueType());
+  if (refineIndexType(Index, IndexType, DataVT, DAG)) {
+    return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
+                                  MMO, IndexType);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
   auto *SLD = cast<VPStridedLoadSDNode>(N);
   EVT EltVT = SLD->getValueType(0).getVectorElementType();

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 14 additions & 25 deletions
@@ -24082,24 +24082,6 @@ static SDValue performMaskedGatherScatterCombine(
     SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
   if (!DCI.isBeforeLegalize())
     return SDValue();
-
-  if (N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM) {
-    MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
-
-    SDValue Index = HG->getIndex();
-    if (!ISD::isExtOpcode(Index->getOpcode())) {
-      return SDValue();
-    }
-    SDLoc DL(HG);
-    SDValue ExtOp = Index.getOperand(0);
-    SDValue Ops[] = {HG->getChain(), HG->getInc(), HG->getMask(),
-                     HG->getBasePtr(), ExtOp, HG->getScale(),
-                     HG->getIntID()};
-    return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
-                                  DL, Ops, HG->getMemOperand(),
-                                  HG->getIndexType());
-  }
-
   MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);

   SDLoc DL(MGS);
@@ -24110,8 +24092,9 @@ static SDValue performMaskedGatherScatterCombine(
   SDValue BasePtr = MGS->getBasePtr();
   ISD::MemIndexType IndexType = MGS->getIndexType();

-  if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
+  if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG)) {
     return SDValue();
+  }

   // Here we catch such cases early and change MGATHER's IndexType to allow
   // the use of an Index that's more legalisation friendly.
@@ -24122,12 +24105,18 @@ static SDValue performMaskedGatherScatterCombine(
         DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
         Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
   }
-  auto *MSC = cast<MaskedScatterSDNode>(MGS);
-  SDValue Data = MSC->getValue();
-  SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
-  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
-                              Ops, MSC->getMemOperand(), IndexType,
-                              MSC->isTruncatingStore());
+  if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
+    SDValue Data = MSC->getValue();
+    SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
+                                DL, Ops, MSC->getMemOperand(), IndexType,
+                                MSC->isTruncatingStore());
+  }
+  auto *HG = cast<MaskedHistogramSDNode>(MGS);
+  SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
+                   Index, Scale, HG->getIntID()};
+  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
+                                DL, Ops, HG->getMemOperand(), IndexType);
 }

 /// Target-specific DAG combine function for NEON load/store intrinsics

llvm/test/CodeGen/AArch64/sve2-histcnt.ll

Lines changed: 182 additions & 28 deletions
@@ -267,8 +267,56 @@ define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %i
   ret void
 }

-define void @histogram_i32_zextend(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
-; CHECK-LABEL: histogram_i32_zextend:
+define void @histogram_i8_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0{
+; CHECK-LABEL: histogram_i8_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, w1
+; CHECK-NEXT: ld1b { z2.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1b { z1.s }, p0, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i16_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0{
+; CHECK-LABEL: histogram_i16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, w1
+; CHECK-NEXT: ld1h { z2.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1h { z1.s }, p0, [x0, z0.s, uxtw #1]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_sext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_sext:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT: mov z3.s, #1 // =0x1
@@ -277,53 +325,142 @@ define void @histogram_i32_zextend(ptr %base, <vscale x 4 x i32> %indices, <vsca
 ; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
 ; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT: ret
-  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
   %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
   call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
   ret void
 }

-define void @histogram_i32_8_lane_zextend(ptr %base, <vscale x 8 x i32> %indices, i32 %inc, <vscale x 8 x i1> %mask) #0 {
-; CHECK-LABEL: histogram_i32_8_lane_zextend:
+define void @histogram_zext_from_i8_to_i64(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zext_from_i8_to_i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: punpklo p1.h, p0.b
-; CHECK-NEXT: mov z4.s, w1
-; CHECK-NEXT: ptrue p2.s
-; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
-; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
-; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
-; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
-; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
-; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT: ret
-  %extended = zext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
-  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
-  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 %inc, <vscale x 8 x i1> %mask)
+  %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i64>
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
   ret void
 }
-define void @histogram_i32_sextend(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0{
-; CHECK-LABEL: histogram_i32_sextend:
+
+define void @histogram_zext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zext_from_i16_to_i64:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_sext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_sext_from_i16_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: sxth z0.s, p1/m, z0.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
-; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
 ; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT: ret
-  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+  %extended = sext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
   %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
   call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
   ret void
 }
-define void @histogram_i32_8_lane_sextend(ptr %base, <vscale x 8 x i32> %indices, i32 %inc, <vscale x 8 x i1> %mask) #0 {
-; CHECK-LABEL: histogram_i32_8_lane_sextend:
+
+define void @histogram_zext_from_i8_to_i32(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zext_from_i8_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i32>
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_zext_from_i16_to_i32(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_zext_from_i16_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: mov z3.s, #1 // =0x1
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i32>
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_2_lane_zext(ptr %base, <vscale x 2 x i32> %indices, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_2_lane_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: mov z3.d, #1 // =0x1
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1w { z2.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: histcnt z1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: mad z1.d, p1/m, z3.d, z2.d
+; CHECK-NEXT: st1w { z1.d }, p0, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 2 x i32> %indices to <vscale x 2 x i64>
+  %buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 1, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @histogram_8_lane_zext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
+; CHECK-LABEL: histogram_8_lane_zext:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: punpklo p1.h, p0.b
-; CHECK-NEXT: mov z4.s, w1
+; CHECK-NEXT: mov z4.s, #1 // =0x1
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, uxtw #2]
+; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
+; CHECK-NEXT: ret
+  %extended = zext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
+  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @histogram_8_lane_sext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
+; CHECK-LABEL: histogram_8_lane_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.s, #1 // =0x1
 ; CHECK-NEXT: ptrue p2.s
 ; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
 ; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
@@ -337,9 +474,26 @@ define void @histogram_i32_8_lane_sextend(ptr %base, <vscale x 8 x i32> %indices
 ; CHECK-NEXT: ret
   %extended = sext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
   %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
-  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 %inc, <vscale x 8 x i1> %mask)
+  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
   ret void
 }

+define void @histogram_zero_mask(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0{
+; CHECK-LABEL: histogram_zero_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> zeroinitializer)
+  ret void
+}
+
+define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0{
+; CHECK-LABEL: histogram_sext_zero_mask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> zeroinitializer)
+  ret void
+}

 attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
