diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b7011e0ea1669..ea83e9d12069b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24168,6 +24168,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   // Ensure that all elements' bits are either 0s or 1s.
   ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
 
+  bool IsLE = DAG.getDataLayout().isLittleEndian();
   SmallVector<SDValue, 16> MaskConstants;
   if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
       VecVT == MVT::v16i8) {
@@ -24175,7 +24176,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
     // per entry. We split it into two halves, apply the mask, zip the halves to
     // create 8x 16-bit values, and the perform the vector reduce.
     for (unsigned Half = 0; Half < 2; ++Half) {
-      for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
+      for (unsigned I = 0; I < 8; ++I) {
+        // On big-endian targets, the lane order in sub-byte vector elements
+        // gets reversed, so we need to flip the bit index.
+        unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
         MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
       }
     }
@@ -24193,8 +24197,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   }
 
   // All other vector sizes.
-  unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
-  for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
+  unsigned NumEl = VecVT.getVectorNumElements();
+  for (unsigned I = 0; I < NumEl; ++I) {
+    unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
     MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
   }
 
diff --git a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll
new file mode 100644
index 0000000000000..01c83ca220b65
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll
@@ -0,0 +1,75 @@
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE
+; RUN: llc -O3 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE
+
+define i16 @convert_to_bitmask16(<16 x i8> %vec) {
+  %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+  %bitmask = bitcast <16 x i1> %cmp_result to i16
+  ret i16 %bitmask
+}
+
+define i16 @convert_to_bitmask8(<8 x i16> %vec) {
+  %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
+  %bitmask = bitcast <8 x i1> %cmp_result to i8
+  %extended_bitmask = zext i8 %bitmask to i16
+  ret i16 %extended_bitmask
+}
+
+; Little endian
+
+; CHECK-LE-LABEL: .LCPI0_0:
+; CHECK-LE-NEXT: .byte 1
+; CHECK-LE-NEXT: .byte 2
+; CHECK-LE-NEXT: .byte 4
+; CHECK-LE-NEXT: .byte 8
+; CHECK-LE-NEXT: .byte 16
+; CHECK-LE-NEXT: .byte 32
+; CHECK-LE-NEXT: .byte 64
+; CHECK-LE-NEXT: .byte 128
+; CHECK-LE-NEXT: .byte 1
+; CHECK-LE-NEXT: .byte 2
+; CHECK-LE-NEXT: .byte 4
+; CHECK-LE-NEXT: .byte 8
+; CHECK-LE-NEXT: .byte 16
+; CHECK-LE-NEXT: .byte 32
+; CHECK-LE-NEXT: .byte 64
+; CHECK-LE-NEXT: .byte 128
+
+; CHECK-LE-LABEL: .LCPI1_0:
+; CHECK-LE-NEXT: .hword 1
+; CHECK-LE-NEXT: .hword 2
+; CHECK-LE-NEXT: .hword 4
+; CHECK-LE-NEXT: .hword 8
+; CHECK-LE-NEXT: .hword 16
+; CHECK-LE-NEXT: .hword 32
+; CHECK-LE-NEXT: .hword 64
+; CHECK-LE-NEXT: .hword 128
+
+; Big endian
+
+; CHECK-BE-LABEL: .LCPI0_0:
+; CHECK-BE-NEXT: .byte 128
+; CHECK-BE-NEXT: .byte 64
+; CHECK-BE-NEXT: .byte 32
+; CHECK-BE-NEXT: .byte 16
+; CHECK-BE-NEXT: .byte 8
+; CHECK-BE-NEXT: .byte 4
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 128
+; CHECK-BE-NEXT: .byte 64
+; CHECK-BE-NEXT: .byte 32
+; CHECK-BE-NEXT: .byte 16
+; CHECK-BE-NEXT: .byte 8
+; CHECK-BE-NEXT: .byte 4
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 1
+
+; CHECK-BE-LABEL: .LCPI1_0:
+; CHECK-BE-NEXT: .hword 128
+; CHECK-BE-NEXT: .hword 64
+; CHECK-BE-NEXT: .hword 32
+; CHECK-BE-NEXT: .hword 16
+; CHECK-BE-NEXT: .hword 8
+; CHECK-BE-NEXT: .hword 4
+; CHECK-BE-NEXT: .hword 2
+; CHECK-BE-NEXT: .hword 1