From d294d467a75d99f6f4eccd462f9aa303c1f4c5e9 Mon Sep 17 00:00:00 2001 From: Giuseppe Cesarano Date: Mon, 1 Sep 2025 13:04:57 +0200 Subject: [PATCH 1/3] [AArch64] Fix vectorToScalarBitmask BE (#156312) --- .../Target/AArch64/AArch64ISelLowering.cpp | 11 ++- .../AArch64/vector-to-scalar-bitmask.ll | 89 +++++++++++++++++++ 2 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b7011e0ea1669..ea83e9d12069b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24168,6 +24168,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { // Ensure that all elements' bits are either 0s or 1s. ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT); + bool IsLE = DAG.getDataLayout().isLittleEndian(); SmallVector MaskConstants; if (DAG.getSubtarget().isNeonAvailable() && VecVT == MVT::v16i8) { @@ -24175,7 +24176,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { // per entry. We split it into two halves, apply the mask, zip the halves to // create 8x 16-bit values, and the perform the vector reduce. for (unsigned Half = 0; Half < 2; ++Half) { - for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) { + for (unsigned I = 0; I < 8; ++I) { + // On big-endian targets, the lane order in sub-byte vector elements + // gets reversed, so we need to flip the bit index. + unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I)); MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32)); } } @@ -24193,8 +24197,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { } // All other vector sizes. - unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1); - for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) { + unsigned NumEl = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumEl; ++I) { + unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I)); MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64)); } diff --git a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll new file mode 100644 index 0000000000000..59c8b7389db54 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll @@ -0,0 +1,89 @@ +; RUN: llc -O0 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE +; RUN: llc -O0 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE + +@haystack4 = internal unnamed_addr constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 +@haystack16 = internal unnamed_addr constant [16 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15], align 16 + + +define i8 @test4() { + %matches = alloca <4 x i1>, align 1 + %index_ptr = alloca i64, align 8 + store i64 0, ptr %index_ptr, align 8 + %index_val = load i64, ptr %index_ptr, align 8 + %haystack = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @haystack4, i64 0), i64 %index_val + %h_vec = load <4 x i32>, ptr %haystack, align 4 + %cmp_vec = icmp eq <4 x i32> %h_vec, + store <4 x i1> %cmp_vec, ptr %matches, align 1 + %cmp_load = load <4 x i1>, ptr %matches, align 1 + %extr = extractelement <4 x i1> %cmp_load, i64 2 + %ret = zext i1 %extr to i8 + ret i8 %ret +} + +define i8 @test16() { + %matches = alloca <16 x i1>, align 2 + %index_ptr = alloca i64, align 8 + store i64 0, ptr %index_ptr, align 8 + %index_val = load i64, ptr %index_ptr, align 8 + %haystack = getelementptr inbounds i8, ptr getelementptr inbounds (i8, ptr @haystack16, i64 0), i64 %index_val + %h_vec = load <16 x i8>, ptr %haystack, align 16 + %cmp_vec = icmp eq <16 x i8> %h_vec, + store <16 x i1> %cmp_vec, ptr %matches, align 2 + %cmp_load = load <16 x i1>, ptr %matches, align 2 + %extr = extractelement <16 x i1> %cmp_load, i64 7 + %ret = zext i1 %extr to i8 + ret i8 %ret +} + +; Little endian + +; CHECK-LE-LABEL: .LCPI0_0: +; CHECK-LE-NEXT: .word 1 +; CHECK-LE-NEXT: .word 2 +; CHECK-LE-NEXT: .word 4 +; CHECK-LE-NEXT: .word 8 + +; CHECK-LE-LABEL: .LCPI1_0: +; CHECK-LE-NEXT: .byte 1 +; CHECK-LE-NEXT: .byte 2 +; CHECK-LE-NEXT: .byte 4 +; CHECK-LE-NEXT: .byte 8 +; CHECK-LE-NEXT: .byte 16 +; CHECK-LE-NEXT: .byte 32 +; CHECK-LE-NEXT: .byte 64 +; CHECK-LE-NEXT: .byte 128 +; CHECK-LE-NEXT: .byte 1 +; CHECK-LE-NEXT: .byte 2 +; CHECK-LE-NEXT: .byte 4 +; CHECK-LE-NEXT: .byte 8 +; CHECK-LE-NEXT: .byte 16 +; CHECK-LE-NEXT: .byte 32 +; CHECK-LE-NEXT: .byte 64 +; CHECK-LE-NEXT: .byte 128 + + +; Big endian + +; CHECK-BE-LABEL: .LCPI0_0: +; CHECK-BE-NEXT: .word 8 +; CHECK-BE-NEXT: .word 4 +; CHECK-BE-NEXT: .word 2 +; CHECK-BE-NEXT: .word 1 + +; CHECK-BE-LABEL: .LCPI1_0: +; CHECK-BE-NEXT: .byte 128 +; CHECK-BE-NEXT: .byte 64 +; CHECK-BE-NEXT: .byte 32 +; CHECK-BE-NEXT: .byte 16 +; CHECK-BE-NEXT: .byte 8 +; CHECK-BE-NEXT: .byte 4 +; CHECK-BE-NEXT: .byte 2 +; CHECK-BE-NEXT: .byte 1 +; CHECK-BE-NEXT: .byte 128 +; CHECK-BE-NEXT: .byte 64 +; CHECK-BE-NEXT: .byte 32 +; CHECK-BE-NEXT: .byte 16 +; CHECK-BE-NEXT: .byte 8 +; CHECK-BE-NEXT: .byte 4 +; CHECK-BE-NEXT: .byte 2 +; CHECK-BE-NEXT: .byte 1 From 1fd14af96f933edb43a7550b6cef601ea80aca6e Mon Sep 17 00:00:00 2001 From: Giuseppe Cesarano Date: Wed, 3 Sep 2025 11:47:22 +0200 Subject: [PATCH 2/3] Made test not require -O0 --- .../test/CodeGen/AArch64/vector-to-scalar-bitmask.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll index 59c8b7389db54..6f4081787551f 100644 --- a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE -; RUN: llc -O0 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE +; RUN: llc -O3 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE @haystack4 = internal unnamed_addr constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 @haystack16 = internal unnamed_addr constant [16 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15], align 16 @@ -13,8 +13,8 @@ define i8 @test4() { %haystack = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @haystack4, i64 0), i64 %index_val %h_vec = load <4 x i32>, ptr %haystack, align 4 %cmp_vec = icmp eq <4 x i32> %h_vec, - store <4 x i1> %cmp_vec, ptr %matches, align 1 - %cmp_load = load <4 x i1>, ptr %matches, align 1 + store volatile <4 x i1> %cmp_vec, ptr %matches, align 1 + %cmp_load = load volatile <4 x i1>, ptr %matches, align 1 %extr = extractelement <4 x i1> %cmp_load, i64 2 %ret = zext i1 %extr to i8 ret i8 %ret @@ -28,8 +28,8 @@ define i8 @test16() { %haystack = getelementptr inbounds i8, ptr getelementptr inbounds (i8, ptr @haystack16, i64 0), i64 %index_val %h_vec = load <16 x i8>, ptr %haystack, align 16 %cmp_vec = icmp eq <16 x i8> %h_vec, - store <16 x i1> %cmp_vec, ptr %matches, align 2 - %cmp_load = load <16 x i1>, ptr %matches, align 2 + store volatile <16 x i1> %cmp_vec, ptr %matches, align 2 + %cmp_load = load volatile <16 x i1>, ptr %matches, align 2 %extr = extractelement <16 x i1> %cmp_load, i64 7 %ret = zext i1 %extr to i8 ret i8 %ret From e4dbcae172e5532edeb04ae546e4ba70ad78f211 Mon Sep 17 00:00:00 2001 From: Giuseppe Cesarano Date: Thu, 4 Sep 2025 23:02:43 +0200 Subject: [PATCH 3/3] Removed compile time known constants from the test --- .../AArch64/vector-to-scalar-bitmask.ll | 70 ++++++++----------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll index 6f4081787551f..01c83ca220b65 100644 --- a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll @@ -1,49 +1,22 @@ ; RUN: llc -O3 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE ; RUN: llc -O3 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE -@haystack4 = internal unnamed_addr constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 -@haystack16 = internal unnamed_addr constant [16 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15], align 16 - - -define i8 @test4() { - %matches = alloca <4 x i1>, align 1 - %index_ptr = alloca i64, align 8 - store i64 0, ptr %index_ptr, align 8 - %index_val = load i64, ptr %index_ptr, align 8 - %haystack = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @haystack4, i64 0), i64 %index_val - %h_vec = load <4 x i32>, ptr %haystack, align 4 - %cmp_vec = icmp eq <4 x i32> %h_vec, - store volatile <4 x i1> %cmp_vec, ptr %matches, align 1 - %cmp_load = load volatile <4 x i1>, ptr %matches, align 1 - %extr = extractelement <4 x i1> %cmp_load, i64 2 - %ret = zext i1 %extr to i8 - ret i8 %ret +define i16 @convert_to_bitmask16(<16 x i8> %vec) { + %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer + %bitmask = bitcast <16 x i1> %cmp_result to i16 + ret i16 %bitmask } -define i8 @test16() { - %matches = alloca <16 x i1>, align 2 - %index_ptr = alloca i64, align 8 - store i64 0, ptr %index_ptr, align 8 - %index_val = load i64, ptr %index_ptr, align 8 - %haystack = getelementptr inbounds i8, ptr getelementptr inbounds (i8, ptr @haystack16, i64 0), i64 %index_val - %h_vec = load <16 x i8>, ptr %haystack, align 16 - %cmp_vec = icmp eq <16 x i8> %h_vec, - store volatile <16 x i1> %cmp_vec, ptr %matches, align 2 - %cmp_load = load volatile <16 x i1>, ptr %matches, align 2 - %extr = extractelement <16 x i1> %cmp_load, i64 7 - %ret = zext i1 %extr to i8 - ret i8 %ret +define i16 @convert_to_bitmask8(<8 x i16> %vec) { + %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer + %bitmask = bitcast <8 x i1> %cmp_result to i8 + %extended_bitmask = zext i8 %bitmask to i16 + ret i16 %extended_bitmask } ; Little endian ; CHECK-LE-LABEL: .LCPI0_0: -; CHECK-LE-NEXT: .word 1 -; CHECK-LE-NEXT: .word 2 -; CHECK-LE-NEXT: .word 4 -; CHECK-LE-NEXT: .word 8 - -; CHECK-LE-LABEL: .LCPI1_0: ; CHECK-LE-NEXT: .byte 1 ; CHECK-LE-NEXT: .byte 2 ; CHECK-LE-NEXT: .byte 4 @@ -61,16 +34,19 @@ define i8 @test16() { ; CHECK-LE-NEXT: .byte 64 ; CHECK-LE-NEXT: .byte 128 +; CHECK-LE-LABEL: .LCPI1_0: +; CHECK-LE-NEXT: .hword 1 +; CHECK-LE-NEXT: .hword 2 +; CHECK-LE-NEXT: .hword 4 +; CHECK-LE-NEXT: .hword 8 +; CHECK-LE-NEXT: .hword 16 +; CHECK-LE-NEXT: .hword 32 +; CHECK-LE-NEXT: .hword 64 +; CHECK-LE-NEXT: .hword 128 ; Big endian ; CHECK-BE-LABEL: .LCPI0_0: -; CHECK-BE-NEXT: .word 8 -; CHECK-BE-NEXT: .word 4 -; CHECK-BE-NEXT: .word 2 -; CHECK-BE-NEXT: .word 1 - -; CHECK-BE-LABEL: .LCPI1_0: ; CHECK-BE-NEXT: .byte 128 ; CHECK-BE-NEXT: .byte 64 ; CHECK-BE-NEXT: .byte 32 @@ -87,3 +63,13 @@ define i8 @test16() { ; CHECK-BE-NEXT: .byte 4 ; CHECK-BE-NEXT: .byte 2 ; CHECK-BE-NEXT: .byte 1 + +; CHECK-BE-LABEL: .LCPI1_0: +; CHECK-BE-NEXT: .hword 128 +; CHECK-BE-NEXT: .hword 64 +; CHECK-BE-NEXT: .hword 32 +; CHECK-BE-NEXT: .hword 16 +; CHECK-BE-NEXT: .hword 8 +; CHECK-BE-NEXT: .hword 4 +; CHECK-BE-NEXT: .hword 2 +; CHECK-BE-NEXT: .hword 1