Skip to content

Conversation

@davemgreen
Copy link
Collaborator

We need the top bits to be zeroes, but an v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The instruction we create (UADDV) is known to be
zeroes in the upper bits, so we can convert to a larger v2i32 vector and
extract from there, similar to the operation currently performed for i64 types.

Fixes #140707

We need the top bits to be zeroes, but an v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The instruction we create (UADDV) is known to be
zeroes in the upper bits, so we can convert to a larger v2i32 vector and
extract from there, similar to the operation currently performed for i64 types.
@llvmbot
Copy link
Member

llvmbot commented May 20, 2025

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

We need the top bits to be zeroes, but an v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The instruction we create (UADDV) is known to be
zeroes in the upper bits, so we can convert to a larger v2i32 vector and
extract from there, similar to the operation currently performed for i64 types.

Fixes #140707


Full diff: https://github.com/llvm/llvm-project/pull/140718.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+4-7)
  • (modified) llvm/test/CodeGen/AArch64/popcount.ll (+109)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 293292d47dd48..64a422a195437 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10852,13 +10852,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
 
     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
     SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
-    if (VT == MVT::i32)
-      AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
-                         DAG.getConstant(0, DL, MVT::i64));
-    else
-      AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
-                         DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
-                         DAG.getConstant(0, DL, MVT::i64));
+    AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
+                       VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
+    AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
+                       DAG.getConstant(0, DL, MVT::i64));
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index e664e73594923..61f221988777f 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -648,4 +648,113 @@ Entry:
   ret <4 x i16> %1
 }
 
+define i32 @ctpop_into_extract(ptr %p) {
+; CHECKO0-LABEL: ctpop_into_extract:
+; CHECKO0:       // %bb.0:
+; CHECKO0-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECKO0-NEXT:    // implicit-def: $d1
+; CHECKO0-NEXT:    // implicit-def: $q0
+; CHECKO0-NEXT:    fmov d0, d1
+; CHECKO0-NEXT:    mov v0.s[0], w8
+; CHECKO0-NEXT:    fmov d2, d0
+; CHECKO0-NEXT:    ldr d0, [x0]
+; CHECKO0-NEXT:    fmov s1, s0
+; CHECKO0-NEXT:    fmov w8, s1
+; CHECKO0-NEXT:    fmov s1, w8
+; CHECKO0-NEXT:    // kill: def $d1 killed $s1
+; CHECKO0-NEXT:    cnt v1.8b, v1.8b
+; CHECKO0-NEXT:    uaddlv h1, v1.8b
+; CHECKO0-NEXT:    // kill: def $q1 killed $h1
+; CHECKO0-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT:    fmov w8, s1
+; CHECKO0-NEXT:    // implicit-def: $q1
+; CHECKO0-NEXT:    fmov d1, d2
+; CHECKO0-NEXT:    mov v1.s[1], w8
+; CHECKO0-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECKO0-NEXT:    sub v0.2s, v0.2s, v1.2s
+; CHECKO0-NEXT:    str d0, [x0]
+; CHECKO0-NEXT:    mov w0, wzr
+; CHECKO0-NEXT:    ret
+;
+; CHECK-LABEL: ctpop_into_extract:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    movi v2.2d, #0xffffffffffffffff
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    cnt v1.8b, v1.8b
+; CHECK-NEXT:    addv b1, v1.8b
+; CHECK-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-NEXT:    sub v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    ret
+;
+; BE-LABEL: ctpop_into_extract:
+; BE:       // %bb.0:
+; BE-NEXT:    ld1 { v0.2s }, [x0]
+; BE-NEXT:    movi v2.2d, #0xffffffffffffffff
+; BE-NEXT:    mov x8, x0
+; BE-NEXT:    mov w0, wzr
+; BE-NEXT:    fmov w9, s0
+; BE-NEXT:    fmov s1, w9
+; BE-NEXT:    cnt v1.8b, v1.8b
+; BE-NEXT:    addv b1, v1.8b
+; BE-NEXT:    mov v2.s[1], v1.s[0]
+; BE-NEXT:    sub v0.2s, v0.2s, v2.2s
+; BE-NEXT:    st1 { v0.2s }, [x8]
+; BE-NEXT:    ret
+;
+; GISEL-LABEL: ctpop_into_extract:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    ldr d0, [x0]
+; GISEL-NEXT:    mov w9, #-1 // =0xffffffff
+; GISEL-NEXT:    mov x8, x0
+; GISEL-NEXT:    mov v2.s[0], w9
+; GISEL-NEXT:    mov w0, wzr
+; GISEL-NEXT:    fmov w10, s0
+; GISEL-NEXT:    fmov s1, w10
+; GISEL-NEXT:    cnt v1.8b, v1.8b
+; GISEL-NEXT:    uaddlv h1, v1.8b
+; GISEL-NEXT:    mov v2.s[1], v1.s[0]
+; GISEL-NEXT:    sub v0.2s, v0.2s, v2.2s
+; GISEL-NEXT:    str d0, [x8]
+; GISEL-NEXT:    ret
+;
+; GISELO0-LABEL: ctpop_into_extract:
+; GISELO0:       // %bb.0:
+; GISELO0-NEXT:    mov w8, #-1 // =0xffffffff
+; GISELO0-NEXT:    // implicit-def: $d1
+; GISELO0-NEXT:    // implicit-def: $q0
+; GISELO0-NEXT:    fmov d0, d1
+; GISELO0-NEXT:    mov v0.s[0], w8
+; GISELO0-NEXT:    fmov d2, d0
+; GISELO0-NEXT:    ldr d0, [x0]
+; GISELO0-NEXT:    fmov s1, s0
+; GISELO0-NEXT:    fmov w8, s1
+; GISELO0-NEXT:    fmov s1, w8
+; GISELO0-NEXT:    // kill: def $d1 killed $s1
+; GISELO0-NEXT:    cnt v1.8b, v1.8b
+; GISELO0-NEXT:    uaddlv h1, v1.8b
+; GISELO0-NEXT:    // kill: def $q1 killed $h1
+; GISELO0-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT:    fmov w8, s1
+; GISELO0-NEXT:    // implicit-def: $q1
+; GISELO0-NEXT:    fmov d1, d2
+; GISELO0-NEXT:    mov v1.s[1], w8
+; GISELO0-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; GISELO0-NEXT:    sub v0.2s, v0.2s, v1.2s
+; GISELO0-NEXT:    str d0, [x0]
+; GISELO0-NEXT:    mov w0, wzr
+; GISELO0-NEXT:    ret
+  %1 = load <2 x i32>, ptr %p, align 4
+  %2 = extractelement <2 x i32> %1, i64 0
+  %3 = call i32 @llvm.ctpop.i32(i32 %2)
+  %4 = insertelement <2 x i32> <i32 -1, i32 poison>, i32 %3, i64 1
+  %5 = sub <2 x i32> %1, %4
+  store <2 x i32> %5, ptr %p, align 4
+  ret i32 0
+}
+
 declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)

Copy link
Contributor

@usha1830 usha1830 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, Thanks!

@davemgreen
Copy link
Collaborator Author

Thanks

@davemgreen davemgreen merged commit 47b89fb into llvm:main May 20, 2025
13 checks passed
@davemgreen davemgreen deleted the gh-a64-fixpopcntextracttype branch May 20, 2025 17:09
@llvm-ci
Copy link
Collaborator

llvm-ci commented May 20, 2025

LLVM Buildbot has detected a new failure on builder lldb-x86_64-debian running on lldb-x86_64-debian while building llvm at step 6 "test".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/22791

Here is the relevant piece of the build log for the reference
Step 6 (test) failure: build (failure)
...
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Python/Crashlog/parser_json.test (2942 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Python/Crashlog/no_threadState.test (2943 of 2953)
UNSUPPORTED: lldb-shell :: SymbolFile/PDB/calling-conventions-arm.test (2944 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Python/Crashlog/interactive_crashlog_json.test (2945 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Lua/breakpoint_function_callback.test (2946 of 2953)
UNSUPPORTED: lldb-shell :: Process/Windows/exception_access_violation.cpp (2947 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Lua/breakpoint_callback.test (2948 of 2953)
PASS: lldb-api :: api/multithreaded/TestMultithreaded.py (2949 of 2953)
PASS: lldb-api :: terminal/TestEditlineCompletions.py (2950 of 2953)
UNRESOLVED: lldb-api :: tools/lldb-dap/launch/TestDAP_launch.py (2951 of 2953)
******************** TEST 'lldb-api :: tools/lldb-dap/launch/TestDAP_launch.py' FAILED ********************
Script:
--
/usr/bin/python3 /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/dotest.py -u CXXFLAGS -u CFLAGS --env LLVM_LIBS_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/./lib --env LLVM_INCLUDE_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/include --env LLVM_TOOLS_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/./bin --arch x86_64 --build-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex --lldb-module-cache-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex/module-cache-lldb/lldb-api --clang-module-cache-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex/module-cache-clang/lldb-api --executable /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/lldb --compiler /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/clang --dsymutil /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/dsymutil --make /usr/bin/gmake --llvm-tools-dir /home/worker/2.0.1/lldb-x86_64-debian/build/./bin --lldb-obj-root /home/worker/2.0.1/lldb-x86_64-debian/build/tools/lldb --lldb-libs-dir /home/worker/2.0.1/lldb-x86_64-debian/build/./lib -t /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/tools/lldb-dap/launch -p TestDAP_launch.py
--
Exit Code: 1

Command Output (stdout):
--
lldb version 21.0.0git (https://github.com/llvm/llvm-project.git revision 47b89fb412ad1b9c2dfc3ca34622a4315584c6e9)
  clang revision 47b89fb412ad1b9c2dfc3ca34622a4315584c6e9
  llvm revision 47b89fb412ad1b9c2dfc3ca34622a4315584c6e9
Skipping the following test categories: ['libc++', 'dsym', 'gmodules', 'debugserver', 'objc']

--
Command Output (stderr):
--
Change dir to: /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/tools/lldb-dap/launch
runCmd: settings clear --all

output: 

runCmd: settings set symbols.enable-external-lookup false

output: 

runCmd: settings set target.inherit-tcc true

output: 

runCmd: settings set target.disable-aslr false

output: 

runCmd: settings set target.detach-on-error false

output: 

runCmd: settings set target.auto-apply-fixits false

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[AArch64] Miscompile due to 32-bit insertelement lowered to 8-bit move

4 participants