diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0879165aac139..ca91d35573c3e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22568,7 +22568,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
     return SDValue();
 
   ISD::LoadExtType ExtTy =
-      ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+      ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
   if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
       !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
new file mode 100644
index 0000000000000..30ce0cb09fc08
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; FIXME: Currently, we avoid narrowing this v4i32 load, in the
+; hopes of being able to fold the shift, despite it requiring stack
+; storage + loads. Ideally, we should narrow here and load the i32
+; directly from the variable offset e.g:
+;
+;   add x8, x0, x1, lsl #4
+;   and x9, x2, #0x3
+;   ldr w0, [x8, x9, lsl #2]
+;
+; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
+; probably be updated to choose load-narrowing instead of folding the
+; lsl in larger vector cases.
+;
+define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
+; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT:    bfi x8, x2, #2, #2
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
+  %x = load <4 x i32>, ptr %idx, align 8
+  %res = extractelement <4 x i32> %x, i32 %ele
+  ret i32 %res
+}
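
Not part of the patch: a rough sketch of the direction the test's FIXME suggests for the AArch64TargetLowering::shouldReduceLoadWidth override in AArch64ISelLowering.cpp. The 128-bit threshold and the placement relative to the existing shift-folding check are assumptions for illustration, not a tested implementation.

// Hypothetical sketch only: prefer narrowing single-element extracts of wide
// fixed-length vector loads, even when that stops the address shift (lsl)
// from being folded into the wide load's addressing mode.
bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                  ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
    return false;

  EVT MemVT = cast<MemSDNode>(Load)->getMemoryVT();

  // Assumption: for 128-bit (or wider) fixed vectors, a narrow scalar load is
  // cheaper than a q-register load plus a stack round-trip for the extract,
  // so allow narrowing before the shift-folding heuristic can veto it.
  if (MemVT.isFixedLengthVector() && MemVT.getFixedSizeInBits() >= 128)
    return true;

  // ... the existing shift-folding heuristic would remain for smaller cases.
  return true;
}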