From ce5fcadfdeb492e95cd81dcb9f98152194743f56 Mon Sep 17 00:00:00 2001
From: Lewis Crawford
Date: Mon, 14 Oct 2024 11:03:37 +0000
Subject: [PATCH] [DAGCombiner] Fix check for extending loads

Fix a check for extending loads in DAGCombiner: if the result type has
more bits than the loaded type, the load should count as an extending
load. All backends apart from AArch64 ignore the ExtTy argument to
shouldReduceLoadWidth, so this change currently only impacts AArch64.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 .../AArch64/aarch64-scalarize-vec-load-ext.ll | 35 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0879165aac139..ca91d35573c3e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22568,7 +22568,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
     return SDValue();
 
   ISD::LoadExtType ExtTy =
-      ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+      ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
   if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
       !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
new file mode 100644
index 0000000000000..30ce0cb09fc08
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; FIXME: Currently, we avoid narrowing this v4i32 load, in the
+; hopes of being able to fold the shift, despite it requiring stack
+; storage + loads. Ideally, we should narrow here and load the i32
+; directly from the variable offset, e.g.:
+;
+; add x8, x0, x1, lsl #4
+; and x9, x2, #0x3
+; ldr w0, [x8, x9, lsl #2]
+;
+; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
+; probably be updated to choose load-narrowing instead of folding the
+; lsl in larger vector cases.
+;
+define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
+; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT:    bfi x8, x2, #2, #2
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
+  %x = load <4 x i32>, ptr %idx, align 8
+  %res = extractelement <4 x i32> %x, i32 %ele
+  ret i32 %res
+}
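
For context, a minimal IR sketch (illustrative only, not part of the
commit; the function name, types, and constants are hypothetical) of
the case the corrected check classifies as an extending load:
extracting an i8 element that is consumed as an i32 means the
scalarized element load's result type (i32) has more bits than the
loaded type (i8), so scalarizeExtractedVectorLoad should pass
ISD::EXTLOAD, not ISD::NON_EXTLOAD, to TLI.shouldReduceLoadWidth.

; Hypothetical sketch, not a test from this patch. On AArch64, i8 is
; not a legal scalar type, so the extractelement result is promoted to
; an i32 EXTRACT_VECTOR_ELT; narrowing the <8 x i8> load down to the
; single extracted element then yields an extending i8 load.
define i32 @extract_zext_sketch(ptr %p, i32 %i) {
  %v = load <8 x i8>, ptr %p, align 8
  %e = extractelement <8 x i8> %v, i32 %i
  %z = zext i8 %e to i32
  ret i32 %z
}

With the inverted check, a load like this was reported to
shouldReduceLoadWidth as ISD::NON_EXTLOAD (and genuinely non-extending
loads as ISD::EXTLOAD), so AArch64's ExtTy-sensitive heuristic saw the
wrong classification in both directions.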