; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s

; FIXME: Currently, we avoid narrowing this v4i32 load, in the
; hopes of being able to fold the shift, despite it requiring stack
; storage + loads. Ideally, we should narrow here and load the i32
; directly from the variable offset, e.g.:
;
; add x8, x0, x1, lsl #4
; and x9, x2, #0x3
; ldr w0, [x8, x9, lsl #2]
;
; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
; probably be updated to choose load-narrowing instead of folding the
; lsl in larger vector cases.
;
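; For reference only (not part of this test's RUN/CHECK lines): a minimal
; sketch, in scalar IR, of the narrowed form described above, where the
; element address is computed directly and only a single i32 is loaded.
; The function name and the explicit and/zext of the lane index are
; assumptions made for this illustration.
;
;   define i32 @narrow_load_v4_i32_single_ele_variable_idx_sketch(ptr %ptr, i64 %off, i32 %ele) {
;   entry:
;     %vec.addr = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
;     %lane = and i32 %ele, 3
;     %lane.ext = zext i32 %lane to i64
;     %ele.addr = getelementptr inbounds i32, ptr %vec.addr, i64 %lane.ext
;     %res = load i32, ptr %ele.addr, align 4
;     ret i32 %res
;   }
;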
define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr q0, [x0, x1, lsl #4]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    bfi x8, x2, #2, #2
; CHECK-NEXT:    str q0, [sp]
; CHECK-NEXT:    ldr w0, [x8]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
entry:
  %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off
  %x = load <4 x i32>, ptr %idx, align 8
  %res = extractelement <4 x i32> %x, i32 %ele
  ret i32 %res
}