Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 47 additions & 5 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
EVT DstVT = N->getValueType(0);
ISD::MemIndexedMode AM = LD->getAddressingMode();
bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
int OffsetVal = (int)OffsetOp->getZExtValue();

// We're not doing validity checking here. That was done when checking
// if we should mark the load as indexed or not. We're just selecting
Expand Down Expand Up @@ -1637,18 +1639,58 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
} else if (VT == MVT::f64 ||
(VT.is64BitVector() && Subtarget->isLittleEndian())) {
Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
} else if (VT.is128BitVector()) {
} else if (VT.is128BitVector() && Subtarget->isLittleEndian()) {
Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
} else if (VT.is64BitVector()) {
if (IsPre || OffsetVal != 8)
return false;
switch (VT.getScalarSizeInBits()) {
case 8:
Opcode = AArch64::LD1Onev8b_POST;
break;
case 16:
Opcode = AArch64::LD1Onev4h_POST;
break;
case 32:
Opcode = AArch64::LD1Onev2s_POST;
break;
case 64:
Opcode = AArch64::LD1Onev1d_POST;
break;
default:
llvm_unreachable("Expected vector element to be a power of 2");
}
} else if (VT.is128BitVector()) {
if (IsPre || OffsetVal != 16)
return false;
switch (VT.getScalarSizeInBits()) {
case 8:
Opcode = AArch64::LD1Onev16b_POST;
break;
case 16:
Opcode = AArch64::LD1Onev8h_POST;
break;
case 32:
Opcode = AArch64::LD1Onev4s_POST;
break;
case 64:
Opcode = AArch64::LD1Onev2d_POST;
break;
default:
llvm_unreachable("Expected vector element to be a power of 2");
}
} else
return false;
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
int OffsetVal = (int)OffsetOp->getZExtValue();
SDLoc dl(N);
SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
// LD1 encodes an immediate offset by using XZR as the offset register.
SDValue Offset = (VT.isVector() && !Subtarget->isLittleEndian())
? CurDAG->getRegister(AArch64::XZR, MVT::i64)
: CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
SDValue Ops[] = { Base, Offset, Chain };
SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
MVT::Other, Ops);
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2108,12 +2108,18 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);

// When little-endian, we can use ordinary D- and Q-register loads/stores for
// vector types, but when big-endian, we need to use the structure load/store
// instructions, which only allow post-index addressing.
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
}
} else {
setIndexedLoadAction(ISD::POST_INC, VT, Legal);
setIndexedStoreAction(ISD::POST_INC, VT, Legal);
}

if (Subtarget->hasD128()) {
Expand Down Expand Up @@ -27067,6 +27073,12 @@ bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
RHSC = -(uint64_t)RHSC;
if (!isInt<9>(RHSC))
return false;
// On big-endian, LD1/ST1 are used for vector loads and stores, and these
// only allow an offset that is equal to the store size.
EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
if (!Subtarget->isLittleEndian() && MemType.isVector() &&
RHSC != MemType.getStoreSize())
return false;
// Always emit pre-inc/post-inc addressing mode. Use negated constant offset
// when dealing with subtraction.
Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
Expand Down
84 changes: 51 additions & 33 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -4942,39 +4942,42 @@ def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
def : Pat<(post_store (bf16 FPR16:$Rt), GPR64sp:$addr, simm9:$off),
(STRHpost FPR16:$Rt, GPR64sp:$addr, simm9:$off)>;

def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;

def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
// Post-indexed D/Q-register stores of vector types. These patterns are valid
// only on little-endian targets: STRD/STRQ write the register bytes as-is,
// which matches the vector's in-memory element order only when little-endian.
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(post_store(v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;

def : Pat<(post_store(v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store(v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
}

//===----------------------------------------------------------------------===//
// Load/store exclusive instructions.
Expand Down Expand Up @@ -8925,6 +8928,21 @@ def : St1Pat<v4i16, ST1Onev4h>;
def : St1Pat<v2i32, ST1Onev2s>;
def : St1Pat<v1i64, ST1Onev1d>;

// Post-indexed vector store selected as ST1. The ST1 post-index instruction
// encodes its immediate form by using XZR as the offset register, and that
// form's increment is fixed to the transfer size — so 'off' must equal the
// store size (8 bytes for 64-bit vectors, 16 bytes for 128-bit vectors).
class St1PostPat<ValueType ty, Instruction INST, int off>
: Pat<(post_store ty:$Vt, GPR64sp:$Rn, (i64 off)),
(INST ty:$Vt, GPR64sp:$Rn, XZR)>;

// Big-endian only: ST1 stores element-by-element in memory order, whereas the
// STRD/STRQ patterns above are restricted to little-endian.
let Predicates = [IsBE] in {
def : St1PostPat<v16i8, ST1Onev16b_POST, 16>;
def : St1PostPat<v8i16, ST1Onev8h_POST, 16>;
def : St1PostPat<v4i32, ST1Onev4s_POST, 16>;
def : St1PostPat<v2i64, ST1Onev2d_POST, 16>;
def : St1PostPat<v8i8, ST1Onev8b_POST, 8>;
def : St1PostPat<v4i16, ST1Onev4h_POST, 8>;
def : St1PostPat<v2i32, ST1Onev2s_POST, 8>;
def : St1PostPat<v1i64, ST1Onev1d_POST, 8>;
}

//---
// Single-element
//---
Expand Down
170 changes: 170 additions & 0 deletions llvm/test/CodeGen/AArch64/vector-ldst-offset.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s --check-prefixes=CHECK-LE
; RUN: llc -mtriple=aarch64_be < %s -o - | FileCheck %s --check-prefixes=CHECK-BE

; Check that we use the correct offset mode for vector loads and stores. In
; particular, for big-endian we use ld1/st1, which only allow a post-index
; immediate offset equal to the memory access size.
; FIXME: Currently we fail to make use of postindex register offset ld1/st1.

; Post-increment by 8 bytes, which equals the <4 x i16> access size: LE folds
; it into post-indexed ldr/str, and BE can use the ld1/st1 post-index
; immediate form (whose increment must equal the transfer size).
define [2 x ptr] @postidx_same_size([2 x ptr] %x) {
; CHECK-LE-LABEL: postidx_same_size:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ldr d0, [x0], #8
; CHECK-LE-NEXT: str d0, [x1], #8
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: postidx_same_size:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: ld1 { v0.4h }, [x0], #8
; CHECK-BE-NEXT: st1 { v0.4h }, [x1], #8
; CHECK-BE-NEXT: ret
entry:
%ldptr = extractvalue [2 x ptr] %x, 0
%stptr = extractvalue [2 x ptr] %x, 1
%val = load <4 x i16>, ptr %ldptr, align 2
store <4 x i16> %val, ptr %stptr, align 2
%add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
%add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
%ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
%ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
ret [2 x ptr] %ret2
}

; Pre-increment by the access size: LE uses pre-indexed ldr/str, but ld1/st1
; have no pre-index form, so BE keeps the address increment as separate adds.
define [2 x ptr] @preidx_same_size([2 x ptr] %x) {
; CHECK-LE-LABEL: preidx_same_size:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ldr d0, [x0, #8]!
; CHECK-LE-NEXT: str d0, [x1, #8]!
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: preidx_same_size:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: add x0, x0, #8
; CHECK-BE-NEXT: add x1, x1, #8
; CHECK-BE-NEXT: ld1 { v0.4h }, [x0]
; CHECK-BE-NEXT: st1 { v0.4h }, [x1]
; CHECK-BE-NEXT: ret
entry:
%ldptr = extractvalue [2 x ptr] %x, 0
%stptr = extractvalue [2 x ptr] %x, 1
%add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
%add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
%val = load <4 x i16>, ptr %add.ldptr, align 2
store <4 x i16> %val, ptr %add.stptr, align 2
%ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
%ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
ret [2 x ptr] %ret2
}

; Post-increment by 16 bytes, larger than the 8-byte access: LE's simm9
; post-index ldr/str still fold it, but BE cannot, because the ld1/st1
; post-index immediate must equal the transfer size.
define [2 x ptr] @postidx_different_size([2 x ptr] %x) {
; CHECK-LE-LABEL: postidx_different_size:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ldr d0, [x0], #16
; CHECK-LE-NEXT: str d0, [x1], #16
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: postidx_different_size:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: ld1 { v0.4h }, [x0]
; CHECK-BE-NEXT: mov x8, x1
; CHECK-BE-NEXT: add x0, x0, #16
; CHECK-BE-NEXT: add x1, x1, #16
; CHECK-BE-NEXT: st1 { v0.4h }, [x8]
; CHECK-BE-NEXT: ret
entry:
%ldptr = extractvalue [2 x ptr] %x, 0
%stptr = extractvalue [2 x ptr] %x, 1
%val = load <4 x i16>, ptr %ldptr, align 2
store <4 x i16> %val, ptr %stptr, align 2
%add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
%add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
%ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
%ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
ret [2 x ptr] %ret2
}

; Pre-increment by 16 bytes (larger than the 8-byte access): LE folds it into
; pre-indexed ldr/str; BE has no pre-index ld1/st1, so the adds stay separate.
define [2 x ptr] @preidx_different_size([2 x ptr] %x) {
; CHECK-LE-LABEL: preidx_different_size:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ldr d0, [x0, #16]!
; CHECK-LE-NEXT: str d0, [x1, #16]!
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: preidx_different_size:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: add x0, x0, #16
; CHECK-BE-NEXT: add x1, x1, #16
; CHECK-BE-NEXT: ld1 { v0.4h }, [x0]
; CHECK-BE-NEXT: st1 { v0.4h }, [x1]
; CHECK-BE-NEXT: ret
entry:
%ldptr = extractvalue [2 x ptr] %x, 0
%stptr = extractvalue [2 x ptr] %x, 1
%add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
%add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
%val = load <4 x i16>, ptr %add.ldptr, align 2
store <4 x i16> %val, ptr %add.stptr, align 2
%ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
%ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
ret [2 x ptr] %ret2
}

; Post-increment by a register amount: not currently folded on either
; endianness (see the FIXME at the top of this file about the register-offset
; ld1/st1 post-index forms).
define [2 x ptr] @postidx_reg([2 x ptr] %x, i64 %off) {
; CHECK-LE-LABEL: postidx_reg:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: ldr d0, [x0]
; CHECK-LE-NEXT: mov x8, x1
; CHECK-LE-NEXT: add x0, x0, x2
; CHECK-LE-NEXT: add x1, x1, x2
; CHECK-LE-NEXT: str d0, [x8]
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: postidx_reg:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: ld1 { v0.4h }, [x0]
; CHECK-BE-NEXT: mov x8, x1
; CHECK-BE-NEXT: add x0, x0, x2
; CHECK-BE-NEXT: add x1, x1, x2
; CHECK-BE-NEXT: st1 { v0.4h }, [x8]
; CHECK-BE-NEXT: ret
entry:
%ldptr = extractvalue [2 x ptr] %x, 0
%stptr = extractvalue [2 x ptr] %x, 1
%val = load <4 x i16>, ptr %ldptr, align 2
store <4 x i16> %val, ptr %stptr, align 2
%add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
%add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
%ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
%ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
ret [2 x ptr] %ret2
}

; Pre-increment by a register amount: LE can at least use the register-offset
; addressing mode for the access itself; BE falls back to explicit adds
; because ld1/st1 take only a base register.
define [2 x ptr] @preidx_reg([2 x ptr] %x, i64 %off) {
; CHECK-LE-LABEL: preidx_reg:
; CHECK-LE: // %bb.0: // %entry
; CHECK-LE-NEXT: mov x8, x1
; CHECK-LE-NEXT: ldr d0, [x0, x2]
; CHECK-LE-NEXT: add x0, x0, x2
; CHECK-LE-NEXT: add x1, x1, x2
; CHECK-LE-NEXT: str d0, [x8, x2]
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: preidx_reg:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: add x0, x0, x2
; CHECK-BE-NEXT: add x1, x1, x2
; CHECK-BE-NEXT: ld1 { v0.4h }, [x0]
; CHECK-BE-NEXT: st1 { v0.4h }, [x1]
; CHECK-BE-NEXT: ret
entry:
%ldptr = extractvalue [2 x ptr] %x, 0
%stptr = extractvalue [2 x ptr] %x, 1
%add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
%add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
%val = load <4 x i16>, ptr %add.ldptr, align 2
store <4 x i16> %val, ptr %add.stptr, align 2
%ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
%ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
ret [2 x ptr] %ret2
}
Loading
Loading