Commit 0e85d30

[AArch64] Optimize DUP of extending loads to avoid GPR->FPR transfer
Load the data directly into a SIMD register, sparing a GPR and a potentially costly GPR->FPR transfer.
1 parent: 6345222
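
To see the effect at the source level, here is a minimal sketch (not part of the commit; the function name and compile flags are illustrative, assuming something like clang -O2 --target=aarch64-linux-gnu):

#include <arm_neon.h>
#include <stdint.h>

// Broadcast a zero-extended byte across all four i32 lanes -- the pattern
// this commit targets.
uint32x4_t broadcast_u8(const uint8_t *p) {
  // Before: ldrb w8, [x0]         (load into a GPR)
  //         dup  v0.4s, w8        (GPR -> FPR transfer)
  // After:  ldr  b0, [x0]         (load straight into a SIMD register)
  //         dup  v0.4s, v0.s[0]
  return vdupq_n_u32(*p);
}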

File tree: 5 files changed (+297, -15 lines)


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 25 additions & 1 deletion
@@ -21737,6 +21737,7 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
+
   if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
       N0.getOpcode() == AArch64ISD::DUP) {
     SDValue Op = N0.getOperand(0);
@@ -26632,11 +26633,34 @@ static SDValue performDUPCombine(SDNode *N,
   }
 
   if (N->getOpcode() == AArch64ISD::DUP) {
+    SDValue Op = N->getOperand(0);
+
+    // Optimize DUP(extload/zextload i8/i16) to avoid GPR->FPR transfer.
+    // For example:
+    //   v4i32 = DUP (i32 (zextloadi8 addr))
+    // =>
+    //   v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
+    //   v4i32 = DUPLANE32 (v4i32), 0
+    if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
+      ISD::LoadExtType ExtType = LD->getExtensionType();
+      EVT MemVT = LD->getMemoryVT();
+      EVT ElemVT = VT.getVectorElementType();
+      if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
+          (MemVT == MVT::i8 || MemVT == MVT::i16) && ElemVT != MemVT &&
+          LD->hasOneUse()) {
+        EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+                                        128 / ElemVT.getSizeInBits());
+        SDValue ScalarToVec =
+            DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
+        return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
+                               DCI.DAG.getConstant(0, DL, MVT::i64));
+      }
+    }
+
     // If the instruction is known to produce a scalar in SIMD registers, we can
     // duplicate it across the vector lanes using DUPLANE instead of moving it
     // to a GPR first. For example, this allows us to handle:
     //   v4i32 = DUP (i32 (FCMGT (f32, f32)))
-    SDValue Op = N->getOperand(0);
     // FIXME: Ideally, we should be able to handle all instructions that
     // produce a scalar value in FPRs.
     if (Op.getOpcode() == AArch64ISD::FCMEQ ||
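
The comment in the hunk shows the v4i32 case; the rewrite applies whenever an i8/i16 extending load feeds a DUP whose element type is wider than the memory type, including i64 elements. A hedged sketch of the i16-to-v2i64 case (my example, not from the commit; with the combine above this should select to an FPR load plus a lane dup):

#include <arm_neon.h>
#include <stdint.h>

// v2i64 = DUP (i64 (zextloadi16 addr)) becomes
//   v2i64 = SCALAR_TO_VECTOR (...)   ; should select to ldr h0, [x0]
//   v2i64 = DUPLANE64 (v2i64), 0     ; dup v0.2d, v0.d[0]
uint64x2_t broadcast_u16_to_u64x2(const uint16_t *p) {
  return vdupq_n_u64((uint64_t)*p);
}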

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 20 additions & 0 deletions
@@ -4375,6 +4375,26 @@ def : Pat <(v1i64 (scalar_to_vector (i64
                (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
 
+// Patterns for scalar_to_vector with zero-extended loads.
+// Enables direct SIMD register loads for small integer types (i8/i16) that are
+// naturally zero-extended to i32/i64.
+multiclass ScalarToVectorExtLoad<ValueType VecTy, ValueType ScalarTy> {
+  def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+  def : Pat<(VecTy (scalar_to_vector (ScalarTy (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+  def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+  def : Pat<(VecTy (scalar_to_vector (ScalarTy (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+}
+
+defm : ScalarToVectorExtLoad<v16i8, i32>;
+defm : ScalarToVectorExtLoad<v8i16, i32>;
+defm : ScalarToVectorExtLoad<v4i32, i32>;
+defm : ScalarToVectorExtLoad<v2i64, i64>;
+
 // Pre-fetch.
 defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
                   [(AArch64Prefetch timm:$Rt,
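
These patterns are sound because an FPR8/FPR16 load (ldr b0 / ldr h0) zeroes the remaining bits of the full vector register, so after SUBREG_TO_REG the 128-bit register already holds the zero-extended value in lane 0. A small, hedged self-check of that equivalence (my own harness, not part of the commit; assumes an AArch64 build such as clang -O2 --target=aarch64-linux-gnu):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

// Candidate for the new lowering: zero-extend a byte and broadcast it.
static uint32x4_t broadcast(const uint8_t *p) { return vdupq_n_u32(*p); }

int main(void) {
  uint8_t byte = 0xAB;
  uint32x4_t v = broadcast(&byte);
  // Every lane must observe the zero-extended value, which is exactly the
  // property that makes the bsub/hsub SUBREG_TO_REG patterns valid.
  assert(vgetq_lane_u32(v, 0) == 0xAB);
  assert(vgetq_lane_u32(v, 1) == 0xAB);
  assert(vgetq_lane_u32(v, 2) == 0xAB);
  assert(vgetq_lane_u32(v, 3) == 0xAB);
  return 0;
}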

llvm/test/CodeGen/AArch64/aarch64-smull.ll

Lines changed: 6 additions & 8 deletions
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
 define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    ldrh w8, [x0]
-; CHECK-NEON-NEXT:    ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT:    ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT:    ldr h0, [x0]
 ; CHECK-NEON-NEXT:    ldr d1, [x1]
-; CHECK-NEON-NEXT:    fmov d0, x8
-; CHECK-NEON-NEXT:    mov v0.d[1], x9
+; CHECK-NEON-NEXT:    mov v0.d[1], x8
 ; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    ldrh w8, [x0]
-; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT:    ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT:    ldr h0, [x0]
 ; CHECK-SVE-NEXT:    ldr d1, [x1]
-; CHECK-SVE-NEXT:    fmov d0, x8
-; CHECK-SVE-NEXT:    mov v0.d[1], x9
+; CHECK-SVE-NEXT:    mov v0.d[1], x8
 ; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-SVE-NEXT:    ret
Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i16, ptr %p, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
+  ret <2 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i16, ptr %p, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
+  ret <2 x i32> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i16
+  %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
+  %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %dup
+}
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i16
+  %vec = insertelement <4 x i16> undef, i16 %ext, i32 0
+  %dup = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> zeroinitializer
+  ret <4 x i16> %dup
+}
+
+; Test with offset addressing
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0, #4]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i8, ptr %p, i64 4
+  %load = load i8, ptr %addr, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, #8]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i16, ptr %p, i64 4
+  %load = load i16, ptr %addr, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+; Test with register offset addressing
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0, x1]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+  %load = load i8, ptr %addr, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+  %load = load i16, ptr %addr, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+; Negative test: sign-extended loads should not use this optimization
+define <4 x i32> @test_dup_sextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_sextload_i8_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsb w8, [x0]
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = sext i8 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+; Negative test: i32 loads don't need this optimization
+define <4 x i32> @test_dup_load_i32_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_load_i32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1r { v0.4s }, [x0]
+; CHECK-NEXT:    ret
+  %load = load i32, ptr %p, align 4
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+; Test that truncate(dup(zextload)) doesn't generate unnecessary XTN
+define <8 x i8> @test_truncate_dup_zextload_i8_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i8_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i16
+  %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
+  %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+  %trunc = trunc <8 x i16> %dup to <8 x i8>
+  ret <8 x i8> %trunc
+}
+
+; Test with i32 to i8 truncation
+define <8 x i8> @test_truncate_dup_zextload_i8_from_i32_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i8_from_i32_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  %trunc = trunc <4 x i32> %dup to <4 x i8>
+  ; Widen to v8i8 to match the test output
+  %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i8> %result
+}
+
+; Test with i16 load truncated to i8
+define <8 x i8> @test_truncate_dup_zextload_i16_to_i8_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i16_to_i8_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    ret
+  %load = load i16, ptr %p, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  %trunc = trunc <4 x i32> %dup to <4 x i8>
+  ; Widen to v8i8 to match the test output
+  %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i8> %result
+}
+
+; Test generalized truncate(dup(scalar_to_vector)) for non-load case
+define <8 x i8> @test_truncate_dup_scalar_i32_to_i8_v8i8(i32 %val) {
+; CHECK-LABEL: test_truncate_dup_scalar_i32_to_i8_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8b, w0
+; CHECK-NEXT:    ret
+  %vec = insertelement <4 x i32> undef, i32 %val, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+  %trunc = trunc <4 x i32> %dup to <4 x i8>
+  ; Widen to v8i8 to match the test output
+  %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i8> %result
+}
+
+; Test generalized truncate(dup(scalar_to_vector)) i16 to i8
+define <8 x i8> @test_truncate_dup_scalar_i16_to_i8_v8i8(i16 %val) {
+; CHECK-LABEL: test_truncate_dup_scalar_i16_to_i8_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8b, w0
+; CHECK-NEXT:    ret
+  %vec = insertelement <8 x i16> undef, i16 %val, i32 0
+  %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+  %trunc = trunc <8 x i16> %dup to <8 x i8>
+  ret <8 x i8> %trunc
+}
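
A note on the sign-extended negative test above: the combine accepts only EXTLOAD/ZEXTLOAD, and AArch64 has no sign-extending load into an FPR, so a signed broadcast keeps the GPR path. A hedged source-level counterpart (my example, not from the commit):

#include <arm_neon.h>
#include <stdint.h>

// Stays as ldrsb w8, [x0]; dup v0.4s, w8 -- no FPR load sign-extends,
// so loading through a GPR remains the best lowering here.
int32x4_t broadcast_s8(const int8_t *p) {
  return vdupq_n_s32(*p);
}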

llvm/test/CodeGen/AArch64/dup.ll

Lines changed: 6 additions & 6 deletions
@@ -32,8 +32,8 @@ entry:
 define <2 x i8> @loaddup_v2i8(ptr %p) {
 ; CHECK-LABEL: loaddup_v2i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    dup v0.2s, w8
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-NEXT:    ret
 entry:
   %a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
 define <4 x i8> @loaddup_v4i8(ptr %p) {
 ; CHECK-SD-LABEL: loaddup_v4i8:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldrb w8, [x0]
-; CHECK-SD-NEXT:    dup v0.4h, w8
+; CHECK-SD-NEXT:    ldr b0, [x0]
+; CHECK-SD-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
 define <2 x i16> @loaddup_v2i16(ptr %p) {
 ; CHECK-SD-LABEL: loaddup_v2i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldrh w8, [x0]
-; CHECK-SD-NEXT:    dup v0.2s, w8
+; CHECK-SD-NEXT:    ldr h0, [x0]
+; CHECK-SD-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: loaddup_v2i16:
