Skip to content

Commit ca57c4d

Browse files
committed
Implemented pabc32_set_alpha for 64-bit ARM
1 parent 7ad3c58 commit ca57c4d

File tree

5 files changed

+1170
-3294
lines changed

5 files changed

+1170
-3294
lines changed

include/private/dsp/arch/aarch64/asimd/graphics/pixelfmt.h

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,112 @@ namespace lsp
129129
"v20", "v21", "v22", "v23"
130130
);
131131
}
132-
}
133-
}
132+
133+
// Bit mask table for pabc32_set_alpha: every 32-bit word is 0x00ffffff, i.e.
// bits 0..23 set and bits 24..31 clear. The routine loads the two entries as
// one pair of 128-bit registers (ldp q18, q19) and uses them as the BIF mask.
// NOTE(review): assumes LSP_DSP_VEC4(x) expands to four copies of x, so the
// table is 2 x 16 bytes - confirm against the macro definition.
IF_ARCH_AARCH64(
134+
static const uint32_t pabc32_set_alpha_const[] __lsp_aligned16 =
135+
{
136+
LSP_DSP_VEC4(0x00ffffff),
137+
LSP_DSP_VEC4(0x00ffffff)
138+
};
139+
);
140+
141+
/**
 * Copy count 32-bit elements from src to dst, replacing bits 24..31 of every
 * element with alpha and keeping bits 0..23 of the source element.
 *
 * Elements are processed in blocks of 32, then 16, 8 and 4, and finally one
 * at a time, so any count is handled.
 *
 * @param dst destination buffer, 4 bytes are written per element
 * @param src source buffer, 4 bytes are read per element
 * @param alpha value stored into bits 24..31 of each output element
 * @param count number of 32-bit elements to process
 */
void pabc32_set_alpha(void *dst, const void *src, uint8_t alpha, size_t count)
142+
{
143+
IF_ARCH_AARCH64(
144+
uint32_t a = uint32_t(alpha) << 24; // alpha moved to bits 24..31 of a 32-bit word
145+
);
146+
147+
ARCH_AARCH64_ASM
148+
(
149+
__ASM_EMIT("ld1r {v16.4s}, [%[a]]") // v16 = a replicated into all four 32-bit lanes
150+
__ASM_EMIT("subs %[count], %[count], #32") // count -= 32, flags are consumed by b.lo below
151+
__ASM_EMIT("ldp q18, q19, [%[MASK]]") // v18, v19 = 0x00ffffff masks
152+
__ASM_EMIT("mov v17.16b, v16.16b") // v17 = second copy of the alpha vector
153+
__ASM_EMIT("b.lo 2f") // fewer than 32 elements: skip the main loop
154+
// 32x blocks: 8 vectors (0x80 bytes) per iteration.
// bif vd, vn, vm replaces the bits of vd where vm is 0 with the bits of vn,
// so bits 24..31 come from the alpha vector and bits 0..23 stay from the source.
155+
__ASM_EMIT("1:")
156+
__ASM_EMIT("ldp q0, q1, [%[src], #0x00]")
157+
__ASM_EMIT("ldp q2, q3, [%[src], #0x20]")
158+
__ASM_EMIT("bif v0.16b, v16.16b, v18.16b")
159+
__ASM_EMIT("bif v1.16b, v17.16b, v19.16b")
160+
__ASM_EMIT("ldp q4, q5, [%[src], #0x40]")
161+
__ASM_EMIT("bif v2.16b, v16.16b, v18.16b")
162+
__ASM_EMIT("bif v3.16b, v17.16b, v19.16b")
163+
__ASM_EMIT("ldp q6, q7, [%[src], #0x60]")
164+
__ASM_EMIT("stp q0, q1, [%[dst], #0x00]")
165+
__ASM_EMIT("bif v4.16b, v16.16b, v18.16b")
166+
__ASM_EMIT("bif v5.16b, v17.16b, v19.16b")
167+
__ASM_EMIT("stp q2, q3, [%[dst], #0x20]")
168+
__ASM_EMIT("bif v6.16b, v16.16b, v18.16b")
169+
__ASM_EMIT("bif v7.16b, v17.16b, v19.16b")
170+
__ASM_EMIT("stp q4, q5, [%[dst], #0x40]")
171+
__ASM_EMIT("stp q6, q7, [%[dst], #0x60]")
172+
__ASM_EMIT("add %[src], %[src], #0x80")
173+
__ASM_EMIT("subs %[count], %[count], #32")
174+
__ASM_EMIT("add %[dst], %[dst], #0x80") // add (not adds) keeps the flags of the subs above
175+
__ASM_EMIT("b.hs 1b") // repeat while at least 32 elements remained
176+
// 16x block: count holds (remaining - 32) here, so adding 16 gives
// (remaining - 16), which is negative when fewer than 16 elements remain.
177+
__ASM_EMIT("2:")
178+
__ASM_EMIT("adds %[count], %[count], #16")
179+
__ASM_EMIT("b.lt 4f")
180+
__ASM_EMIT("ldp q0, q1, [%[src], #0x00]")
181+
__ASM_EMIT("ldp q2, q3, [%[src], #0x20]")
182+
__ASM_EMIT("bif v0.16b, v16.16b, v18.16b")
183+
__ASM_EMIT("bif v1.16b, v17.16b, v19.16b")
184+
__ASM_EMIT("bif v2.16b, v16.16b, v18.16b")
185+
__ASM_EMIT("bif v3.16b, v17.16b, v19.16b")
186+
__ASM_EMIT("stp q0, q1, [%[dst], #0x00]")
187+
__ASM_EMIT("stp q2, q3, [%[dst], #0x20]")
188+
__ASM_EMIT("add %[src], %[src], #0x40")
189+
__ASM_EMIT("sub %[count], %[count], #16") // restore the bias: count = remaining - 16
190+
__ASM_EMIT("add %[dst], %[dst], #0x40")
191+
// 8x block: count holds (remaining - 16) on entry
192+
__ASM_EMIT("4:")
193+
__ASM_EMIT("adds %[count], %[count], #8")
194+
__ASM_EMIT("b.lt 6f")
195+
__ASM_EMIT("ldp q0, q1, [%[src], #0x00]")
196+
__ASM_EMIT("bif v0.16b, v16.16b, v18.16b")
197+
__ASM_EMIT("bif v1.16b, v17.16b, v19.16b")
198+
__ASM_EMIT("stp q0, q1, [%[dst], #0x00]")
199+
__ASM_EMIT("add %[src], %[src], #0x20")
200+
__ASM_EMIT("add %[dst], %[dst], #0x20")
201+
__ASM_EMIT("sub %[count], %[count], #8")
202+
// 4x block: count holds (remaining - 8) on entry
203+
__ASM_EMIT("6:")
204+
__ASM_EMIT("adds %[count], %[count], #4")
205+
__ASM_EMIT("b.lt 8f")
206+
__ASM_EMIT("ldr q0, [%[src], #0x00]")
207+
__ASM_EMIT("bif v0.16b, v16.16b, v18.16b")
208+
__ASM_EMIT("str q0, [%[dst], #0x00]")
209+
__ASM_EMIT("add %[src], %[src], #0x10")
210+
__ASM_EMIT("add %[dst], %[dst], #0x10")
211+
__ASM_EMIT("sub %[count], %[count], #4")
212+
// 1x blocks: count holds (remaining - 4) on entry, adding 3 gives
// (remaining - 1), which is negative when nothing is left.
213+
__ASM_EMIT("8:")
214+
__ASM_EMIT("adds %[count], %[count], #3")
215+
__ASM_EMIT("b.lt 10f")
216+
__ASM_EMIT("9:")
217+
__ASM_EMIT("ld1r {v0.4s}, [%[src]]") // load one 32-bit element into every lane
218+
__ASM_EMIT("bif v0.16b, v16.16b, v18.16b")
219+
__ASM_EMIT("subs %[count], %[count], #1")
220+
__ASM_EMIT("st1 {v0.s}[0], [%[dst]]") // store lane 0 only, i.e. a single element
221+
__ASM_EMIT("add %[src], %[src], #0x04")
222+
__ASM_EMIT("add %[dst], %[dst], #0x04")
223+
__ASM_EMIT("b.ge 9b") // flags still come from the subs above
224+
__ASM_EMIT("10:")
225+
226+
// Outputs: the pointers and the counter are modified by the code above
: [dst] "+r" (dst), [src] "+r" (src),
227+
[count] "+r" (count)
228+
// Inputs: both are addresses because the values are loaded from memory (ldp / ld1r)
: [MASK] "r" (&pabc32_set_alpha_const[0]),
229+
[a] "r" (&a)
230+
// Clobbers: condition flags, memory, and every vector register used above
: "cc", "memory",
231+
"v0", "v1", "v2", "v3",
232+
"v4", "v5", "v6", "v7",
233+
"v16", "v17", "v18", "v19"
234+
);
235+
}
236+
237+
} /* namespace asimd */
238+
} /* namespace lsp */
134239

135240
#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_GRAPHICS_PIXELFMT_H_ */

0 commit comments

Comments
 (0)