@@ -129,7 +129,112 @@ namespace lsp
129
129
" v20" , " v21" , " v22" , " v23"
130
130
);
131
131
}
132
- }
133
- }
132
+
133
+ IF_ARCH_AARCH64 (
134
+ static const uint32_t pabc32_set_alpha_const[] __lsp_aligned16 =
135
+ {
136
+ LSP_DSP_VEC4 (0x00ffffff ),
137
+ LSP_DSP_VEC4 (0x00ffffff )
138
+ };
139
+ );
140
+
141
+ void pabc32_set_alpha (void *dst, const void *src, uint8_t alpha, size_t count)
142
+ {
143
+ IF_ARCH_AARCH64 (
144
+ uint32_t a = uint32_t (alpha) << 24 ;
145
+ );
146
+
147
+ ARCH_AARCH64_ASM
148
+ (
149
+ __ASM_EMIT (" ld1r {v16.4s}, [%[a]]" )
150
+ __ASM_EMIT (" subs %[count], %[count], #32" )
151
+ __ASM_EMIT (" ldp q18, q19, [%[MASK]]" )
152
+ __ASM_EMIT (" mov v17.16b, v16.16b" )
153
+ __ASM_EMIT (" b.lo 2f" )
154
+ // 32x blocks
155
+ __ASM_EMIT (" 1:" )
156
+ __ASM_EMIT (" ldp q0, q1, [%[src], #0x00]" )
157
+ __ASM_EMIT (" ldp q2, q3, [%[src], #0x20]" )
158
+ __ASM_EMIT (" bif v0.16b, v16.16b, v18.16b" )
159
+ __ASM_EMIT (" bif v1.16b, v17.16b, v19.16b" )
160
+ __ASM_EMIT (" ldp q4, q5, [%[src], #0x40]" )
161
+ __ASM_EMIT (" bif v2.16b, v16.16b, v18.16b" )
162
+ __ASM_EMIT (" bif v3.16b, v17.16b, v19.16b" )
163
+ __ASM_EMIT (" ldp q6, q7, [%[src], #0x60]" )
164
+ __ASM_EMIT (" stp q0, q1, [%[dst], #0x00]" )
165
+ __ASM_EMIT (" bif v4.16b, v16.16b, v18.16b" )
166
+ __ASM_EMIT (" bif v5.16b, v17.16b, v19.16b" )
167
+ __ASM_EMIT (" stp q2, q3, [%[dst], #0x20]" )
168
+ __ASM_EMIT (" bif v6.16b, v16.16b, v18.16b" )
169
+ __ASM_EMIT (" bif v7.16b, v17.16b, v19.16b" )
170
+ __ASM_EMIT (" stp q4, q5, [%[dst], #0x40]" )
171
+ __ASM_EMIT (" stp q6, q7, [%[dst], #0x60]" )
172
+ __ASM_EMIT (" add %[src], %[src], #0x80" )
173
+ __ASM_EMIT (" subs %[count], %[count], #32" )
174
+ __ASM_EMIT (" add %[dst], %[dst], #0x80" )
175
+ __ASM_EMIT (" b.hs 1b" )
176
+ // 16x block
177
+ __ASM_EMIT (" 2:" )
178
+ __ASM_EMIT (" adds %[count], %[count], #16" )
179
+ __ASM_EMIT (" b.lt 4f" )
180
+ __ASM_EMIT (" ldp q0, q1, [%[src], #0x00]" )
181
+ __ASM_EMIT (" ldp q2, q3, [%[src], #0x20]" )
182
+ __ASM_EMIT (" bif v0.16b, v16.16b, v18.16b" )
183
+ __ASM_EMIT (" bif v1.16b, v17.16b, v19.16b" )
184
+ __ASM_EMIT (" bif v2.16b, v16.16b, v18.16b" )
185
+ __ASM_EMIT (" bif v3.16b, v17.16b, v19.16b" )
186
+ __ASM_EMIT (" stp q0, q1, [%[dst], #0x00]" )
187
+ __ASM_EMIT (" stp q2, q3, [%[dst], #0x20]" )
188
+ __ASM_EMIT (" add %[src], %[src], #0x40" )
189
+ __ASM_EMIT (" sub %[count], %[count], #16" )
190
+ __ASM_EMIT (" add %[dst], %[dst], #0x40" )
191
+ // 8x block
192
+ __ASM_EMIT (" 4:" )
193
+ __ASM_EMIT (" adds %[count], %[count], #8" )
194
+ __ASM_EMIT (" b.lt 6f" )
195
+ __ASM_EMIT (" ldp q0, q1, [%[src], #0x00]" )
196
+ __ASM_EMIT (" bif v0.16b, v16.16b, v18.16b" )
197
+ __ASM_EMIT (" bif v1.16b, v17.16b, v19.16b" )
198
+ __ASM_EMIT (" stp q0, q1, [%[dst], #0x00]" )
199
+ __ASM_EMIT (" add %[src], %[src], #0x20" )
200
+ __ASM_EMIT (" add %[dst], %[dst], #0x20" )
201
+ __ASM_EMIT (" sub %[count], %[count], #8" )
202
+ // 4x block
203
+ __ASM_EMIT (" 6:" )
204
+ __ASM_EMIT (" adds %[count], %[count], #4" )
205
+ __ASM_EMIT (" b.lt 8f" )
206
+ __ASM_EMIT (" ldr q0, [%[src], #0x00]" )
207
+ __ASM_EMIT (" bif v0.16b, v16.16b, v18.16b" )
208
+ __ASM_EMIT (" str q0, [%[dst], #0x00]" )
209
+ __ASM_EMIT (" add %[src], %[src], #0x10" )
210
+ __ASM_EMIT (" add %[dst], %[dst], #0x10" )
211
+ __ASM_EMIT (" sub %[count], %[count], #4" )
212
+ // 1x blocks
213
+ __ASM_EMIT (" 8:" )
214
+ __ASM_EMIT (" adds %[count], %[count], #3" )
215
+ __ASM_EMIT (" b.lt 10f" )
216
+ __ASM_EMIT (" 9:" )
217
+ __ASM_EMIT (" ld1r {v0.4s}, [%[src]]" )
218
+ __ASM_EMIT (" bif v0.16b, v16.16b, v18.16b" )
219
+ __ASM_EMIT (" subs %[count], %[count], #1" )
220
+ __ASM_EMIT (" st1 {v0.s}[0], [%[dst]]" )
221
+ __ASM_EMIT (" add %[src], %[src], #0x04" )
222
+ __ASM_EMIT (" add %[dst], %[dst], #0x04" )
223
+ __ASM_EMIT (" b.ge 9b" )
224
+ __ASM_EMIT (" 10:" )
225
+
226
+ : [dst] " +r" (dst), [src] " +r" (src),
227
+ [count] " +r" (count)
228
+ : [MASK] " r" (&pabc32_set_alpha_const[0 ]),
229
+ [a] " r" (&a)
230
+ : " cc" , " memory" ,
231
+ " v0" , " v1" , " v2" , " v3" ,
232
+ " v4" , " v5" , " v6" , " v7" ,
233
+ " v16" , " v17" , " v18" , " v19"
234
+ );
235
+ }
236
+
237
+ } /* namespace asimd */
238
+ } /* namespace lsp */
134
239
135
240
#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_GRAPHICS_PIXELFMT_H_ */
0 commit comments