|
1 | 1 |
|
2 | | -/* filter_neon.S - NEON optimised filter functions |
| 2 | +/* filter_neon.S - placeholder file |
3 | 3 | * |
4 | | - * Copyright (c) 2018 Cosmin Truta |
5 | | - * Copyright (c) 2014,2017 Glenn Randers-Pehrson |
6 | | - * Written by Mans Rullgard, 2011. |
| 4 | + * Copyright (c) 2024 Cosmin Truta |
7 | 5 | * |
8 | 6 | * This code is released under the libpng license. |
9 | 7 | * For conditions of distribution and use, see the disclaimer |
10 | 8 | * and license in png.h |
11 | 9 | */ |
12 | 10 |
|
| 11 | +/* IMPORTANT NOTE: |
| 12 | + * |
| 13 | + * Historically, the hand-coded assembler implementation of Neon optimizations |
| 14 | + * in this module had not been in sync with the intrinsics-based implementation |
| 15 | + * in filter_neon_intrinsics.c and palette_neon_intrinsics.c, at least since |
| 16 | + * the introduction of riffled palette optimizations. Moreover, the assembler |
| 17 | + * code used to work on 32-bit ARM only, and it caused problems, even if empty, |
| 18 | + * on 64-bit ARM. |
| 19 | + * |
| 20 | + * All references to this module from our internal build scripts and projects |
| 21 | + * have been removed. |
| 22 | + * |
| 23 | + * For the external projects that might still expect this module to be present, |
| 24 | + * we leave this stub in place, for the remaining lifetime of libpng-1.6.x. |
| 25 | + * Everything should continue to function normally, as long as there are no |
| 26 | + * deliberate attempts to use the old hand-made assembler code. A build error |
| 27 | + * will be raised otherwise (see the #error directives below). |
| 28 | + */ |
| 29 | + |
13 | 30 | /* Including pngpriv.h is required to get the symbol renames, which are #defines, and the |
14 | 31 | * definitions (or not) of PNG_ARM_NEON_OPT and PNG_ARM_NEON_IMPLEMENTATION. |
15 | 32 | */ |
16 | 33 | #define PNG_VERSION_INFO_ONLY |
17 | 34 | #include "../pngpriv.h" |
18 | 35 |
|
19 | | -#if (defined(__linux__) || defined(__FreeBSD__)) && defined(__ELF__) |
20 | | -.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
21 | | -#endif |
22 | | - |
23 | 36 | #ifdef PNG_READ_SUPPORTED |
24 | | - |
25 | | -/* Assembler NEON support - only works for 32-bit ARM (i.e. it does not work for |
26 | | - * ARM64). The code in arm/filter_neon_intrinsics.c supports ARM64, however it |
27 | | - * only works if -mfpu=neon is specified on the GCC command line. See pngpriv.h |
28 | | - * for the logic which sets PNG_USE_ARM_NEON_ASM: |
29 | | - */ |
30 | 37 | #if PNG_ARM_NEON_IMPLEMENTATION == 2 /* hand-coded assembler */ |
31 | | - |
32 | 38 | #if PNG_ARM_NEON_OPT > 0 |
33 | 39 |
|
34 | | -#ifdef __ELF__ |
35 | | -# define ELF |
| 40 | +#if defined(__clang__) |
| 41 | +#define GNUC_VERSION 0 /* not gcc, although it might pretend to be */ |
| 42 | +#elif defined(__GNUC__) |
| 43 | +#define GNUC_MAJOR (__GNUC__ + 0) |
| 44 | +#define GNUC_MINOR (__GNUC_MINOR__ + 0) |
| 45 | +#define GNUC_PATCHLEVEL (__GNUC_PATCHLEVEL__ + 0) |
| 46 | +#define GNUC_VERSION (GNUC_MAJOR * 10000 + GNUC_MINOR * 100 + GNUC_PATCHLEVEL) |
36 | 47 | #else |
37 | | -# define ELF @ |
| 48 | +#define GNUC_VERSION 0 /* not gcc */ |
38 | 49 | #endif |
39 | 50 |
|
40 | | - .arch armv7-a |
41 | | - .fpu neon |
42 | | - |
43 | | -.macro func name, export=0 |
44 | | - .macro endfunc |
45 | | -ELF .size \name, . - \name |
46 | | - .endfunc |
47 | | - .purgem endfunc |
48 | | - .endm |
49 | | - .text |
50 | | - |
51 | | - /* Explicitly specifying alignment here because some versions of |
52 | | - * GAS don't align code correctly. This is harmless in correctly |
53 | | - * written versions of GAS. |
54 | | - */ |
55 | | - .align 2 |
56 | | - |
57 | | - .if \export |
58 | | - .global \name |
59 | | - .endif |
60 | | -ELF .type \name, STT_FUNC |
61 | | - .func \name |
62 | | -\name: |
63 | | -.endm |
64 | | - |
65 | | -func png_read_filter_row_sub4_neon, export=1 |
66 | | - ldr r3, [r0, #4] @ rowbytes |
67 | | - vmov.i8 d3, #0 |
68 | | -1: |
69 | | - vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] |
70 | | - vadd.u8 d0, d3, d4 |
71 | | - vadd.u8 d1, d0, d5 |
72 | | - vadd.u8 d2, d1, d6 |
73 | | - vadd.u8 d3, d2, d7 |
74 | | - vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! |
75 | | - subs r3, r3, #16 |
76 | | - bgt 1b |
77 | | - |
78 | | - bx lr |
79 | | -endfunc |
80 | | - |
81 | | -func png_read_filter_row_sub3_neon, export=1 |
82 | | - ldr r3, [r0, #4] @ rowbytes |
83 | | - vmov.i8 d3, #0 |
84 | | - mov r0, r1 |
85 | | - mov r2, #3 |
86 | | - mov r12, #12 |
87 | | - vld1.8 {q11}, [r0], r12 |
88 | | -1: |
89 | | - vext.8 d5, d22, d23, #3 |
90 | | - vadd.u8 d0, d3, d22 |
91 | | - vext.8 d6, d22, d23, #6 |
92 | | - vadd.u8 d1, d0, d5 |
93 | | - vext.8 d7, d23, d23, #1 |
94 | | - vld1.8 {q11}, [r0], r12 |
95 | | - vst1.32 {d0[0]}, [r1,:32], r2 |
96 | | - vadd.u8 d2, d1, d6 |
97 | | - vst1.32 {d1[0]}, [r1], r2 |
98 | | - vadd.u8 d3, d2, d7 |
99 | | - vst1.32 {d2[0]}, [r1], r2 |
100 | | - vst1.32 {d3[0]}, [r1], r2 |
101 | | - subs r3, r3, #12 |
102 | | - bgt 1b |
103 | | - |
104 | | - bx lr |
105 | | -endfunc |
106 | | - |
107 | | -func png_read_filter_row_up_neon, export=1 |
108 | | - ldr r3, [r0, #4] @ rowbytes |
109 | | -1: |
110 | | - vld1.8 {q0}, [r1,:128] |
111 | | - vld1.8 {q1}, [r2,:128]! |
112 | | - vadd.u8 q0, q0, q1 |
113 | | - vst1.8 {q0}, [r1,:128]! |
114 | | - subs r3, r3, #16 |
115 | | - bgt 1b |
116 | | - |
117 | | - bx lr |
118 | | -endfunc |
119 | | - |
120 | | -func png_read_filter_row_avg4_neon, export=1 |
121 | | - ldr r12, [r0, #4] @ rowbytes |
122 | | - vmov.i8 d3, #0 |
123 | | -1: |
124 | | - vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] |
125 | | - vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]! |
126 | | - vhadd.u8 d0, d3, d16 |
127 | | - vadd.u8 d0, d0, d4 |
128 | | - vhadd.u8 d1, d0, d17 |
129 | | - vadd.u8 d1, d1, d5 |
130 | | - vhadd.u8 d2, d1, d18 |
131 | | - vadd.u8 d2, d2, d6 |
132 | | - vhadd.u8 d3, d2, d19 |
133 | | - vadd.u8 d3, d3, d7 |
134 | | - vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! |
135 | | - subs r12, r12, #16 |
136 | | - bgt 1b |
137 | | - |
138 | | - bx lr |
139 | | -endfunc |
140 | | - |
141 | | -func png_read_filter_row_avg3_neon, export=1 |
142 | | - push {r4,lr} |
143 | | - ldr r12, [r0, #4] @ rowbytes |
144 | | - vmov.i8 d3, #0 |
145 | | - mov r0, r1 |
146 | | - mov r4, #3 |
147 | | - mov lr, #12 |
148 | | - vld1.8 {q11}, [r0], lr |
149 | | -1: |
150 | | - vld1.8 {q10}, [r2], lr |
151 | | - vext.8 d5, d22, d23, #3 |
152 | | - vhadd.u8 d0, d3, d20 |
153 | | - vext.8 d17, d20, d21, #3 |
154 | | - vadd.u8 d0, d0, d22 |
155 | | - vext.8 d6, d22, d23, #6 |
156 | | - vhadd.u8 d1, d0, d17 |
157 | | - vext.8 d18, d20, d21, #6 |
158 | | - vadd.u8 d1, d1, d5 |
159 | | - vext.8 d7, d23, d23, #1 |
160 | | - vld1.8 {q11}, [r0], lr |
161 | | - vst1.32 {d0[0]}, [r1,:32], r4 |
162 | | - vhadd.u8 d2, d1, d18 |
163 | | - vst1.32 {d1[0]}, [r1], r4 |
164 | | - vext.8 d19, d21, d21, #1 |
165 | | - vadd.u8 d2, d2, d6 |
166 | | - vhadd.u8 d3, d2, d19 |
167 | | - vst1.32 {d2[0]}, [r1], r4 |
168 | | - vadd.u8 d3, d3, d7 |
169 | | - vst1.32 {d3[0]}, [r1], r4 |
170 | | - subs r12, r12, #12 |
171 | | - bgt 1b |
172 | | - |
173 | | - pop {r4,pc} |
174 | | -endfunc |
175 | | - |
176 | | -.macro paeth rx, ra, rb, rc |
177 | | - vaddl.u8 q12, \ra, \rb @ a + b |
178 | | - vaddl.u8 q15, \rc, \rc @ 2*c |
179 | | - vabdl.u8 q13, \rb, \rc @ pa |
180 | | - vabdl.u8 q14, \ra, \rc @ pb |
181 | | - vabd.u16 q15, q12, q15 @ pc |
182 | | - vcle.u16 q12, q13, q14 @ pa <= pb |
183 | | - vcle.u16 q13, q13, q15 @ pa <= pc |
184 | | - vcle.u16 q14, q14, q15 @ pb <= pc |
185 | | - vand q12, q12, q13 @ pa <= pb && pa <= pc |
186 | | - vmovn.u16 d28, q14 |
187 | | - vmovn.u16 \rx, q12 |
188 | | - vbsl d28, \rb, \rc |
189 | | - vbsl \rx, \ra, d28 |
190 | | -.endm |
191 | | - |
192 | | -func png_read_filter_row_paeth4_neon, export=1 |
193 | | - ldr r12, [r0, #4] @ rowbytes |
194 | | - vmov.i8 d3, #0 |
195 | | - vmov.i8 d20, #0 |
196 | | -1: |
197 | | - vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] |
198 | | - vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]! |
199 | | - paeth d0, d3, d16, d20 |
200 | | - vadd.u8 d0, d0, d4 |
201 | | - paeth d1, d0, d17, d16 |
202 | | - vadd.u8 d1, d1, d5 |
203 | | - paeth d2, d1, d18, d17 |
204 | | - vadd.u8 d2, d2, d6 |
205 | | - paeth d3, d2, d19, d18 |
206 | | - vmov d20, d19 |
207 | | - vadd.u8 d3, d3, d7 |
208 | | - vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! |
209 | | - subs r12, r12, #16 |
210 | | - bgt 1b |
211 | | - |
212 | | - bx lr |
213 | | -endfunc |
214 | | - |
215 | | -func png_read_filter_row_paeth3_neon, export=1 |
216 | | - push {r4,lr} |
217 | | - ldr r12, [r0, #4] @ rowbytes |
218 | | - vmov.i8 d3, #0 |
219 | | - vmov.i8 d4, #0 |
220 | | - mov r0, r1 |
221 | | - mov r4, #3 |
222 | | - mov lr, #12 |
223 | | - vld1.8 {q11}, [r0], lr |
224 | | -1: |
225 | | - vld1.8 {q10}, [r2], lr |
226 | | - paeth d0, d3, d20, d4 |
227 | | - vext.8 d5, d22, d23, #3 |
228 | | - vadd.u8 d0, d0, d22 |
229 | | - vext.8 d17, d20, d21, #3 |
230 | | - paeth d1, d0, d17, d20 |
231 | | - vst1.32 {d0[0]}, [r1,:32], r4 |
232 | | - vext.8 d6, d22, d23, #6 |
233 | | - vadd.u8 d1, d1, d5 |
234 | | - vext.8 d18, d20, d21, #6 |
235 | | - paeth d2, d1, d18, d17 |
236 | | - vext.8 d7, d23, d23, #1 |
237 | | - vld1.8 {q11}, [r0], lr |
238 | | - vst1.32 {d1[0]}, [r1], r4 |
239 | | - vadd.u8 d2, d2, d6 |
240 | | - vext.8 d19, d21, d21, #1 |
241 | | - paeth d3, d2, d19, d18 |
242 | | - vst1.32 {d2[0]}, [r1], r4 |
243 | | - vmov d4, d19 |
244 | | - vadd.u8 d3, d3, d7 |
245 | | - vst1.32 {d3[0]}, [r1], r4 |
246 | | - subs r12, r12, #12 |
247 | | - bgt 1b |
| 51 | +#if (GNUC_VERSION > 0) && (GNUC_VERSION < 40300) |
| 52 | +#error "PNG_ARM_NEON is not supported with gcc versions earlier than 4.3.0" |
| 53 | +#elif GNUC_VERSION == 40504 |
| 54 | +#error "PNG_ARM_NEON is not supported with gcc version 4.5.4" |
| 55 | +#else |
| 56 | +#error "Please use 'arm/*_neon_intrinsics.c' for PNG_ARM_NEON support" |
| 57 | +#endif |
248 | 58 |
|
249 | | - pop {r4,pc} |
250 | | -endfunc |
251 | 59 | #endif /* PNG_ARM_NEON_OPT > 0 */ |
252 | | -#endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 (assembler) */ |
| 60 | +#endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 */ |
253 | 61 | #endif /* READ */ |
0 commit comments