11/* palette_neon_intrinsics.c - NEON optimised palette expansion functions
22 *
3- * Copyright (c) 2018-2019 Cosmin Truta
3+ * Copyright (c) 2018-2026 Cosmin Truta
44 * Copyright (c) 2017-2018 Arm Holdings. All rights reserved.
55 * Written by Richard Townsend <Richard.Townsend@arm.com>, February 2017.
66 *
@@ -49,12 +49,12 @@ png_riffle_palette_neon(png_structrp png_ptr)
4949 w .val [0 ] = v .val [0 ];
5050 w .val [1 ] = v .val [1 ];
5151 w .val [2 ] = v .val [2 ];
52- vst4q_u8 (riffled_palette + ( i << 2 ) , w );
52+ vst4q_u8 (riffled_palette + i * 4 , w );
5353 }
5454
5555 /* Fix up the missing transparency values. */
5656 for (i = 0 ; i < num_trans ; i ++ )
57- riffled_palette [( i << 2 ) + 3 ] = trans_alpha [i ];
57+ riffled_palette [i * 4 + 3 ] = trans_alpha [i ];
5858}
5959
6060/* Expands a palettized row into RGBA8. */
@@ -78,27 +78,26 @@ png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
7878 * The NEON part writes forward from a given position, so we have
7979 * to seek this back by 4 pixels x 4 bytes.
8080 */
81- * ddp = * ddp - (( pixels_per_chunk * sizeof ( png_uint_32 )) - 1 );
81+ * ddp = * ddp - (pixels_per_chunk * 4 - 1 );
8282
83- for (i = 0 ; i < row_width ; i += pixels_per_chunk )
83+ for (i = 0 ; i + pixels_per_chunk <= row_width ; i += pixels_per_chunk )
8484 {
8585 uint32x4_t cur ;
86- png_bytep sp = * ssp - i , dp = * ddp - ( i << 2 ) ;
86+ png_bytep sp = * ssp - i , dp = * ddp - i * 4 ;
8787 cur = vld1q_dup_u32 (riffled_palette + * (sp - 3 ));
8888 cur = vld1q_lane_u32 (riffled_palette + * (sp - 2 ), cur , 1 );
8989 cur = vld1q_lane_u32 (riffled_palette + * (sp - 1 ), cur , 2 );
9090 cur = vld1q_lane_u32 (riffled_palette + * (sp - 0 ), cur , 3 );
9191 vst1q_u32 ((void * )dp , cur );
9292 }
93- if (i != row_width )
94- {
95- /* Remove the amount that wasn't processed. */
96- i -= pixels_per_chunk ;
97- }
9893
99- /* Decrement output pointers. */
94+ /* Undo the pre-adjustment of *ddp before the pointer handoff,
95+ * so the scalar fallback in pngrtran.c receives a dp that points
96+ * to the correct position.
97+ */
98+ * ddp = * ddp + (pixels_per_chunk * 4 - 1 );
10099 * ssp = * ssp - i ;
101- * ddp = * ddp - ( i << 2 ) ;
100+ * ddp = * ddp - i * 4 ;
102101 return i ;
103102}
104103
@@ -119,32 +118,30 @@ png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
119118 return 0 ;
120119
121120 /* Seeking this back by 8 pixels x 3 bytes. */
122- * ddp = * ddp - (( pixels_per_chunk * sizeof ( png_color )) - 1 );
121+ * ddp = * ddp - (pixels_per_chunk * 3 - 1 );
123122
124- for (i = 0 ; i < row_width ; i += pixels_per_chunk )
123+ for (i = 0 ; i + pixels_per_chunk <= row_width ; i += pixels_per_chunk )
125124 {
126125 uint8x8x3_t cur ;
127- png_bytep sp = * ssp - i , dp = * ddp - (( i << 1 ) + i ) ;
128- cur = vld3_dup_u8 (palette + sizeof ( png_color ) * ( * ( sp - 7 )) );
129- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 6 )) , cur , 1 );
130- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 5 )) , cur , 2 );
131- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 4 )) , cur , 3 );
132- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 3 )) , cur , 4 );
133- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 2 )) , cur , 5 );
134- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 1 )) , cur , 6 );
135- cur = vld3_lane_u8 (palette + sizeof ( png_color ) * ( * ( sp - 0 )) , cur , 7 );
126+ png_bytep sp = * ssp - i , dp = * ddp - i * 3 ;
127+ cur = vld3_dup_u8 (palette + * ( sp - 7 ) * 3 );
128+ cur = vld3_lane_u8 (palette + * ( sp - 6 ) * 3 , cur , 1 );
129+ cur = vld3_lane_u8 (palette + * ( sp - 5 ) * 3 , cur , 2 );
130+ cur = vld3_lane_u8 (palette + * ( sp - 4 ) * 3 , cur , 3 );
131+ cur = vld3_lane_u8 (palette + * ( sp - 3 ) * 3 , cur , 4 );
132+ cur = vld3_lane_u8 (palette + * ( sp - 2 ) * 3 , cur , 5 );
133+ cur = vld3_lane_u8 (palette + * ( sp - 1 ) * 3 , cur , 6 );
134+ cur = vld3_lane_u8 (palette + * ( sp - 0 ) * 3 , cur , 7 );
136135 vst3_u8 ((void * )dp , cur );
137136 }
138137
139- if (i != row_width )
140- {
141- /* Remove the amount that wasn't processed. */
142- i -= pixels_per_chunk ;
143- }
144-
145- /* Decrement output pointers. */
138+ /* Undo the pre-adjustment of *ddp before the pointer handoff,
139+ * so the scalar fallback in pngrtran.c receives a dp that points
140+ * to the correct position.
141+ */
142+ * ddp = * ddp + (pixels_per_chunk * 3 - 1 );
146143 * ssp = * ssp - i ;
147- * ddp = * ddp - (( i << 1 ) + i ) ;
144+ * ddp = * ddp - i * 3 ;
148145 return i ;
149146}
150147
0 commit comments