@@ -50,24 +50,34 @@ namespace H264Sharp
5050 ) {
5151
5252 unsigned char * buffer = dst;
53+ /*
54+ * buffer[yIndex++] = ((25 * b + 129 * g + 66 * r) >> 8) + 16;
55+ buffer[yIndex++] = ((25 * b1 + 129 * g1 + 66 * r1) >> 8) + 16;
5356
57+ buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
58+ buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
59+ */
5460 // SIMD constants for YUV conversion
55- const uint16x8_t kR_Y = vdupq_n_u16 (66 );
56- const uint16x8_t kG_Y = vdupq_n_u16 (129 );
5761 const uint16x8_t kB_Y = vdupq_n_u16 (25 );
58- const uint16x8_t kR_U = vdupq_n_u16 (112 );
62+ const uint16x8_t kG_Y = vdupq_n_u16 (129 );
63+ const uint16x8_t kR_Y = vdupq_n_u16 (66 );
64+ const uint16x8_t offset_Y = vdupq_n_u16 (16 );
65+
66+
67+ const int16x8_t kR_U = vdupq_n_s16 (112 );
5968 const int16x8_t kG_U = vdupq_n_s16 (-94 );
60- const int16x8_t kB_U = vdupq_n_s16 (-18 );
69+ const int16x8_t kB_U = vdupq_n_s16 (-18 );
70+
6171 const int16x8_t kR_V = vdupq_n_s16 (-38 );
6272 const int16x8_t kG_V = vdupq_n_s16 (-74 );
63- const uint16x8_t kB_V = vdupq_n_u16 (112 );
64- const uint16x8_t offset_Y = vdupq_n_u16 ( 16 );
65- const uint16x8_t offset_UV = vdupq_n_u16 (128 );
73+ const int16x8_t kB_V = vdupq_n_s16 (112 );
74+
75+ const int16x8_t offset_UV = vdupq_n_s16 (128 );
6676
6777 int index = 0 ;
6878 int yIndex = 0 ;
6979 int uIndex = width* height;
70- int vIndex = uIndex + width * height/ 4 ;
80+ int vIndex = uIndex + (uIndex>> 2 ) ;
7181 // Loop over the specified range of rows
7282 for (int row = begin; row < end; row+=2 ) {
7383 // first row includes UV
@@ -88,7 +98,7 @@ namespace H264Sharp
8898 g = pixels.val [G_INDEX];
8999 b = pixels.val [B_INDEX];
90100 }
91-
101+
92102
93103 // Widen to 16 bits unsigned
94104 uint16x8_t r_low = vmovl_u8 (vget_low_u8 (r));
@@ -99,7 +109,7 @@ namespace H264Sharp
99109 uint16x8_t b_high = vmovl_u8 (vget_high_u8 (b));
100110
101111 // Y Channel (Unsigned because we can overflow)
102- uint16x8_t y_low = vaddq_u16 (vaddq_u16 (vmulq_u16 (r_low, kR_Y ), vmulq_u16 (g_low, kG_Y )), vmulq_u16 (b_low, kB_Y ));
112+ uint16x8_t y_low = vaddq_u16 (vaddq_u16 (vmulq_u16 (r_low, kR_Y ), vmulq_u16 (g_low, kG_Y )), vmulq_u16 (b_low, kB_Y ));
103113 uint16x8_t y_high = vaddq_u16 (vaddq_u16 (vmulq_u16 (r_high, kR_Y ), vmulq_u16 (g_high, kG_Y )), vmulq_u16 (b_high, kB_Y ));
104114 // div 256(shift 8) + offset
105115 y_low = vshrq_n_u16 (y_low, 8 );
@@ -108,6 +118,7 @@ namespace H264Sharp
108118 y_high = vaddq_u16 (y_high, offset_Y);
109119
110120 // shrink combine store
121+
111122 vst1q_u8 (&buffer[yIndex], vcombine_u8 (vqmovn_u16 (y_low), vqmovn_u16 (y_high)));
112123 yIndex += 16 ;
113124
@@ -118,13 +129,11 @@ namespace H264Sharp
118129
119130 // Compute U channel (average over 2x2 blocks)
120131 int16x8_t u = vaddq_s16 (vaddq_s16 (vmulq_s16 (r_low_signed, kR_U ), vmulq_s16 (g_low_signed, kG_U )), vmulq_s16 (b_low_signed, kB_U ));
121- u = vshrq_n_s16 (u, 8 );
122- u = vaddq_s16 (u, offset_UV);
132+ u = vaddq_s16 (vshrq_n_s16 (u, 8 ), offset_UV);
123133
124134 // Compute V channel
125- int16x8_t v = vaddq_s16 (vaddq_s16 (vmulq_s16 (r_low_signed, kR_V ), vmulq_s16 (g_low_signed, kG_V )), vmulq_s16 (b_low_signed, kB_V ));
126- v = vshrq_n_s16 (v, 8 );
127- v = vaddq_s16 (v, offset_UV);
135+ int16x8_t v = vaddq_s16 (vaddq_s16 (vmulq_s16 (b_low_signed, kB_V ), vmulq_s16 (g_low_signed, kG_V )), vmulq_s16 (r_low_signed, kR_V ));
136+ v = vaddq_s16 (vshrq_n_s16 (v, 8 ), offset_UV);
128137
129138 // Store U and V
130139 vst1_u8 (&buffer[uIndex], vqmovun_s16 (u));
@@ -136,12 +145,11 @@ namespace H264Sharp
136145 }
137146
138147 // add stride offset here..
139- // index += row* stride - index ;
148+ index = stride * (row+ 1 ) ;
140149 // second row only Y
141150 for (int i = 0 ; i < width; i += 16 ) {
142151 // Load 16 pixels (48 bytes) from src
143152 uint8x16_t r, g, b;
144-
145153 if constexpr (NUM_CH == 4 ) {
146154 uint8x16x4_t pixels = vld4q_u8 (&src[index]);
147155
@@ -157,26 +165,33 @@ namespace H264Sharp
157165 b = pixels.val [B_INDEX];
158166 }
159167
168+
169+ // Widen to 16 bits unsigned
160170 uint16x8_t r_low = vmovl_u8 (vget_low_u8 (r));
161171 uint16x8_t r_high = vmovl_u8 (vget_high_u8 (r));
162172 uint16x8_t g_low = vmovl_u8 (vget_low_u8 (g));
163173 uint16x8_t g_high = vmovl_u8 (vget_high_u8 (g));
164174 uint16x8_t b_low = vmovl_u8 (vget_low_u8 (b));
165175 uint16x8_t b_high = vmovl_u8 (vget_high_u8 (b));
166176
167- // Y Channel (Unsigned because we can overflow the signed )
177+ // Y Channel (Unsigned because we can overflow)
168178 uint16x8_t y_low = vaddq_u16 (vaddq_u16 (vmulq_u16 (r_low, kR_Y ), vmulq_u16 (g_low, kG_Y )), vmulq_u16 (b_low, kB_Y ));
169179 uint16x8_t y_high = vaddq_u16 (vaddq_u16 (vmulq_u16 (r_high, kR_Y ), vmulq_u16 (g_high, kG_Y )), vmulq_u16 (b_high, kB_Y ));
180+ // div 256(shift 8) + offset
170181 y_low = vshrq_n_u16 (y_low, 8 );
171182 y_high = vshrq_n_u16 (y_high, 8 );
172183 y_low = vaddq_u16 (y_low, offset_Y);
173184 y_high = vaddq_u16 (y_high, offset_Y);
174185
186+ // shrink combine store
175187 vst1q_u8 (&buffer[yIndex], vcombine_u8 (vqmovn_u16 (y_low), vqmovn_u16 (y_high)));
176188 yIndex += 16 ;
177189
178190 index += NUM_CH * 16 ;
179191 }
192+
193+ index = stride * (row + 2 );
194+
180195 }
181196 }
182197
0 commit comments