|
| 1 | +#include "Rgb2Yuv.h" |
| 2 | +#include <arm_neon.h> |
| 3 | +#include <cstdint> |
| 4 | + |
| 5 | +namespace H264Sharp |
| 6 | +{ |
    // Converts interleaved BGRA (4 bytes/pixel) into planar YUV420 (I420) at dst.
    // dst layout: [Y: width*height][U: width*height/4][V: width*height/4].
    // NOTE(review): threadCount is currently ignored — the whole image is
    // converted in one call (rows 0..height). Also assumes the template helpers
    // below are declared before this point (e.g. in Rgb2Yuv.h) — TODO confirm.
    void Rgb2Yuv::BGRAtoYUV420PlanarNeon(const unsigned char* bgra, unsigned char* dst, int width, int height, int stride, int threadCount)
    {
        // vld4 lane order for BGRA bytes: val[0]=B, val[1]=G, val[2]=R.
        RGBA2YUVP_ParallelBody_SIMD<2,1,0>(bgra, dst, width, height, stride, 0, height);
    }
| 11 | + void Rgb2Yuv::BGRtoYUV420PlanarNeon(unsigned char* bgr, unsigned char* dst, int width, int height, int stride, int threadCount) |
| 12 | + { |
| 13 | + RGBA2YUVP_ParallelBody_SIMD<2,1,0>(bgr, dst, width, height, stride, 0, height); |
| 14 | + } |
    // Converts interleaved RGBA (4 bytes/pixel) into planar YUV420 (I420) at dst.
    // dst layout: [Y: width*height][U: width*height/4][V: width*height/4].
    // NOTE(review): threadCount is currently ignored — the whole image is
    // converted in one call (rows 0..height).
    void Rgb2Yuv::RGBAtoYUV420PlanarNeon(unsigned char* rgba, unsigned char* dst, int width, int height, int stride, int threadCount)
    {
        // vld4 lane order for RGBA bytes: val[0]=R, val[1]=G, val[2]=B.
        RGBA2YUVP_ParallelBody_SIMD<0, 1, 2>(rgba, dst, width, height, stride, 0, height);
    }
    // Converts interleaved RGB (3 bytes/pixel) into planar YUV420 (I420) at dst.
    // dst layout: [Y: width*height][U: width*height/4][V: width*height/4].
    // NOTE(review): threadCount is currently ignored — the whole image is
    // converted in one call (rows 0..height).
    void Rgb2Yuv::RGBtoYUV420PlanarNeon(unsigned char* rgb, unsigned char* dst, int width, int height, int stride, int threadCount)
    {
        // vld3 lane order for RGB bytes: val[0]=R, val[1]=G, val[2]=B.
        RGB2YUVP_ParallelBody_SIMD<0, 1, 2>(rgb, dst, width, height, stride, 0, height);
    }
| 23 | + |
| 24 | + |
| 25 | + // Template function for SIMD-based RGB to YUV conversion |
| 26 | + template <int R_INDEX, int G_INDEX, int B_INDEX> |
| 27 | + inline void RGB2YUVP_ParallelBody_SIMD( |
| 28 | + const unsigned char* src, |
| 29 | + unsigned char* dst, |
| 30 | + const int width, |
| 31 | + const int height, |
| 32 | + const int stride, |
| 33 | + const int begin, |
| 34 | + const int end |
| 35 | + ) { |
| 36 | + const int wi = width / 2; // Width for U/V channels |
| 37 | + const int yPlaneSize = width * height; |
| 38 | + const int uvPlaneSize = yPlaneSize / 4; |
| 39 | + |
| 40 | + unsigned char* buffer = dst; |
| 41 | + |
| 42 | + // SIMD constants for YUV conversion |
| 43 | + const uint16x8_t kR_Y = vdupq_n_u16(66); |
| 44 | + const uint16x8_t kG_Y = vdupq_n_u16(129); |
| 45 | + const uint16x8_t kB_Y = vdupq_n_u16(25); |
| 46 | + const uint16x8_t kR_U = vdupq_n_u16(112); |
| 47 | + const int16x8_t kG_U = vdupq_n_s16(-94); |
| 48 | + const int16x8_t kB_U = vdupq_n_s16(-18); |
| 49 | + const int16x8_t kR_V = vdupq_n_s16(-38); |
| 50 | + const int16x8_t kG_V = vdupq_n_s16(-74); |
| 51 | + const uint16x8_t kB_V = vdupq_n_u16(112); |
| 52 | + const uint16x8_t offset_Y = vdupq_n_u16(16); |
| 53 | + const uint16x8_t offset_UV = vdupq_n_u16(128); |
| 54 | + |
| 55 | + // Loop over the specified range of rows |
| 56 | + for (int row = begin; row < end; row+=2) { |
| 57 | + const int uvLineIndex = row * wi; |
| 58 | + const int yIndexStart = row * width; |
| 59 | + const int readIndexStart = row * 2 * stride; |
| 60 | + |
| 61 | + int yIndex = yIndexStart; |
| 62 | + int uIndex = yPlaneSize + uvLineIndex; |
| 63 | + int vIndex = yPlaneSize + uvPlaneSize + uvLineIndex; |
| 64 | + int index = readIndexStart; |
| 65 | + // first row includes UV |
| 66 | + for (int i = 0; i < wi; i += 8) { |
| 67 | + // Load 16 pixels (48 bytes) from src |
| 68 | + uint8x16x3_t pixels = vld3q_u8(&src[index]); |
| 69 | + |
| 70 | + uint8x16_t r = pixels.val[R_INDEX]; |
| 71 | + uint8x16_t g = pixels.val[G_INDEX]; |
| 72 | + uint8x16_t b = pixels.val[B_INDEX]; |
| 73 | + |
| 74 | + //Widen to 16 bits unsigned |
| 75 | + uint16x8_t r_low = vmovl_u8(vget_low_u8(r)); |
| 76 | + uint16x8_t r_high = vmovl_u8(vget_high_u8(r)); |
| 77 | + uint16x8_t g_low = vmovl_u8(vget_low_u8(g)); |
| 78 | + uint16x8_t g_high = vmovl_u8(vget_high_u8(g)); |
| 79 | + uint16x8_t b_low = vmovl_u8(vget_low_u8(b)); |
| 80 | + uint16x8_t b_high = vmovl_u8(vget_high_u8(b)); |
| 81 | + |
| 82 | + // Y Channel (Unsigned because we can overflow) |
| 83 | + uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y)); |
| 84 | + uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y)); |
| 85 | + // div 256(shift 8) + offset |
| 86 | + y_low = vshrq_n_u16(y_low, 8); |
| 87 | + y_high = vshrq_n_u16(y_high, 8); |
| 88 | + y_low = vaddq_u16(y_low, offset_Y); |
| 89 | + y_high = vaddq_u16(y_high, offset_Y); |
| 90 | + |
| 91 | + // shrink combine strore |
| 92 | + vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high))); |
| 93 | + yIndex += 16; |
| 94 | + |
| 95 | + // we need signed here |
| 96 | + int16x8_t r_low_signed = vreinterpretq_s16_u16(r_low); |
| 97 | + int16x8_t g_low_signed = vreinterpretq_s16_u16(g_low); |
| 98 | + int16x8_t b_low_signed = vreinterpretq_s16_u16(b_low); |
| 99 | + |
| 100 | + // Compute U channel (average over 2x2 blocks) |
| 101 | + int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U)); |
| 102 | + u = vshrq_n_s16(u, 8); |
| 103 | + u = vaddq_s16(u, offset_UV); |
| 104 | + |
| 105 | + // Compute V channel |
| 106 | + int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(b_low_signed, kB_V)); |
| 107 | + v = vshrq_n_s16(v, 8); |
| 108 | + v = vaddq_s16(v, offset_UV); |
| 109 | + |
| 110 | + // Store U and V |
| 111 | + vst1_u8(&buffer[uIndex], vqmovun_s16(u)); |
| 112 | + vst1_u8(&buffer[vIndex], vqmovun_s16(v)); |
| 113 | + uIndex += 8; |
| 114 | + vIndex += 8; |
| 115 | + |
| 116 | + index += 48; |
| 117 | + } |
| 118 | + //second row only Y |
| 119 | + for (int i = 0; i < wi; i += 8) { |
| 120 | + // Load 16 pixels (48 bytes) from src |
| 121 | + uint8x16x3_t pixels = vld3q_u8(&src[index]); |
| 122 | + |
| 123 | + uint8x16_t r = pixels.val[R_INDEX]; |
| 124 | + uint8x16_t g = pixels.val[G_INDEX]; |
| 125 | + uint8x16_t b = pixels.val[B_INDEX]; |
| 126 | + |
| 127 | + uint16x8_t r_low = vmovl_u8(vget_low_u8(r)); |
| 128 | + uint16x8_t r_high = vmovl_u8(vget_high_u8(r)); |
| 129 | + uint16x8_t g_low = vmovl_u8(vget_low_u8(g)); |
| 130 | + uint16x8_t g_high = vmovl_u8(vget_high_u8(g)); |
| 131 | + uint16x8_t b_low = vmovl_u8(vget_low_u8(b)); |
| 132 | + uint16x8_t b_high = vmovl_u8(vget_high_u8(b)); |
| 133 | + |
| 134 | + // Y Channel (Unsigned because we can overflow the signed) |
| 135 | + uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y)); |
| 136 | + uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y)); |
| 137 | + y_low = vshrq_n_u16(y_low, 8); |
| 138 | + y_high = vshrq_n_u16(y_high, 8); |
| 139 | + y_low = vaddq_u16(y_low, offset_Y); |
| 140 | + y_high = vaddq_u16(y_high, offset_Y); |
| 141 | + |
| 142 | + vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high))); |
| 143 | + yIndex += 16; |
| 144 | + |
| 145 | + index += 48; |
| 146 | + } |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + // Template function for SIMD-based RGB to YUV conversion |
| 151 | + template <int R_INDEX, int G_INDEX, int B_INDEX> |
| 152 | + inline void RGBA2YUVP_ParallelBody_SIMD( |
| 153 | + const unsigned char* src, |
| 154 | + unsigned char* dst, |
| 155 | + const int width, |
| 156 | + const int height, |
| 157 | + const int stride, |
| 158 | + const int begin, |
| 159 | + const int end |
| 160 | + ) { |
| 161 | + const int wi = width / 2; // Width for U/V channels |
| 162 | + const int yPlaneSize = width * height; |
| 163 | + const int uvPlaneSize = yPlaneSize / 4; |
| 164 | + |
| 165 | + unsigned char* buffer = dst; |
| 166 | + |
| 167 | + // SIMD constants for YUV conversion |
| 168 | + const uint16x8_t kR_Y = vdupq_n_u16(66); |
| 169 | + const uint16x8_t kG_Y = vdupq_n_u16(129); |
| 170 | + const uint16x8_t kB_Y = vdupq_n_u16(25); |
| 171 | + const uint16x8_t kR_U = vdupq_n_u16(112); |
| 172 | + const int16x8_t kG_U = vdupq_n_s16(-94); |
| 173 | + const int16x8_t kB_U = vdupq_n_s16(-18); |
| 174 | + const int16x8_t kR_V = vdupq_n_s16(-38); |
| 175 | + const int16x8_t kG_V = vdupq_n_s16(-74); |
| 176 | + const uint16x8_t kB_V = vdupq_n_u16(112); |
| 177 | + const uint16x8_t offset_Y = vdupq_n_u16(16); |
| 178 | + const uint16x8_t offset_UV = vdupq_n_u16(128); |
| 179 | + |
| 180 | + // Loop over the specified range of rows |
| 181 | + for (int row = begin; row < end; row += 2) { |
| 182 | + const int uvLineIndex = row * wi; |
| 183 | + const int yIndexStart = row * width; |
| 184 | + const int readIndexStart = row * 2 * stride; |
| 185 | + |
| 186 | + int yIndex = yIndexStart; |
| 187 | + int uIndex = yPlaneSize + uvLineIndex; |
| 188 | + int vIndex = yPlaneSize + uvPlaneSize + uvLineIndex; |
| 189 | + int index = readIndexStart; |
| 190 | + // first row includes UV |
| 191 | + for (int i = 0; i < wi; i += 8) |
| 192 | + { |
| 193 | + // Load 24 pixels (64 bytes) from src |
| 194 | + uint8x16x4_t pixels = vld3q_u8(&src[index]); |
| 195 | + |
| 196 | + uint8x16_t r = pixels.val[R_INDEX]; |
| 197 | + uint8x16_t g = pixels.val[G_INDEX]; |
| 198 | + uint8x16_t b = pixels.val[B_INDEX]; |
| 199 | + |
| 200 | + |
| 201 | + uint16x8_t r_low = vmovl_u8(vget_low_u8(r)); |
| 202 | + uint16x8_t r_high = vmovl_u8(vget_high_u8(r)); |
| 203 | + uint16x8_t g_low = vmovl_u8(vget_low_u8(g)); |
| 204 | + uint16x8_t g_high = vmovl_u8(vget_high_u8(g)); |
| 205 | + uint16x8_t b_low = vmovl_u8(vget_low_u8(b)); |
| 206 | + uint16x8_t b_high = vmovl_u8(vget_high_u8(b)); |
| 207 | + |
| 208 | + // Y Channel (Unsigned because we can overflow the signed) |
| 209 | + uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y)); |
| 210 | + uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y)); |
| 211 | + y_low = vshrq_n_u16(y_low, 8); |
| 212 | + y_high = vshrq_n_u16(y_high, 8); |
| 213 | + y_low = vaddq_u16(y_low, offset_Y); |
| 214 | + y_high = vaddq_u16(y_high, offset_Y); |
| 215 | + |
| 216 | + vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high))); |
| 217 | + yIndex += 16; |
| 218 | + |
| 219 | + // we need signed here |
| 220 | + int16x8_t r_low_signed = vreinterpretq_s16_u16(r_low); |
| 221 | + int16x8_t g_low_signed = vreinterpretq_s16_u16(g_low); |
| 222 | + int16x8_t b_low_signed = vreinterpretq_s16_u16(b_low); |
| 223 | + |
| 224 | + // Compute U channel (average over 2x2 blocks) |
| 225 | + int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U)); |
| 226 | + u = vshrq_n_s16(u, 8); |
| 227 | + u = vaddq_s16(u, offset_UV); |
| 228 | + |
| 229 | + // Compute V channel |
| 230 | + int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(b_low_signed, kB_V)); |
| 231 | + v = vshrq_n_s16(v, 8); |
| 232 | + v = vaddq_s16(v, offset_UV); |
| 233 | + |
| 234 | + // Store U and V |
| 235 | + vst1_u8(&buffer[uIndex], vqmovun_s16(u)); |
| 236 | + vst1_u8(&buffer[vIndex], vqmovun_s16(v)); |
| 237 | + uIndex += 8; |
| 238 | + vIndex += 8; |
| 239 | + |
| 240 | + index += 64; |
| 241 | + } |
| 242 | + //second row only Y |
| 243 | + for (int i = 0; i < wi; i += 8) { |
| 244 | + // Load 24 pixels (64 bytes) from src |
| 245 | + uint8x16x3_t pixels = vld3q_u8(&src[index]); |
| 246 | + |
| 247 | + uint8x16_t r = pixels.val[R_INDEX]; |
| 248 | + uint8x16_t g = pixels.val[G_INDEX]; |
| 249 | + uint8x16_t b = pixels.val[B_INDEX]; |
| 250 | + |
| 251 | + uint16x8_t r_low = vmovl_u8(vget_low_u8(r)); |
| 252 | + uint16x8_t r_high = vmovl_u8(vget_high_u8(r)); |
| 253 | + uint16x8_t g_low = vmovl_u8(vget_low_u8(g)); |
| 254 | + uint16x8_t g_high = vmovl_u8(vget_high_u8(g)); |
| 255 | + uint16x8_t b_low = vmovl_u8(vget_low_u8(b)); |
| 256 | + uint16x8_t b_high = vmovl_u8(vget_high_u8(b)); |
| 257 | + |
| 258 | + // Y Channel (Unsigned because we can overflow the signed) |
| 259 | + uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y)); |
| 260 | + uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y)); |
| 261 | + y_low = vshrq_n_u16(y_low, 8); |
| 262 | + y_high = vshrq_n_u16(y_high, 8); |
| 263 | + y_low = vaddq_u16(y_low, offset_Y); |
| 264 | + y_high = vaddq_u16(y_high, offset_Y); |
| 265 | + |
| 266 | + vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high))); |
| 267 | + yIndex += 16; |
| 268 | + index += 64; |
| 269 | + } |
| 270 | + } |
| 271 | + } |
| 272 | + |
| 273 | +} |
0 commit comments