Fixed alignment issue on Yuv2RGB NEON

ReferenceType · ReferenceType · commit 1e368b6349ce · 2025-01-25T17:15:33.000+01:00
diff --git a/Examples/CrossPlatformTest/Program.cs b/Examples/CrossPlatformTest/Program.cs
@@ -15,7 +15,7 @@ internal class Program
         static void Main(string[] args)
         {
            
-            Converter.EnableNEON = false;
+            Converter.EnableNEON = true;
             Converter.NumThreads = 1;
 
             H264Encoder encoder = new H264Encoder();
diff --git a/H264SharpNative/CMakeLists.txt b/H264SharpNative/CMakeLists.txt
@@ -62,7 +62,7 @@ add_library (CMakeProject1 SHARED
 if (CMAKE_VERSION VERSION_GREATER 3.12)
   set_property(TARGET CMakeProject1 PROPERTY CXX_STANDARD 17)
 endif()
-#output name 
+#output name  
 #H264SharpNative-linux64.so 
 #H264SharpNative-linux64.so 
 
diff --git a/H264SharpNative/Converter.cpp b/H264SharpNative/Converter.cpp
@@ -49,7 +49,7 @@ namespace H264Sharp {
                         width,
                         height, numThreads);
                 else
-                    Yuv2Rgb::ConvertYUVToRGB_NEON(
+                    Yuv2Rgb::ConvertYUVToRGB_NEONv2(
                         y_ptr,
                         u_ptr,
                         v_ptr,
diff --git a/H264SharpNative/Yuv2Rgb.h b/H264SharpNative/Yuv2Rgb.h
@@ -47,6 +47,8 @@ extern const unsigned int yuv2rgb565_table1[];
 
             static void ConvertYUVToRGB_NEON(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
                 uint8_t* rgb_buffer, int width, int height);
+            static void ConvertYUVToRGB_NEONv2(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
+                uint8_t* rgb_buffer, int width, int height);
             static void ConvertYUVToRGB_NEON_Parallel(const uint8_t* y_plane, const uint8_t* u_plane, const uint8_t* v_plane,
                 uint8_t* rgb_buffer, int width, int height, int numThreads);
             
diff --git a/H264SharpNative/Yuv2RgbNEON.cpp b/H264SharpNative/Yuv2RgbNEON.cpp
@@ -8,21 +8,132 @@
 
 namespace H264Sharp
 {
-
+    /*
+    * R = CLAMP((Y-16)*1.164 +           1.596*V)
+      G = CLAMP((Y-16)*1.164 - 0.391*U - 0.813*V)
+      B = CLAMP((Y-16)*1.164 + 2.018*U          )
+    */
     // BT.601-7 studio range constants
     const int16x8_t const_16 = vdupq_n_s16(16);
     const int16x8_t const_128 = vdupq_n_s16(128);
-    const int16x8_t const_0 = vdupq_n_s16(0);
-    const int16x8_t const_255 = vdupq_n_s16(255);
+    const uint8x8_t const_125 = vdup_n_u8(125);
     // Precalculated fixed-point coefficients.
     // y_factor is scaled by 1 << 7 (128); the U/V coefficients are scaled by 1 << 6 (64),
     // i.e. halved, so that coefficient * (value - 128) stays within the int16 range.
     const int16_t y_factor = 149;      // 1.164 * 128
-    const int16_t v_to_r_coeff = 204;  // 1.596 * 128
-    const int16_t u_to_g_coeff = 50;   // 0.391 * 128
-    const int16_t v_to_g_coeff = 104;  // 0.813 * 128
-    const int16_t u_to_b_coeff = 258;  // 2.018 * 128
-   
+    const int16_t v_to_r_coeff = 102;  // 1.596 * 64
+    const int16_t u_to_g_coeff = 25;   // 0.391 * 64
+    const int16_t v_to_g_coeff = 52;  // 0.813 * 64
+    const int16_t u_to_b_coeff = 129;  // 2.018 * 64
+
+    void Yuv2Rgb::ConvertYUVToRGB_NEONv2( // Converts separate Y, U and V planes (U/V subsampled 2x2) to packed 3-byte pixels, 16 pixels x 2 rows per step
+        const uint8_t* y_plane, // luma samples; rows are read with a stride of width bytes
+        const uint8_t* u_plane, // U samples; one row of width / 2 bytes is read per two luma rows
+        const uint8_t* v_plane, // V samples; indexed the same way as u_plane
+        uint8_t* rgb_buffer, // output; width * 3 bytes are written per row in B, G, R byte order
+        int width, // NOTE(review): the inner loop has no tail handling, so a width that is not a multiple of 16 makes the last step read and write past the row end
+        int height) // NOTE(review): rows are processed in pairs, so an odd height makes the last step touch row y + 1 == height
+    {
+        const int uv_width = width / 2; // bytes per U/V row
+        for (int y = 0; y < height; y += 2) {
+            const uint8_t* y_row1 = y_plane + y * width;
+            const uint8_t* y_row2 = y_row1 + width;
+            const uint8_t* u_row = u_plane + (y / 2) * uv_width;
+            const uint8_t* v_row = v_plane + (y / 2) * uv_width;
+            uint8_t* rgb_row1 = rgb_buffer + y * width * 3;
+            uint8_t* rgb_row2 = rgb_row1 + width * 3;
+
+            for (int x = 0; x < width; x += 16) {
+
+                // Load 8 U and 8 V values; they cover the 16 pixels of this step in both rows
+                uint8x8_t u_vals8 = vld1_u8(u_row + (x / 2));
+                uint8x8_t v_vals8 = vld1_u8(v_row + (x / 2));
+
+                // Widen U/V to 16 bits, then subtract 128 to center them on zero
+                int16x8_t u_vals = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(u_vals8)), const_128);
+                int16x8_t v_vals = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(v_vals8)), const_128);
+
+                // Duplicate each lane so one chroma sample lines up with two luma samples: [1,2,3,4,5,6,7,8] => [1,1,2,2,3,3,4,4] , [5,5,6,6,7,7,8,8]
+                int16x8_t u_valsl = vzip1q_s16(u_vals, u_vals);
+                int16x8_t u_valsh = vzip2q_s16(u_vals, u_vals);
+
+                int16x8_t v_valsl = vzip1q_s16(v_vals, v_vals);
+                int16x8_t v_valsh = vzip2q_s16(v_vals, v_vals);
+
+                // Multiply U/V by the fixed-point coefficients, then shift right by 6 to undo their scaling by 64
+                int16x8_t u_vals_ugl = vshrq_n_s16(vmulq_n_s16(u_valsl, u_to_g_coeff),6);
+                int16x8_t u_vals_ubl = vshrq_n_s16(vmulq_n_s16(u_valsl, u_to_b_coeff),6);
+                int16x8_t v_vals_vgl = vshrq_n_s16(vmulq_n_s16(v_valsl, v_to_g_coeff), 6);
+                int16x8_t v_vals_vrl = vshrq_n_s16(vmulq_n_s16(v_valsl, v_to_r_coeff), 6);
+
+                int16x8_t u_vals_ugh = vshrq_n_s16(vmulq_n_s16(u_valsh, u_to_g_coeff), 6);
+                int16x8_t u_vals_ubh = vshrq_n_s16(vmulq_n_s16(u_valsh, u_to_b_coeff), 6);
+                int16x8_t v_vals_vgh = vshrq_n_s16(vmulq_n_s16(v_valsh, v_to_g_coeff), 6);
+                int16x8_t v_vals_vrh = vshrq_n_s16(vmulq_n_s16(v_valsh, v_to_r_coeff), 6);
+
+                // Load 16 Y values from each of the two rows
+                uint8x16_t y_vals1 = vld1q_u8(y_row1 + x);
+                uint8x16_t y_vals2 = vld1q_u8(y_row2 + x);
+
+
+                // Widen Y to 16 bits and subtract the offset of 16
+                int16x8_t y_vals_16_1l = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals1))), const_16);
+                int16x8_t y_vals_16_1h = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals1))), const_16);
+                // Second row
+                int16x8_t y_vals_16_2l = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_vals2))), const_16);
+                int16x8_t y_vals_16_2h = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(y_vals2))), const_16);
+
+                // Scale Y by y_factor / 128 (about 1.164). NOTE(review): the 16-bit product wraps once Y - 16 exceeds 219 (220 * 149 > 32767), i.e. for Y above 235
+                y_vals_16_1l = vshrq_n_s16(vmulq_n_s16(y_vals_16_1l, y_factor), 7);
+                y_vals_16_1h = vshrq_n_s16(vmulq_n_s16(y_vals_16_1h, y_factor), 7);
+                y_vals_16_2l = vshrq_n_s16(vmulq_n_s16(y_vals_16_2l, y_factor), 7);
+                y_vals_16_2h = vshrq_n_s16(vmulq_n_s16(y_vals_16_2h, y_factor), 7);
+
+                // Calculate R, G, B for the first row (low 8 pixels, then high 8 pixels)
+                int16x8_t r1l = vaddq_s16(y_vals_16_1l, v_vals_vrl);
+                int16x8_t g1l = vsubq_s16(vsubq_s16(y_vals_16_1l, u_vals_ugl), v_vals_vgl);
+                int16x8_t b1l = vaddq_s16(y_vals_16_1l, u_vals_ubl);
+               
+                int16x8_t r1h = vaddq_s16(y_vals_16_1h, v_vals_vrh);
+                int16x8_t g1h = vsubq_s16(vsubq_s16(y_vals_16_1h, u_vals_ugh), v_vals_vgh);
+                int16x8_t b1h = vaddq_s16(y_vals_16_1h, u_vals_ubh);
+
+                // Calculate R, G, B for the second row, reusing the same chroma terms
+                int16x8_t r2l = vaddq_s16(y_vals_16_2l, v_vals_vrl);
+                int16x8_t g2l = vsubq_s16(vsubq_s16(y_vals_16_2l, u_vals_ugl), v_vals_vgl);
+                int16x8_t b2l = vaddq_s16(y_vals_16_2l, u_vals_ubl);
+
+                int16x8_t r2h = vaddq_s16(y_vals_16_2h, v_vals_vrh);
+                int16x8_t g2h = vsubq_s16(vsubq_s16(y_vals_16_2h, u_vals_ugh), v_vals_vgh);
+                int16x8_t b2h = vaddq_s16(y_vals_16_2h, u_vals_ubh);
+
+                // vqmovun_s16 narrows to 8 bits with unsigned saturation, which clamps the values to 0..255
+                // Store first row interleaved in B, G, R order
+                uint8x8x3_t rgb1l, rgb1h;
+                rgb1l.val[0] = vqmovun_s16(b1l);
+                rgb1l.val[1] = vqmovun_s16(g1l);
+                rgb1l.val[2] = vqmovun_s16(r1l);
+
+                rgb1h.val[0] = vqmovun_s16(b1h);
+                rgb1h.val[1] = vqmovun_s16(g1h);
+                rgb1h.val[2] = vqmovun_s16(r1h);
+
+                vst3_u8(rgb_row1 + x * 3, rgb1l);
+                vst3_u8(rgb_row1 + (x * 3) + 24, rgb1h); // + 24: the low half wrote 8 pixels * 3 bytes
+                // The second row reuses the rgb1l / rgb1h temporaries
+                rgb1l.val[0] = vqmovun_s16(b2l);
+                rgb1l.val[1] = vqmovun_s16(g2l);
+                rgb1l.val[2] = vqmovun_s16(r2l);
+
+                rgb1h.val[0] = vqmovun_s16(b2h);
+                rgb1h.val[1] = vqmovun_s16(g2h);
+                rgb1h.val[2] = vqmovun_s16(r2h);
+
+                vst3_u8(rgb_row2 + x * 3, rgb1l);
+                vst3_u8(rgb_row2 + (x * 3) + 24, rgb1h);
+            }
+        }
+    }
 
     void Yuv2Rgb::ConvertYUVToRGB_NEON(
         const uint8_t* y_plane,

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ internal class Program`
`15`	`15`	`static void Main(string[] args)`
`16`	`16`	`{`
`17`	`17`
`18`		`- Converter.EnableNEON = false;`
	`18`	`+ Converter.EnableNEON = true;`
`19`	`19`	`Converter.NumThreads = 1;`
`20`	`20`
`21`	`21`	`H264Encoder encoder = new H264Encoder();`