-

ReferenceType · ReferenceType · commit c2216efba089 · 2025-01-24T06:59:57.000+01:00
diff --git a/Examples/CrossPlatformTest/Program.cs b/Examples/CrossPlatformTest/Program.cs
@@ -15,7 +15,7 @@ internal class Program
         static void Main(string[] args)
         {
            
-            Converter.EnableNEON = true;
+            Converter.EnableNEON = false;
             Converter.NumThreads = 1;
 
             H264Encoder encoder = new H264Encoder();
diff --git a/H264SharpNative/CMakeLists.txt b/H264SharpNative/CMakeLists.txt
@@ -52,7 +52,7 @@ add_library (CMakeProject1 SHARED
 "Encoder.cpp"
 "ImageTypes.cpp"
 "pch.cpp"
-"Yuv2Rgb.cpp"
+"Yuv2Rgb.cpp" 
 "Yuv2RgbSSE.cpp"
 "Yuv2RgbNEON.cpp"
 "ThreadPool.cpp"
diff --git a/H264SharpNative/Rgb2YuvNEON.cpp b/H264SharpNative/Rgb2YuvNEON.cpp
@@ -50,24 +50,34 @@ namespace H264Sharp
     ) {
         
         unsigned char* buffer = dst;
+        /* Scalar reference formulas for the SIMD constants below:
+           buffer[yIndex++] = ((25 * b + 129 * g + 66 * r) >> 8) + 16;
+           buffer[yIndex++] = ((25 * b1 + 129 * g1 + 66 * r1) >> 8) + 16;
 
+           buffer[uIndex++] = ((112 * r + -94 * g + -18 * b) >> 8) + 128;
+           buffer[vIndex++] = ((-38 * r + -74 * g + 112 * b) >> 8) + 128;
+               */
         // SIMD constants for YUV conversion
-        const uint16x8_t kR_Y = vdupq_n_u16(66);
-        const uint16x8_t kG_Y = vdupq_n_u16(129);
         const uint16x8_t kB_Y = vdupq_n_u16(25);
-        const uint16x8_t kR_U = vdupq_n_u16(112);  
+        const uint16x8_t kG_Y = vdupq_n_u16(129);
+        const uint16x8_t kR_Y = vdupq_n_u16(66);
+        const uint16x8_t offset_Y = vdupq_n_u16(16);
+
+
+        const int16x8_t kR_U = vdupq_n_s16(112);  
         const int16x8_t kG_U = vdupq_n_s16(-94);   
-        const int16x8_t kB_U = vdupq_n_s16(-18);   
+        const int16x8_t kB_U = vdupq_n_s16(-18);  
+
         const int16x8_t kR_V = vdupq_n_s16(-38);  
         const int16x8_t kG_V = vdupq_n_s16(-74);   
-        const uint16x8_t kB_V = vdupq_n_u16(112);  
-        const uint16x8_t offset_Y = vdupq_n_u16(16);
-        const uint16x8_t offset_UV = vdupq_n_u16(128);
+        const int16x8_t kB_V = vdupq_n_s16(112);  
+
+        const int16x8_t offset_UV = vdupq_n_s16(128);
 
         int index = 0;
         int yIndex = 0;
         int uIndex = width* height;
-        int vIndex = uIndex + width * height/4;
+        int vIndex = uIndex + (uIndex>>2);
         // Loop over the specified range of rows
         for (int row = begin; row < end; row+=2) {
             // first row includes UV
@@ -88,7 +98,7 @@ namespace H264Sharp
                      g = pixels.val[G_INDEX];
                      b = pixels.val[B_INDEX];
                 }
-                
+               
 
                 //Widen to 16 bits unsigned
                 uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
@@ -99,7 +109,7 @@ namespace H264Sharp
                 uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
 
                 // Y Channel (Unsigned because we can overflow) 
-                uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
+                uint16x8_t y_low  = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
                 uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
                 // div 256(shift 8) + offset
                 y_low = vshrq_n_u16(y_low, 8);
@@ -108,6 +118,7 @@ namespace H264Sharp
                 y_high = vaddq_u16(y_high, offset_Y);
 
                 // shrink combine strore
+
                 vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
                 yIndex += 16;
 
@@ -118,13 +129,11 @@ namespace H264Sharp
 
                 // Compute U channel (average over 2x2 blocks)
                 int16x8_t u = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_U), vmulq_s16(g_low_signed, kG_U)), vmulq_s16(b_low_signed, kB_U));
-                u = vshrq_n_s16(u, 8);
-                u = vaddq_s16(u, offset_UV);
+                u = vaddq_s16(vshrq_n_s16(u, 8), offset_UV);
 
                 // Compute V channel
-                int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(r_low_signed, kR_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(b_low_signed, kB_V));
-                v = vshrq_n_s16(v, 8);
-                v = vaddq_s16(v, offset_UV);
+                int16x8_t v = vaddq_s16(vaddq_s16(vmulq_s16(b_low_signed, kB_V), vmulq_s16(g_low_signed, kG_V)), vmulq_s16(r_low_signed, kR_V));
+                v = vaddq_s16(vshrq_n_s16(v, 8), offset_UV);
 
                 // Store U and V
                 vst1_u8(&buffer[uIndex], vqmovun_s16(u));
@@ -136,12 +145,11 @@ namespace H264Sharp
             }
 
             // add stride offset here..
-            //index += row*stride - index;
+            index = stride * (row+1);
             //second row only Y
             for (int i = 0; i < width; i += 16) {
                 // Load 16 pixels (48 bytes) from src
                 uint8x16_t r, g, b;
-
                 if constexpr (NUM_CH == 4) {
                     uint8x16x4_t pixels = vld4q_u8(&src[index]);
 
@@ -157,26 +165,33 @@ namespace H264Sharp
                     b = pixels.val[B_INDEX];
                 }
 
+
+                //Widen to 16 bits unsigned
                 uint16x8_t r_low = vmovl_u8(vget_low_u8(r));
                 uint16x8_t r_high = vmovl_u8(vget_high_u8(r));
                 uint16x8_t g_low = vmovl_u8(vget_low_u8(g));
                 uint16x8_t g_high = vmovl_u8(vget_high_u8(g));
                 uint16x8_t b_low = vmovl_u8(vget_low_u8(b));
                 uint16x8_t b_high = vmovl_u8(vget_high_u8(b));
 
-                // Y Channel (Unsigned because we can overflow the signed) 
+                // Y channel (unsigned: the weighted sum can reach 220 * 255 = 56100, beyond the int16 range)
                 uint16x8_t y_low = vaddq_u16(vaddq_u16(vmulq_u16(r_low, kR_Y), vmulq_u16(g_low, kG_Y)), vmulq_u16(b_low, kB_Y));
                 uint16x8_t y_high = vaddq_u16(vaddq_u16(vmulq_u16(r_high, kR_Y), vmulq_u16(g_high, kG_Y)), vmulq_u16(b_high, kB_Y));
+                // div 256(shift 8) + offset
                 y_low = vshrq_n_u16(y_low, 8);
                 y_high = vshrq_n_u16(y_high, 8);
                 y_low = vaddq_u16(y_low, offset_Y);
                 y_high = vaddq_u16(y_high, offset_Y);
 
+                // shrink, combine, store
                 vst1q_u8(&buffer[yIndex], vcombine_u8(vqmovn_u16(y_low), vqmovn_u16(y_high)));
                 yIndex += 16;
 
                 index += NUM_CH * 16;
             }
+            
+            index = stride * (row + 2);
+
         }
     }
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -15,7 +15,7 @@ internal class Program` |
| `15` | `15` | `static void Main(string[] args)` |
| `16` | `16` | `{` |
| `17` | `17` | |
| `18` | | `- Converter.EnableNEON = true;` |
| | `18` | `+ Converter.EnableNEON = false;` |
| `19` | `19` | `Converter.NumThreads = 1;` |
| `20` | `20` | |
| `21` | `21` | `H264Encoder encoder = new H264Encoder();` |