22using System . Drawing ;
33using System . Drawing . Drawing2D ;
44using System . Drawing . Imaging ;
5+ using System . Numerics ;
56using System . Runtime . CompilerServices ;
67using System . Runtime . InteropServices ;
78using System . Threading . Tasks ;
@@ -247,23 +248,44 @@ private void ExtractColumns24Bpp(int stride, int width, int height)
247248 {
248249 unchecked
249250 {
250- uint totalR = 0 , totalG = 0 , totalB = 0 ;
251251 int pixelCount = height ;
252252 int columnOffset = x * 3 ;
253-
254253 int y = 0 ;
255- for ( ; y < height - 3 ; y += 4 )
254+
255+ Vector < ulong > sumR = Vector < ulong > . Zero ;
256+ Vector < ulong > sumG = Vector < ulong > . Zero ;
257+ Vector < ulong > sumB = Vector < ulong > . Zero ;
258+ int vectorSize = Vector < byte > . Count ;
259+
260+ // Vectorized sum
261+ for ( ; y <= height - vectorSize ; y += vectorSize )
256262 {
257- int offset1 = y * stride + columnOffset ;
258- int offset2 = ( y + 1 ) * stride + columnOffset ;
259- int offset3 = ( y + 2 ) * stride + columnOffset ;
260- int offset4 = ( y + 3 ) * stride + columnOffset ;
261-
262- totalB += ( uint ) pixelBuffer [ offset1 ] + pixelBuffer [ offset2 ] + pixelBuffer [ offset3 ] + pixelBuffer [ offset4 ] ;
263- totalG += ( uint ) pixelBuffer [ offset1 + 1 ] + pixelBuffer [ offset2 + 1 ] + pixelBuffer [ offset3 + 1 ] + pixelBuffer [ offset4 + 1 ] ;
264- totalR += ( uint ) pixelBuffer [ offset1 + 2 ] + pixelBuffer [ offset2 + 2 ] + pixelBuffer [ offset3 + 2 ] + pixelBuffer [ offset4 + 2 ] ;
263+ Span < byte > colBytes = stackalloc byte [ vectorSize * 3 ] ;
264+ for ( int v = 0 ; v < vectorSize ; v ++ )
265+ {
266+ int offset = ( y + v ) * stride + columnOffset ;
267+ colBytes [ v * 3 + 0 ] = pixelBuffer [ offset ] ;
268+ colBytes [ v * 3 + 1 ] = pixelBuffer [ offset + 1 ] ;
269+ colBytes [ v * 3 + 2 ] = pixelBuffer [ offset + 2 ] ;
270+ }
271+
272+ var vec = new Vector < byte > ( colBytes ) ;
273+
274+ // Extract R, G, B channels and sum
275+ ulong r = 0 , g = 0 , b = 0 ;
276+ for ( int v = 0 ; v < vectorSize ; v ++ )
277+ {
278+ b += vec [ v * 3 + 0 ] ;
279+ g += vec [ v * 3 + 1 ] ;
280+ r += vec [ v * 3 + 2 ] ;
281+ }
282+ sumR += new Vector < ulong > ( r ) ;
283+ sumG += new Vector < ulong > ( g ) ;
284+ sumB += new Vector < ulong > ( b ) ;
265285 }
266286
287+ // Scalar sum for remaining pixels
288+ ulong totalR = 0 , totalG = 0 , totalB = 0 ;
267289 for ( ; y < height ; y ++ )
268290 {
269291 int offset = y * stride + columnOffset ;
@@ -272,56 +294,69 @@ private void ExtractColumns24Bpp(int stride, int width, int height)
272294 totalR += pixelBuffer [ offset + 2 ] ;
273295 }
274296
275- byte avgR = ( byte ) ( totalR / pixelCount ) ;
276- byte avgG = ( byte ) ( totalG / pixelCount ) ;
277- byte avgB = ( byte ) ( totalB / pixelCount ) ;
297+ // Add vectorized sums
298+ for ( int i = 0 ; i < Vector < ulong > . Count ; i ++ )
299+ {
300+ totalR += sumR [ i ] ;
301+ totalG += sumG [ i ] ;
302+ totalB += sumB [ i ] ;
303+ }
304+
305+ byte avgR = ( byte ) ( totalR / ( ulong ) pixelCount ) ;
306+ byte avgG = ( byte ) ( totalG / ( ulong ) pixelCount ) ;
307+ byte avgB = ( byte ) ( totalB / ( ulong ) pixelCount ) ;
278308
279309 rawColors [ x ] = new OpenRGB . NET . Color ( avgR , avgG , avgB ) ;
280310 }
281311 } ) ;
282312 }
283313
314+
315+ // Move the stackalloc and Vector<byte> creation outside the vectorized loop
/// <summary>
/// Computes the average colour of every pixel column in a 4-bytes-per-pixel buffer
/// and stores one result per column in <c>rawColors</c>.
/// </summary>
/// <remarks>
/// For each column <c>x</c>, the bytes at offsets +0, +1 and +2 of every pixel are
/// accumulated as the blue, green and red totals respectively; the fourth byte of
/// each pixel is never read. Columns are processed with <c>Parallel.For</c>, and each
/// iteration writes only <c>rawColors[x]</c>.
/// NOTE(review): assumes <c>pixelBuffer</c> is non-null and holds at least
/// <c>(height - 1) * stride + width * 4</c> bytes, and that <c>rawColors</c> has at
/// least <c>width</c> entries — confirm against the caller.
/// </remarks>
/// <param name="stride">Multiplier applied to the row index to locate a row inside <c>pixelBuffer</c>.</param>
/// <param name="width">Number of columns to average.</param>
/// <param name="height">Number of rows sampled in each column.</param>
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private void ExtractColumns32Bpp(int stride, int width, int height)
{
    // Integer division by zero throws even inside an unchecked block, so an empty
    // frame is skipped and rawColors is left untouched.
    if (width <= 0 || height <= 0)
    {
        return;
    }

    // Read the field once so every column works on the same buffer instance.
    var buffer = pixelBuffer!;
    ulong rowCount = (ulong)height;
    var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount };

    Parallel.For(0, width, options, x =>
    {
        unchecked
        {
            ulong totalR = 0, totalG = 0, totalB = 0;

            // Walk down the column with a running offset instead of recomputing
            // y * stride for every row. A fixed-size row blocking adds nothing here:
            // it would touch the same addresses in the same order.
            int offset = x * 4;
            for (int y = 0; y < height; y++, offset += stride)
            {
                totalB += buffer[offset];
                totalG += buffer[offset + 1];
                totalR += buffer[offset + 2];
            }

            // Each total is a sum of `height` byte values, so every quotient fits in a byte.
            byte avgR = (byte)(totalR / rowCount);
            byte avgG = (byte)(totalG / rowCount);
            byte avgB = (byte)(totalB / rowCount);

            rawColors[x] = new OpenRGB.NET.Color(avgR, avgG, avgB);
        }
    });
}
324357
358+
359+
325360 [ MethodImpl ( MethodImplOptions . AggressiveOptimization ) ]
326361 private void ProcessColumnsWithEffects ( int width , double brightness , double vibrance , double contrast , int darkThreshold , double darkFactor )
327362 {
@@ -394,19 +429,48 @@ private OpenRGB.NET.Color FastApplyEffects(byte r, byte g, byte b, double bright
394429 }
395430
396431 [ MethodImpl ( MethodImplOptions . AggressiveOptimization ) ]
432+ // Vectorized LUT initialization
397433 private void InitializeLuts ( double brightness , double contrast )
398434 {
399- for ( int i = 0 ; i < 256 ; i ++ )
435+ // Vectorized brightness LUT
436+ if ( Vector . IsHardwareAccelerated )
400437 {
438+ var brightnessVec = new Vector < float > ( ( float ) brightness ) ;
439+ int vecSize = Vector < float > . Count ;
440+ int i = 0 ;
441+ for ( ; i <= 256 - vecSize ; i += vecSize )
442+ {
443+ var indices = new Vector < float > ( Enumerable . Range ( i , vecSize ) . Select ( x => ( float ) x ) . ToArray ( ) ) ;
444+ var result = Vector . Multiply ( indices , brightnessVec ) ;
445+ for ( int j = 0 ; j < vecSize ; j ++ )
446+ {
447+ brightnessLut [ i + j ] = ( byte ) Math . Clamp ( ( int ) result [ j ] , 0 , 255 ) ;
448+ }
449+ }
450+ // Handle any remaining elements
451+ for ( ; i < 256 ; i ++ )
452+ {
453+ int brightVal = ( int ) ( i * brightness ) ;
454+ brightnessLut [ i ] = ( byte ) Math . Clamp ( brightVal , 0 , 255 ) ;
455+ }
456+ }
457+ else
458+ {
459+ for ( int i = 0 ; i < 256 ; i ++ )
460+ {
461+ int brightVal = ( int ) ( i * brightness ) ;
462+ brightnessLut [ i ] = ( byte ) Math . Clamp ( brightVal , 0 , 255 ) ;
463+ }
464+ }
401465
402- int brightVal = ( int ) ( i * brightness ) ;
403- brightnessLut [ i ] = ( byte ) Math . Min ( Math . Max ( brightVal , 0 ) , 255 ) ;
404-
466+ // Contrast LUT (not vectorized)
467+ for ( int i = 0 ; i < 256 ; i ++ )
468+ {
405469 if ( Math . Abs ( contrast - 1.0 ) > 0.001 )
406470 {
407471 double normalized = i / 255.0 ;
408472 double adjusted = Math . Pow ( normalized , contrast ) * 255.0 ;
409- contrastLut [ i ] = ( byte ) Math . Min ( Math . Max ( ( int ) adjusted , 0 ) , 255 ) ;
473+ contrastLut [ i ] = ( byte ) Math . Clamp ( ( int ) adjusted , 0 , 255 ) ;
410474 }
411475 else
412476 {
0 commit comments