22using System . Drawing ;
33using System . Drawing . Drawing2D ;
44using System . Drawing . Imaging ;
5- using System . Numerics ;
65using System . Runtime . CompilerServices ;
76using System . Runtime . InteropServices ;
87using System . Threading . Tasks ;
@@ -248,44 +247,23 @@ private void ExtractColumns24Bpp(int stride, int width, int height)
248247 {
249248 unchecked
250249 {
250+ uint totalR = 0 , totalG = 0 , totalB = 0 ;
251251 int pixelCount = height ;
252252 int columnOffset = x * 3 ;
253- int y = 0 ;
254-
255- Vector < ulong > sumR = Vector < ulong > . Zero ;
256- Vector < ulong > sumG = Vector < ulong > . Zero ;
257- Vector < ulong > sumB = Vector < ulong > . Zero ;
258- int vectorSize = Vector < byte > . Count ;
259253
260- // Vectorized sum
261- for ( ; y <= height - vectorSize ; y += vectorSize )
254+ int y = 0 ;
255+ for ( ; y < height - 3 ; y += 4 )
262256 {
263- Span < byte > colBytes = stackalloc byte [ vectorSize * 3 ] ;
264- for ( int v = 0 ; v < vectorSize ; v ++ )
265- {
266- int offset = ( y + v ) * stride + columnOffset ;
267- colBytes [ v * 3 + 0 ] = pixelBuffer [ offset ] ;
268- colBytes [ v * 3 + 1 ] = pixelBuffer [ offset + 1 ] ;
269- colBytes [ v * 3 + 2 ] = pixelBuffer [ offset + 2 ] ;
270- }
271-
272- var vec = new Vector < byte > ( colBytes ) ;
273-
274- // Extract R, G, B channels and sum
275- ulong r = 0 , g = 0 , b = 0 ;
276- for ( int v = 0 ; v < vectorSize ; v ++ )
277- {
278- b += vec [ v * 3 + 0 ] ;
279- g += vec [ v * 3 + 1 ] ;
280- r += vec [ v * 3 + 2 ] ;
281- }
282- sumR += new Vector < ulong > ( r ) ;
283- sumG += new Vector < ulong > ( g ) ;
284- sumB += new Vector < ulong > ( b ) ;
257+ int offset1 = y * stride + columnOffset ;
258+ int offset2 = ( y + 1 ) * stride + columnOffset ;
259+ int offset3 = ( y + 2 ) * stride + columnOffset ;
260+ int offset4 = ( y + 3 ) * stride + columnOffset ;
261+
262+ totalB += ( uint ) pixelBuffer [ offset1 ] + pixelBuffer [ offset2 ] + pixelBuffer [ offset3 ] + pixelBuffer [ offset4 ] ;
263+ totalG += ( uint ) pixelBuffer [ offset1 + 1 ] + pixelBuffer [ offset2 + 1 ] + pixelBuffer [ offset3 + 1 ] + pixelBuffer [ offset4 + 1 ] ;
264+ totalR += ( uint ) pixelBuffer [ offset1 + 2 ] + pixelBuffer [ offset2 + 2 ] + pixelBuffer [ offset3 + 2 ] + pixelBuffer [ offset4 + 2 ] ;
285265 }
286266
287- // Scalar sum for remaining pixels
288- ulong totalR = 0 , totalG = 0 , totalB = 0 ;
289267 for ( ; y < height ; y ++ )
290268 {
291269 int offset = y * stride + columnOffset ;
@@ -294,69 +272,56 @@ private void ExtractColumns24Bpp(int stride, int width, int height)
294272 totalR += pixelBuffer [ offset + 2 ] ;
295273 }
296274
297- // Add vectorized sums
298- for ( int i = 0 ; i < Vector < ulong > . Count ; i ++ )
299- {
300- totalR += sumR [ i ] ;
301- totalG += sumG [ i ] ;
302- totalB += sumB [ i ] ;
303- }
304-
305- byte avgR = ( byte ) ( totalR / ( ulong ) pixelCount ) ;
306- byte avgG = ( byte ) ( totalG / ( ulong ) pixelCount ) ;
307- byte avgB = ( byte ) ( totalB / ( ulong ) pixelCount ) ;
275+ byte avgR = ( byte ) ( totalR / pixelCount ) ;
276+ byte avgG = ( byte ) ( totalG / pixelCount ) ;
277+ byte avgB = ( byte ) ( totalB / pixelCount ) ;
308278
309279 rawColors [ x ] = new OpenRGB . NET . Color ( avgR , avgG , avgB ) ;
310280 }
311281 } ) ;
312282 }
313283
314-
315- // Move the stackalloc and Vector<byte> creation outside the vectorized loop
/// <summary>
/// Computes the average colour of every pixel column of a 32-bit-per-pixel
/// buffer and stores one averaged colour per column in <c>rawColors</c>.
/// </summary>
/// <remarks>
/// Bytes 0, 1 and 2 of each 4-byte pixel are accumulated as blue, green and red
/// respectively; the fourth byte is never read. Columns are processed in parallel,
/// and each column writes only to its own <c>rawColors[x]</c> slot.
/// </remarks>
/// <param name="stride">Distance in bytes between the starts of two consecutive rows in <c>pixelBuffer</c>.</param>
/// <param name="width">Number of pixel columns to process.</param>
/// <param name="height">Number of pixel rows averaged for each column.</param>
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private void ExtractColumns32Bpp(int stride, int width, int height)
{
    Parallel.For(0, width, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount }, x =>
    {
        // With no rows there is nothing to average. Emit black rather than
        // dividing by zero, which would fault the whole parallel loop.
        if (height <= 0)
        {
            rawColors[x] = new OpenRGB.NET.Color(0, 0, 0);
            return;
        }

        unchecked
        {
            uint totalR = 0, totalG = 0, totalB = 0;
            int columnOffset = x * 4;

            // Main loop: four rows per iteration. Each row offset is derived from
            // the previous one by adding the stride instead of re-multiplying.
            int y = 0;
            for (; y < height - 3; y += 4)
            {
                int row0 = y * stride + columnOffset;
                int row1 = row0 + stride;
                int row2 = row1 + stride;
                int row3 = row2 + stride;

                totalB += (uint)(pixelBuffer[row0] + pixelBuffer[row1] + pixelBuffer[row2] + pixelBuffer[row3]);
                totalG += (uint)(pixelBuffer[row0 + 1] + pixelBuffer[row1 + 1] + pixelBuffer[row2 + 1] + pixelBuffer[row3 + 1]);
                totalR += (uint)(pixelBuffer[row0 + 2] + pixelBuffer[row1 + 2] + pixelBuffer[row2 + 2] + pixelBuffer[row3 + 2]);
            }

            // Tail: the zero to three rows left over by the four-row loop.
            for (; y < height; y++)
            {
                int offset = y * stride + columnOffset;
                totalB += pixelBuffer[offset];
                totalG += pixelBuffer[offset + 1];
                totalR += pixelBuffer[offset + 2];
            }

            // height > 0 is guaranteed by the guard above, so the cast is lossless
            // and the division cannot fault.
            uint pixelCount = (uint)height;
            byte avgR = (byte)(totalR / pixelCount);
            byte avgG = (byte)(totalG / pixelCount);
            byte avgB = (byte)(totalB / pixelCount);

            rawColors[x] = new OpenRGB.NET.Color(avgR, avgG, avgB);
        }
    });
}
357324
358-
359-
360325 [ MethodImpl ( MethodImplOptions . AggressiveOptimization ) ]
361326 private void ProcessColumnsWithEffects ( int width , double brightness , double vibrance , double contrast , int darkThreshold , double darkFactor )
362327 {
@@ -429,48 +394,19 @@ private OpenRGB.NET.Color FastApplyEffects(byte r, byte g, byte b, double bright
429394 }
430395
431396 [ MethodImpl ( MethodImplOptions . AggressiveOptimization ) ]
432- // Vectorized LUT initialization
433397 private void InitializeLuts ( double brightness , double contrast )
434398 {
435- // Vectorized brightness LUT
436- if ( Vector . IsHardwareAccelerated )
437- {
438- var brightnessVec = new Vector < float > ( ( float ) brightness ) ;
439- int vecSize = Vector < float > . Count ;
440- int i = 0 ;
441- for ( ; i <= 256 - vecSize ; i += vecSize )
442- {
443- var indices = new Vector < float > ( Enumerable . Range ( i , vecSize ) . Select ( x => ( float ) x ) . ToArray ( ) ) ;
444- var result = Vector . Multiply ( indices , brightnessVec ) ;
445- for ( int j = 0 ; j < vecSize ; j ++ )
446- {
447- brightnessLut [ i + j ] = ( byte ) Math . Clamp ( ( int ) result [ j ] , 0 , 255 ) ;
448- }
449- }
450- // Handle any remaining elements
451- for ( ; i < 256 ; i ++ )
452- {
453- int brightVal = ( int ) ( i * brightness ) ;
454- brightnessLut [ i ] = ( byte ) Math . Clamp ( brightVal , 0 , 255 ) ;
455- }
456- }
457- else
458- {
459- for ( int i = 0 ; i < 256 ; i ++ )
460- {
461- int brightVal = ( int ) ( i * brightness ) ;
462- brightnessLut [ i ] = ( byte ) Math . Clamp ( brightVal , 0 , 255 ) ;
463- }
464- }
465-
466- // Contrast LUT (not vectorized)
467399 for ( int i = 0 ; i < 256 ; i ++ )
468400 {
401+
402+ int brightVal = ( int ) ( i * brightness ) ;
403+ brightnessLut [ i ] = ( byte ) Math . Min ( Math . Max ( brightVal , 0 ) , 255 ) ;
404+
469405 if ( Math . Abs ( contrast - 1.0 ) > 0.001 )
470406 {
471407 double normalized = i / 255.0 ;
472408 double adjusted = Math . Pow ( normalized , contrast ) * 255.0 ;
473- contrastLut [ i ] = ( byte ) Math . Clamp ( ( int ) adjusted , 0 , 255 ) ;
409+ contrastLut [ i ] = ( byte ) Math . Min ( Math . Max ( ( int ) adjusted , 0 ) , 255 ) ;
474410 }
475411 else
476412 {
0 commit comments