@@ -412,23 +412,142 @@ private static Buffer2DRegion<byte> TrimTransparentPixels(Buffer2D<byte> buffer,
412
412
int bottom = int . MaxValue ;
413
413
int left = int . MaxValue ;
414
414
int right = int . MinValue ;
415
-
416
- // Run through th buffer in a single pass. Use variables to track the min/max values.
417
415
int minY = - 1 ;
418
416
bool isTransparentRow = true ;
417
+
418
+ // Run through the buffer in a single pass. Use variables to track the min/max values.
419
419
for ( int y = 0 ; y < buffer . Height ; y ++ )
420
420
{
421
421
isTransparentRow = true ;
422
422
Span < byte > rowSpan = buffer . DangerousGetRowSpan ( y ) ;
423
+ ref byte rowPtr = ref MemoryMarshal . GetReference ( rowSpan ) ;
424
+ nint rowLength = ( nint ) ( uint ) rowSpan . Length ;
425
+ nint x = 0 ;
426
+
427
+ #if NET7_0_OR_GREATER
428
+ if ( Vector128 . IsHardwareAccelerated && rowLength >= Vector128 < byte > . Count )
429
+ {
430
+ Vector256 < byte > trimmableVec256 = Vector256 . Create ( trimmableIndex ) ;
431
+
432
+ if ( Vector256 . IsHardwareAccelerated && rowLength >= Vector256 < byte > . Count )
433
+ {
434
+ do
435
+ {
436
+ Vector256 < byte > vec = Vector256 . LoadUnsafe ( ref rowPtr , ( nuint ) x ) ;
437
+ Vector256 < byte > notEquals = ~ Vector256 . Equals ( vec , trimmableVec256 ) ;
438
+ uint mask = notEquals . ExtractMostSignificantBits ( ) ;
439
+
440
+ if ( mask != 0 )
441
+ {
442
+ isTransparentRow = false ;
443
+ nint start = x + ( nint ) uint . TrailingZeroCount ( mask ) ;
444
+ nint end = ( nint ) uint . LeadingZeroCount ( mask ) ;
445
+
446
+ // end currently counts lanes back from the last lane of the vector; convert it to an index from the start of the row.
447
+ end = x + Vector256 < byte > . Count - 1 - end ;
448
+
449
+ left = Math . Min ( left , ( int ) start ) ;
450
+ right = Math . Max ( right , ( int ) end ) ;
451
+ }
452
+
453
+ x += Vector256 < byte > . Count ;
454
+ }
455
+ while ( x <= rowLength - Vector256 < byte > . Count ) ;
456
+ }
457
+
458
+ Vector128 < byte > trimmableVec = Vector256 . IsHardwareAccelerated
459
+ ? trimmableVec256 . GetLower ( )
460
+ : Vector128 . Create ( trimmableIndex ) ;
461
+
462
+ while ( x <= rowLength - Vector128 < byte > . Count )
463
+ {
464
+ Vector128 < byte > vec = Vector128 . LoadUnsafe ( ref rowPtr , ( nuint ) x ) ;
465
+ Vector128 < byte > notEquals = ~ Vector128 . Equals ( vec , trimmableVec ) ;
466
+ uint mask = notEquals . ExtractMostSignificantBits ( ) ;
467
+
468
+ if ( mask != 0 )
469
+ {
470
+ isTransparentRow = false ;
471
+ nint start = x + ( nint ) uint . TrailingZeroCount ( mask ) ;
472
+ nint end = ( nint ) uint . LeadingZeroCount ( mask ) - Vector128 < byte > . Count ;
473
+
474
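+ // end currently counts lanes back from the last lane of the vector (the mask's unused upper bits were subtracted above); convert it to an index from the start of the row.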
475
+ end = x + Vector128 < byte > . Count - 1 - end ;
476
+
477
+ left = Math . Min ( left , ( int ) start ) ;
478
+ right = Math . Max ( right , ( int ) end ) ;
479
+ }
480
+
481
+ x += Vector128 < byte > . Count ;
482
+ }
483
+ }
484
+ #else
485
+ if ( Sse41 . IsSupported && rowLength >= Vector128 < byte > . Count )
486
+ {
487
+ Vector256 < byte > trimmableVec256 = Vector256 . Create ( trimmableIndex ) ;
488
+
489
+ if ( Avx2 . IsSupported && rowLength >= Vector256 < byte > . Count )
490
+ {
491
+ do
492
+ {
493
+ Vector256 < byte > vec = Unsafe . ReadUnaligned < Vector256 < byte > > ( ref Unsafe . Add ( ref rowPtr , x ) ) ;
494
+ Vector256 < byte > notEquals = Avx2 . CompareEqual ( vec , trimmableVec256 ) ;
495
+ notEquals = Avx2 . Xor ( notEquals , Vector256 < byte > . AllBitsSet ) ;
496
+ int mask = Avx2 . MoveMask ( notEquals ) ;
497
+
498
+ if ( mask != 0 )
499
+ {
500
+ isTransparentRow = false ;
501
+ nint start = x + ( nint ) ( uint ) BitOperations . TrailingZeroCount ( mask ) ;
502
+ nint end = ( nint ) ( uint ) BitOperations . LeadingZeroCount ( ( uint ) mask ) ;
503
+
504
+ // end currently counts lanes back from the last lane of the vector; convert it to an index from the start of the row.
505
+ end = x + Vector256 < byte > . Count - 1 - end ;
506
+
507
+ left = Math . Min ( left , ( int ) start ) ;
508
+ right = Math . Max ( right , ( int ) end ) ;
509
+ }
510
+
511
+ x += Vector256 < byte > . Count ;
512
+ }
513
+ while ( x <= rowLength - Vector256 < byte > . Count ) ;
514
+ }
515
+
516
+ Vector128 < byte > trimmableVec = Sse41 . IsSupported
517
+ ? trimmableVec256 . GetLower ( )
518
+ : Vector128 . Create ( trimmableIndex ) ;
519
+
520
+ while ( x <= rowLength - Vector128 < byte > . Count )
521
+ {
522
+ Vector128 < byte > vec = Unsafe . ReadUnaligned < Vector128 < byte > > ( ref Unsafe . Add ( ref rowPtr , x ) ) ;
523
+ Vector128 < byte > notEquals = Sse2 . CompareEqual ( vec , trimmableVec ) ;
524
+ notEquals = Sse2 . Xor ( notEquals , Vector128 < byte > . AllBitsSet ) ;
525
+ int mask = Sse2 . MoveMask ( notEquals ) ;
526
+
527
+ if ( mask != 0 )
528
+ {
529
+ isTransparentRow = false ;
530
+ nint start = x + ( nint ) ( uint ) BitOperations . TrailingZeroCount ( mask ) ;
531
+ nint end = ( nint ) ( uint ) BitOperations . LeadingZeroCount ( ( uint ) mask ) - Vector128 < byte > . Count ;
423
532
424
- // TODO: It may be possible to optimize this inner loop using SIMD.
425
- for ( int x = 0 ; x < rowSpan . Length ; x ++ )
533
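+ // end currently counts lanes back from the last lane of the vector (the mask's unused upper bits were subtracted above); convert it to an index from the start of the row.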
534
+ end = x + Vector128 < byte > . Count - 1 - end ;
535
+
536
+ left = Math . Min ( left , ( int ) start ) ;
537
+ right = Math . Max ( right , ( int ) end ) ;
538
+ }
539
+
540
+ x += Vector128 < byte > . Count ;
541
+ }
542
+ }
543
+ #endif
544
+ for ( ; x < rowLength ; ++ x )
426
545
{
427
- if ( rowSpan [ x ] != trimmableIndex )
546
+ if ( Unsafe . Add ( ref rowPtr , x ) != trimmableIndex )
428
547
{
429
548
isTransparentRow = false ;
430
- left = Math . Min ( left , x ) ;
431
- right = Math . Max ( right , x ) ;
549
+ left = Math . Min ( left , ( int ) x ) ;
550
+ right = Math . Max ( right , ( int ) x ) ;
432
551
}
433
552
}
434
553
0 commit comments