@@ -366,7 +366,7 @@ update_indices_avx2(DataBlock *block)
         __m256i idx = _mm256_cvttps_epi32(_mm256_mul_ps(
             _mm256_sub_ps(one_v, _mm256_div_ps(t, max_t)), num_frames_v));
 
-        _mm256_storeu_si256((__m256i *)indices, idx);
+        _mm256_storeu_si256(indices, idx);
 
         lifetimes += 8;
         max_lifetimes += 8;
@@ -394,6 +394,148 @@ update_indices_avx2(DataBlock *block)
 
 #if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
     !defined(SDL_DISABLE_IMMINTRIN_H)
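+/* Fixed-size blitters for the most common small fragment footprints
+ * (1x1 up to 5x5). Each performs a per-channel saturating add
+ * (_mm_adds_epu8) of the source pixels onto the destination, avoiding
+ * the general row/column loop in blit_fragments_add_avx2 below. */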
+static inline void blit_add_avx2_1x1(uint32_t *srcp32, uint32_t *dstp32)
+{
+    __m128i src128 = _mm_cvtsi32_si128(*srcp32);
+    __m128i dst128 = _mm_cvtsi32_si128(*dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    *dstp32 = _mm_cvtsi128_si32(dst128);
+}
+
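+/* 2x2: two 8-byte (2-pixel) rows. src_skip/dst_skip are taken to be the
+ * pixel gap between the end of one row and the start of the next, matching
+ * their use in the general loop below. */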
+static inline void blit_add_avx2_2x2(uint32_t *srcp32, uint32_t *dstp32,
+                                     int src_skip, int dst_skip)
+{
+    __m128i src128 = _mm_loadl_epi64((__m128i *)srcp32);
+    __m128i dst128 = _mm_loadl_epi64((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storel_epi64((__m128i *)dstp32, dst128);
+
+    srcp32 += 2 + src_skip;
+    dstp32 += 2 + dst_skip;
+
+    src128 = _mm_loadl_epi64((__m128i *)srcp32);
+    dst128 = _mm_loadl_epi64((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storel_epi64((__m128i *)dstp32, dst128);
+}
+
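+/* 3x3: each of the three rows is handled as one 2-pixel vector add plus a
+ * 1-pixel tail that goes through a scalar register. */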
+static inline void blit_add_avx2_3x3(uint32_t *srcp32, uint32_t *dstp32,
+                                     int src_skip, int dst_skip)
+{
+    /* row 0 */
+    __m128i src128 = _mm_loadl_epi64((__m128i *)srcp32);
+    __m128i dst128 = _mm_loadl_epi64((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storel_epi64((__m128i *)dstp32, dst128);
+
+    srcp32 += 2;
+    dstp32 += 2;
+
+    src128 = _mm_cvtsi32_si128(*srcp32);
+    dst128 = _mm_cvtsi32_si128(*dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    *dstp32 = _mm_cvtsi128_si32(dst128);
+
+    /* row 1 */
+    srcp32 += 1 + src_skip;
+    dstp32 += 1 + dst_skip;
+
+    src128 = _mm_loadl_epi64((__m128i *)srcp32);
+    dst128 = _mm_loadl_epi64((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storel_epi64((__m128i *)dstp32, dst128);
+
+    srcp32 += 2;
+    dstp32 += 2;
+
+    src128 = _mm_cvtsi32_si128(*srcp32);
+    dst128 = _mm_cvtsi32_si128(*dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    *dstp32 = _mm_cvtsi128_si32(dst128);
+
+    /* row 2 */
+    srcp32 += 1 + src_skip;
+    dstp32 += 1 + dst_skip;
+
+    src128 = _mm_loadl_epi64((__m128i *)srcp32);
+    dst128 = _mm_loadl_epi64((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storel_epi64((__m128i *)dstp32, dst128);
+
+    srcp32 += 2;
+    dstp32 += 2;
+
+    src128 = _mm_cvtsi32_si128(*srcp32);
+    dst128 = _mm_cvtsi32_si128(*dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    *dstp32 = _mm_cvtsi128_si32(dst128);
+}
+
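+/* 4x4: one unaligned 16-byte load/add/store per row. UNROLL_3/UNROLL_4 are
+ * assumed to be statement-repetition macros defined elsewhere in this
+ * codebase. */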
+static inline void blit_add_avx2_4x4(uint32_t *srcp32, uint32_t *dstp32,
+                                     int src_skip, int dst_skip)
+{
+    __m128i src128;
+    __m128i dst128;
+    UNROLL_3({
+        src128 = _mm_loadu_si128((__m128i *)srcp32);
+        dst128 = _mm_loadu_si128((__m128i *)dstp32);
+
+        dst128 = _mm_adds_epu8(src128, dst128);
+
+        _mm_storeu_si128((__m128i *)dstp32, dst128);
+
+        srcp32 += 4 + src_skip;
+        dstp32 += 4 + dst_skip;
+    })
+
+    /* final row: no trailing pointer advance needed */
+    src128 = _mm_loadu_si128((__m128i *)srcp32);
+    dst128 = _mm_loadu_si128((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storeu_si128((__m128i *)dstp32, dst128);
+}
+
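+/* 5x5: per row, a 4-pixel vector add followed by a 1-pixel scalar tail. */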
+static inline void blit_add_avx2_5x5(uint32_t *srcp32, uint32_t *dstp32,
+                                     int src_skip, int dst_skip)
+{
+    __m128i src128;
+    __m128i dst128;
+    UNROLL_4({
+        src128 = _mm_loadu_si128((__m128i *)srcp32);
+        dst128 = _mm_loadu_si128((__m128i *)dstp32);
+
+        dst128 = _mm_adds_epu8(src128, dst128);
+
+        _mm_storeu_si128((__m128i *)dstp32, dst128);
+
+        srcp32 += 4;
+        dstp32 += 4;
+
+        src128 = _mm_cvtsi32_si128(*srcp32);
+        dst128 = _mm_cvtsi32_si128(*dstp32);
+
+        dst128 = _mm_adds_epu8(src128, dst128);
+
+        *dstp32 = _mm_cvtsi128_si32(dst128);
+
+        srcp32 += 1 + src_skip;
+        dstp32 += 1 + dst_skip;
+    })
+
+    /* final row */
+    src128 = _mm_loadu_si128((__m128i *)srcp32);
+    dst128 = _mm_loadu_si128((__m128i *)dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    _mm_storeu_si128((__m128i *)dstp32, dst128);
+
+    srcp32 += 4;
+    dstp32 += 4;
+
+    src128 = _mm_cvtsi32_si128(*srcp32);
+    dst128 = _mm_cvtsi32_si128(*dstp32);
+
+    dst128 = _mm_adds_epu8(src128, dst128);
+
+    *dstp32 = _mm_cvtsi128_si32(dst128);
+}
+
 void
 blit_fragments_add_avx2(FragmentationMap *frag_map, PyObject **animation,
                         int dst_skip)
@@ -411,9 +553,30 @@ blit_fragments_add_avx2(FragmentationMap *frag_map, PyObject **animation,
 
     for (int j = 0; j < fragment->length; j++) {
         BlitDestination *item = &destinations[j];
-
         uint32_t *srcp32 = src_start;
         uint32_t *dstp32 = item->pixels;
+
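+        /* Dispatch fragments up to 5x5 to the fixed-size blitters above;
+         * larger fragments fall through to the general per-row loop. */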
+        if (item->width == 1 && item->rows == 1) {
+            blit_add_avx2_1x1(srcp32, dstp32);
+            continue;
+        }
+        else if (item->width == 2 && item->rows == 2) {
+            blit_add_avx2_2x2(srcp32, dstp32, src_skip, actual_dst_skip);
+            continue;
+        }
+        else if (item->width == 3 && item->rows == 3) {
+            blit_add_avx2_3x3(srcp32, dstp32, src_skip, actual_dst_skip);
+            continue;
+        }
+        else if (item->width == 4 && item->rows == 4) {
+            blit_add_avx2_4x4(srcp32, dstp32, src_skip, actual_dst_skip);
+            continue;
+        }
+        else if (item->width == 5 && item->rows == 5) {
+            blit_add_avx2_5x5(srcp32, dstp32, src_skip, actual_dst_skip);
+            continue;
+        }
+
         int h = item->rows;
 
         const int n_iters_8 = item->width / 8;
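
For reference, every blitter in this patch applies the same per-pixel operation: a per-channel saturating add, the scalar analogue of _mm_adds_epu8. A minimal scalar sketch under that assumption (blit_add_scalar is a hypothetical name, not part of this commit; assumes <stdint.h> and the same skip semantics as above):

/* Scalar reference for the saturating additive blit (illustrative only). */
static void
blit_add_scalar(const uint32_t *srcp32, uint32_t *dstp32, int width, int rows,
                int src_skip, int dst_skip)
{
    for (int y = 0; y < rows; y++) {
        for (int x = 0; x < width; x++) {
            uint32_t s = *srcp32++;
            uint32_t d = *dstp32;
            uint32_t out = 0;
            /* Add each 8-bit channel, clamping the sum at 255. */
            for (int shift = 0; shift < 32; shift += 8) {
                uint32_t sum = ((s >> shift) & 0xFFu) + ((d >> shift) & 0xFFu);
                out |= (sum > 255u ? 255u : sum) << shift;
            }
            *dstp32++ = out;
        }
        srcp32 += src_skip; /* skip = pixels between rows, as above */
        dstp32 += dst_skip;
    }
}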