Skip to content

Commit 7361e7e

Browse files
committed
Fixed scalar ADD blitter.
Added special cases for blitting 1X1, 2X2, 3X3, 4X4, 5X5 surfaces.
1 parent 5b1a301 commit 7361e7e

File tree

5 files changed

+467
-35
lines changed

5 files changed

+467
-35
lines changed

src/data_block.c

Lines changed: 106 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,78 @@ blit_fragments_blitcopy(FragmentationMap *frag_map, pgSurfaceObject *dest,
236236

237237
uint32_t *srcp32 = src_start;
238238
uint32_t *dstp32 = item->pixels;
239+
240+
if (item->width == 1 && item->rows == 1) {
241+
*dstp32 = *srcp32;
242+
continue;
243+
}
244+
else if (item->width == 2 && item->rows == 2) {
245+
dstp32[0] = srcp32[0];
246+
dstp32[1] = srcp32[1];
247+
248+
srcp32 += src_skip;
249+
dstp32 += dst_skip;
250+
251+
dstp32[0] = srcp32[0];
252+
dstp32[1] = srcp32[1];
253+
254+
continue;
255+
}
256+
else if (item->width == 3 && item->rows == 3) {
257+
UNROLL_2({
258+
dstp32[0] = srcp32[0];
259+
dstp32[1] = srcp32[1];
260+
dstp32[2] = srcp32[2];
261+
262+
srcp32 += src_skip;
263+
dstp32 += dst_skip;
264+
})
265+
266+
dstp32[0] = srcp32[0];
267+
dstp32[1] = srcp32[1];
268+
dstp32[2] = srcp32[2];
269+
270+
continue;
271+
}
272+
else if (item->width == 4 && item->rows == 4) {
273+
UNROLL_3({
274+
dstp32[0] = srcp32[0];
275+
dstp32[1] = srcp32[1];
276+
dstp32[2] = srcp32[2];
277+
dstp32[3] = srcp32[3];
278+
279+
srcp32 += src_skip;
280+
dstp32 += dst_skip;
281+
})
282+
283+
dstp32[0] = srcp32[0];
284+
dstp32[1] = srcp32[1];
285+
dstp32[2] = srcp32[2];
286+
dstp32[3] = srcp32[3];
287+
288+
continue;
289+
}
290+
else if (item->width == 5 && item->rows == 5) {
291+
UNROLL_4({
292+
dstp32[0] = srcp32[0];
293+
dstp32[1] = srcp32[1];
294+
dstp32[2] = srcp32[2];
295+
dstp32[3] = srcp32[3];
296+
dstp32[4] = srcp32[4];
297+
298+
srcp32 += src_skip;
299+
dstp32 += dst_skip;
300+
})
301+
302+
dstp32[0] = srcp32[0];
303+
dstp32[1] = srcp32[1];
304+
dstp32[2] = srcp32[2];
305+
dstp32[3] = srcp32[3];
306+
dstp32[4] = srcp32[4];
307+
308+
continue;
309+
}
310+
239311
int h = item->rows;
240312
const int copy_w = item->width * 4;
241313

@@ -279,43 +351,54 @@ blit_fragments_add_scalar(FragmentationMap *frag_map, PyObject **animation,
279351
{
280352
Fragment *fragments = frag_map->fragments;
281353
BlitDestination *destinations = frag_map->destinations;
354+
SDL_PixelFormat *fmt = ((pgSurfaceObject *)animation[0])->surf->format;
355+
356+
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
357+
const int Ridx = fmt->Rshift >> 3;
358+
const int Gidx = fmt->Gshift >> 3;
359+
const int Bidx = fmt->Bshift >> 3;
360+
#else
361+
const int Ridx = 3 - (fmt->Rshift >> 3);
362+
const int Gidx = 3 - (fmt->Gshift >> 3);
363+
const int Bidx = 3 - (fmt->Bshift >> 3);
364+
#endif
282365

283366
for (int i = 0; i < frag_map->used_f; i++) {
284367
Fragment *fragment = &fragments[i];
285368
SDL_Surface *src_surf =
286369
((pgSurfaceObject *)animation[fragment->animation_index])->surf;
287-
const int src_skip = src_surf->pitch / 4;
288-
uint32_t *const src_start = (uint32_t *)src_surf->pixels;
370+
const int src_skip = src_surf->pitch - src_surf->w * 4;
371+
uint8_t *const src_start = (uint8_t *)src_surf->pixels;
289372

290373
for (int j = 0; j < fragment->length; j++) {
291374
BlitDestination *item = &destinations[j];
292375

293-
uint32_t *srcp32 = src_start;
294-
uint32_t *dstp32 = item->pixels;
376+
uint8_t *srcp8 = src_start;
377+
uint8_t *dstp8 = (uint8_t *)item->pixels;
378+
295379
int h = item->rows;
296-
const int w = item->width;
380+
const int actual_dst_skip = 4 * (dst_skip - item->width);
297381

298382
while (h--) {
299-
for (int k = 0; k < w; k++) {
300-
uint32_t src = srcp32[k];
301-
uint32_t dst = dstp32[k];
302-
uint32_t r = (src >> src_surf->format->Rshift) & 0xff;
303-
uint32_t g = (src >> src_surf->format->Gshift) & 0xff;
304-
uint32_t b = (src >> src_surf->format->Bshift) & 0xff;
305-
306-
uint32_t r2 = (dst >> src_surf->format->Rshift) & 0xff;
307-
uint32_t g2 = (dst >> src_surf->format->Gshift) & 0xff;
308-
uint32_t b2 = (dst >> src_surf->format->Bshift) & 0xff;
309-
310-
r = r + r2 > 255 ? 255 : r + r2;
311-
g = g + g2 > 255 ? 255 : g + g2;
312-
b = b + b2 > 255 ? 255 : b + b2;
313-
314-
dstp32[k] = (b << 16) | (g << 8) | r | 0xff000000;
383+
for (int k = 0; k < item->width; k++) {
384+
uint8_t sr = srcp8[Ridx];
385+
uint8_t sg = srcp8[Gidx];
386+
uint8_t sb = srcp8[Bidx];
387+
388+
uint8_t dr = dstp8[Ridx];
389+
uint8_t dg = dstp8[Gidx];
390+
uint8_t db = dstp8[Bidx];
391+
392+
dstp8[Ridx] = sr + dr > 255 ? 255 : sr + dr;
393+
dstp8[Gidx] = sg + dg > 255 ? 255 : sg + dg;
394+
dstp8[Bidx] = sb + db > 255 ? 255 : sb + db;
395+
396+
srcp8 += 4;
397+
dstp8 += 4;
315398
}
316399

317-
srcp32 += src_skip;
318-
dstp32 += dst_skip;
400+
srcp8 += src_skip;
401+
dstp8 += actual_dst_skip;
319402
}
320403
}
321404

src/include/data_block.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,21 @@
44
#include "MT19937.h"
55
#include "emitter.h"
66

7+
#define UNROLL_2(x) \
8+
x; \
9+
x;
10+
11+
#define UNROLL_3(x) \
12+
x; \
13+
x; \
14+
x;
15+
16+
#define UNROLL_4(x) \
17+
x; \
18+
x; \
19+
x; \
20+
x;
21+
722
typedef struct {
823
int animation_index;
924
int length;

src/updaters_simd_avx2.c

Lines changed: 165 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ update_indices_avx2(DataBlock *block)
366366
__m256i idx = _mm256_cvttps_epi32(_mm256_mul_ps(
367367
_mm256_sub_ps(one_v, _mm256_div_ps(t, max_t)), num_frames_v));
368368

369-
_mm256_storeu_si256((__m256i *)indices, idx);
369+
_mm256_storeu_si256(indices, idx);
370370

371371
lifetimes += 8;
372372
max_lifetimes += 8;
@@ -394,6 +394,148 @@ update_indices_avx2(DataBlock *block)
394394

395395
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
396396
!defined(SDL_DISABLE_IMMINTRIN_H)
397+
void inline blit_add_avx2_1x1(uint32_t *srcp32, uint32_t *dstp32)
398+
{
399+
__m128i src128 = _mm_cvtsi32_si128(*srcp32);
400+
__m128i dst128 = _mm_cvtsi32_si128(*dstp32);
401+
402+
dst128 = _mm_adds_epu8(src128, dst128);
403+
404+
*dstp32 = _mm_cvtsi128_si32(dst128);
405+
}
406+
407+
void inline blit_add_avx2_2x2(uint32_t *srcp32, uint32_t *dstp32, int src_skip,
408+
int dst_skip)
409+
{
410+
__m128i src128 = _mm_loadl_epi64((__m128i *)srcp32);
411+
__m128i dst128 = _mm_loadl_epi64((__m128i *)dstp32);
412+
413+
dst128 = _mm_adds_epu8(src128, dst128);
414+
415+
_mm_storel_epi64((__m128i *)dstp32, dst128);
416+
417+
srcp32 += 2 + src_skip;
418+
dstp32 += 2 + dst_skip;
419+
420+
src128 = _mm_loadl_epi64((__m128i *)srcp32);
421+
dst128 = _mm_loadl_epi64((__m128i *)dstp32);
422+
423+
dst128 = _mm_adds_epu8(src128, dst128);
424+
425+
_mm_storel_epi64((__m128i *)dstp32, dst128);
426+
}
427+
428+
void inline blit_add_avx2_3x3(uint32_t *srcp32, uint32_t *dstp32, int src_skip,
429+
int dst_skip)
430+
{
431+
__m128i src128 = _mm_loadl_epi64((__m128i *)srcp32);
432+
__m128i dst128 = _mm_loadl_epi64((__m128i *)dstp32);
433+
434+
dst128 = _mm_adds_epu8(src128, dst128);
435+
436+
_mm_storel_epi64((__m128i *)dstp32, dst128);
437+
438+
srcp32 += 2;
439+
dstp32 += 2;
440+
441+
src128 = _mm_cvtsi32_si128(*srcp32);
442+
dst128 = _mm_cvtsi32_si128(*dstp32);
443+
444+
dst128 = _mm_adds_epu8(src128, dst128);
445+
446+
*dstp32 = _mm_cvtsi128_si32(dst128);
447+
448+
srcp32 += 1 + src_skip;
449+
dstp32 += 1 + dst_skip;
450+
451+
src128 = _mm_loadl_epi64((__m128i *)srcp32);
452+
dst128 = _mm_loadl_epi64((__m128i *)dstp32);
453+
454+
dst128 = _mm_adds_epu8(src128, dst128);
455+
456+
_mm_storel_epi64((__m128i *)dstp32, dst128);
457+
458+
srcp32 += 2;
459+
dstp32 += 2;
460+
461+
src128 = _mm_cvtsi32_si128(*srcp32);
462+
dst128 = _mm_cvtsi32_si128(*dstp32);
463+
464+
dst128 = _mm_adds_epu8(src128, dst128);
465+
466+
*dstp32 = _mm_cvtsi128_si32(dst128);
467+
}
468+
469+
void inline blit_add_avx2_4x4(uint32_t *srcp32, uint32_t *dstp32, int src_skip,
470+
int dst_skip)
471+
{
472+
__m128i src128;
473+
__m128i dst128;
474+
UNROLL_3({
475+
src128 = _mm_loadu_si128((__m128i *)srcp32);
476+
dst128 = _mm_loadu_si128((__m128i *)dstp32);
477+
478+
dst128 = _mm_adds_epu8(src128, dst128);
479+
480+
_mm_storeu_si128((__m128i *)dstp32, dst128);
481+
482+
srcp32 += 4 + src_skip;
483+
dstp32 += 4 + dst_skip;
484+
})
485+
486+
src128 = _mm_loadu_si128((__m128i *)srcp32);
487+
dst128 = _mm_loadu_si128((__m128i *)dstp32);
488+
489+
dst128 = _mm_adds_epu8(src128, dst128);
490+
491+
_mm_storeu_si128((__m128i *)dstp32, dst128);
492+
}
493+
494+
void inline blit_add_avx2_5x5(uint32_t *srcp32, uint32_t *dstp32, int src_skip,
495+
int dst_skip)
496+
{
497+
__m128i src128;
498+
__m128i dst128;
499+
UNROLL_4({
500+
src128 = _mm_loadu_si128((__m128i *)srcp32);
501+
dst128 = _mm_loadu_si128((__m128i *)dstp32);
502+
503+
dst128 = _mm_adds_epu8(src128, dst128);
504+
505+
_mm_storeu_si128((__m128i *)dstp32, dst128);
506+
507+
srcp32 += 4;
508+
dstp32 += 4;
509+
510+
src128 = _mm_cvtsi32_si128(*srcp32);
511+
dst128 = _mm_cvtsi32_si128(*dstp32);
512+
513+
dst128 = _mm_adds_epu8(src128, dst128);
514+
515+
*dstp32 = _mm_cvtsi128_si32(dst128);
516+
517+
srcp32 += 1 + src_skip;
518+
dstp32 += 1 + dst_skip;
519+
})
520+
521+
src128 = _mm_loadu_si128((__m128i *)srcp32);
522+
dst128 = _mm_loadu_si128((__m128i *)dstp32);
523+
524+
dst128 = _mm_adds_epu8(src128, dst128);
525+
526+
_mm_storeu_si128((__m128i *)dstp32, dst128);
527+
528+
srcp32 += 4;
529+
dstp32 += 4;
530+
531+
src128 = _mm_cvtsi32_si128(*srcp32);
532+
dst128 = _mm_cvtsi32_si128(*dstp32);
533+
534+
dst128 = _mm_adds_epu8(src128, dst128);
535+
536+
*dstp32 = _mm_cvtsi128_si32(dst128);
537+
}
538+
397539
void
398540
blit_fragments_add_avx2(FragmentationMap *frag_map, PyObject **animation,
399541
int dst_skip)
@@ -411,9 +553,30 @@ blit_fragments_add_avx2(FragmentationMap *frag_map, PyObject **animation,
411553

412554
for (int j = 0; j < fragment->length; j++) {
413555
BlitDestination *item = &destinations[j];
414-
415556
uint32_t *srcp32 = src_start;
416557
uint32_t *dstp32 = item->pixels;
558+
559+
if (item->width == 1 && item->rows == 1) {
560+
blit_add_avx2_1x1(srcp32, dstp32);
561+
continue;
562+
}
563+
else if (item->width == 2 && item->rows == 2) {
564+
blit_add_avx2_2x2(srcp32, dstp32, src_skip, actual_dst_skip);
565+
continue;
566+
}
567+
else if (item->width == 3 && item->rows == 3) {
568+
blit_add_avx2_3x3(srcp32, dstp32, src_skip, actual_dst_skip);
569+
continue;
570+
}
571+
else if (item->width == 4 && item->rows == 4) {
572+
blit_add_avx2_4x4(srcp32, dstp32, src_skip, actual_dst_skip);
573+
continue;
574+
}
575+
else if (item->width == 5 && item->rows == 5) {
576+
blit_add_avx2_5x5(srcp32, dstp32, src_skip, actual_dst_skip);
577+
continue;
578+
}
579+
417580
int h = item->rows;
418581

419582
const int n_iters_8 = item->width / 8;

0 commit comments

Comments
 (0)