Skip to content

Commit a056e6f

Browse files
committed
Merge branch 'BlendSrcOverRGBA_SSE2'
2 parents ec66793 + 859c48c commit a056e6f

File tree

2 files changed

+156
-2
lines changed

2 files changed

+156
-2
lines changed

apps/gdalalg_raster_blend.cpp

Lines changed: 137 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
#include <algorithm>
2020
#include <array>
2121
#include <limits>
22+
#include <utility>
2223

2324
#if defined(__x86_64) || defined(_M_X64)
2425
#define HAVE_SSE2
26+
#include <immintrin.h>
2527
#endif
2628
#ifdef HAVE_SSE2
2729
#include "gdalsse_priv.h"
@@ -635,6 +637,126 @@ constexpr auto gTabInvDstA = []()
635637
return arr;
636638
}();
637639

640+
/************************************************************************/
641+
/* BlendSrcOverRGBA_SSE2() */
642+
/************************************************************************/
643+
644+
#ifdef HAVE_SSE2
645+
static int BlendSrcOverRGBA_SSE2(const GByte *CPL_RESTRICT pabyR,
646+
const GByte *CPL_RESTRICT pabyG,
647+
const GByte *CPL_RESTRICT pabyB,
648+
const GByte *CPL_RESTRICT pabyA,
649+
const GByte *CPL_RESTRICT pabyOverlayR,
650+
const GByte *CPL_RESTRICT pabyOverlayG,
651+
const GByte *CPL_RESTRICT pabyOverlayB,
652+
const GByte *CPL_RESTRICT pabyOverlayA,
653+
GByte *CPL_RESTRICT pabyDst,
654+
GPtrDiff_t nBandSpace, int N, int nOpacity)
655+
{
656+
// See scalar code after call to BlendSrcOverRGBA_SSE2() below for the
657+
// non-obfuscated formulas...
658+
659+
const auto load_and_unpack = [](const void *p)
660+
{
661+
const auto zero = _mm_setzero_si128();
662+
auto overlayA = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
663+
return std::make_pair(_mm_unpacklo_epi8(overlayA, zero),
664+
_mm_unpackhi_epi8(overlayA, zero));
665+
};
666+
667+
const auto pack_and_store = [](void *p, __m128i lo, __m128i hi) {
668+
_mm_storeu_si128(reinterpret_cast<__m128i *>(p),
669+
_mm_packus_epi16(lo, hi));
670+
};
671+
672+
const auto mul16bit_8bit_result = [](__m128i a, __m128i b)
673+
{
674+
const auto r255 = _mm_set1_epi16(255);
675+
return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(a, b), r255), 8);
676+
};
677+
678+
const auto opacity = _mm_set1_epi16(static_cast<int16_t>(nOpacity));
679+
const auto r255 = _mm_set1_epi16(255);
680+
const int16_t *tabInvDstASigned =
681+
reinterpret_cast<const int16_t *>(gTabInvDstA.data());
682+
constexpr int REG_WIDTH = static_cast<int>(sizeof(opacity));
683+
684+
int i = 0;
685+
for (; i <= N - REG_WIDTH; i += REG_WIDTH)
686+
{
687+
auto [overlayA_lo, overlayA_hi] = load_and_unpack(pabyOverlayA + i);
688+
auto [srcA_lo, srcA_hi] = load_and_unpack(pabyA + i);
689+
overlayA_lo = mul16bit_8bit_result(overlayA_lo, opacity);
690+
overlayA_hi = mul16bit_8bit_result(overlayA_hi, opacity);
691+
auto srcAMul255MinusOverlayA_lo =
692+
mul16bit_8bit_result(srcA_lo, _mm_sub_epi16(r255, overlayA_lo));
693+
auto srcAMul255MinusOverlayA_hi =
694+
mul16bit_8bit_result(srcA_hi, _mm_sub_epi16(r255, overlayA_hi));
695+
auto dstA_lo = _mm_add_epi16(overlayA_lo, srcAMul255MinusOverlayA_lo);
696+
auto dstA_hi = _mm_add_epi16(overlayA_hi, srcAMul255MinusOverlayA_hi);
697+
698+
// The & 0xff should not be necessary. This is mostly a safety
699+
// belt if the above math yields a result outside [0, 255]...
700+
dstA_lo = _mm_and_si128(dstA_lo, r255);
701+
dstA_hi = _mm_and_si128(dstA_hi, r255);
702+
703+
// This would be the equivalent of a "_mm_i16gather_epi16" operation
704+
// which does not exist...
705+
// invDstA_{i} = [tabInvDstASigned[dstA_{i}] for i in range(8)]
706+
auto invDstA_lo = _mm_undefined_si128();
707+
auto invDstA_hi = _mm_undefined_si128();
708+
#define SET_INVDSTA(k) \
709+
do \
710+
{ \
711+
const int idxLo = _mm_extract_epi16(dstA_lo, k); \
712+
const int idxHi = _mm_extract_epi16(dstA_hi, k); \
713+
invDstA_lo = _mm_insert_epi16(invDstA_lo, tabInvDstASigned[idxLo], k); \
714+
invDstA_hi = _mm_insert_epi16(invDstA_hi, tabInvDstASigned[idxHi], k); \
715+
} while (0)
716+
717+
SET_INVDSTA(0);
718+
SET_INVDSTA(1);
719+
SET_INVDSTA(2);
720+
SET_INVDSTA(3);
721+
SET_INVDSTA(4);
722+
SET_INVDSTA(5);
723+
SET_INVDSTA(6);
724+
SET_INVDSTA(7);
725+
726+
pack_and_store(pabyDst + i + 3 * nBandSpace, dstA_lo, dstA_hi);
727+
728+
#define PROCESS_COMPONENT(pabySrc, pabyOverlay, iBand) \
729+
do \
730+
{ \
731+
auto [src_lo, src_hi] = load_and_unpack((pabySrc) + i); \
732+
auto [overlay_lo, overlay_hi] = load_and_unpack((pabyOverlay) + i); \
733+
auto dst_lo = _mm_srli_epi16( \
734+
_mm_add_epi16( \
735+
_mm_add_epi16( \
736+
_mm_mullo_epi16(overlay_lo, overlayA_lo), \
737+
_mm_mullo_epi16(src_lo, srcAMul255MinusOverlayA_lo)), \
738+
r255), \
739+
8); \
740+
auto dst_hi = _mm_srli_epi16( \
741+
_mm_add_epi16( \
742+
_mm_add_epi16( \
743+
_mm_mullo_epi16(overlay_hi, overlayA_hi), \
744+
_mm_mullo_epi16(src_hi, srcAMul255MinusOverlayA_hi)), \
745+
r255), \
746+
8); \
747+
dst_lo = mul16bit_8bit_result(dst_lo, invDstA_lo); \
748+
dst_hi = mul16bit_8bit_result(dst_hi, invDstA_hi); \
749+
pack_and_store(pabyDst + i + (iBand)*nBandSpace, dst_lo, dst_hi); \
750+
} while (0)
751+
752+
PROCESS_COMPONENT(pabyR, pabyOverlayR, 0);
753+
PROCESS_COMPONENT(pabyG, pabyOverlayG, 1);
754+
PROCESS_COMPONENT(pabyB, pabyOverlayB, 2);
755+
}
756+
return i;
757+
}
758+
#endif
759+
638760
/************************************************************************/
639761
/* BlendDataset::IRasterIO() */
640762
/************************************************************************/
@@ -740,8 +862,21 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
740862
for (int j = 0; j < nBufYSize; ++j)
741863
{
742864
auto nDstOffset = j * nLineSpace;
743-
for (int i = 0; i < nBufXSize;
744-
++i, ++nSrcIdx, nDstOffset += nPixelSpace)
865+
int i = 0;
866+
#ifdef HAVE_SSE2
867+
if (nPixelSpace == 1)
868+
{
869+
i = BlendSrcOverRGBA_SSE2(
870+
pabyR + nSrcIdx, pabyG + nSrcIdx, pabyB + nSrcIdx,
871+
pabyA + nSrcIdx, pabyOverlayR + nSrcIdx,
872+
pabyOverlayG + nSrcIdx, pabyOverlayB + nSrcIdx,
873+
pabyOverlayA + nSrcIdx, pabyDst + nDstOffset, nBandSpace,
874+
nBufXSize, nOpacity);
875+
nSrcIdx += i;
876+
nDstOffset += i;
877+
}
878+
#endif
879+
for (; i < nBufXSize; ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
745880
{
746881
const int nOverlayR = pabyOverlayR[nSrcIdx];
747882
const int nOverlayG = pabyOverlayG[nSrcIdx];

autotest/utilities/test_gdalalg_raster_blend.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,3 +729,22 @@ def test_gdalalg_raster_blend_src_over():
729729
out_ds = alg.Output()
730730
assert out_ds.RasterCount == 1
731731
assert struct.unpack("B" * 1, out_ds.ReadRaster(0, 0, 1, 1)) == (189,)
732+
733+
734+
def test_gdalalg_raster_blend_src_over_stefan_full_rgba():
735+
736+
with gdal.Run(
737+
"raster",
738+
"blend",
739+
input="../gcore/data/stefan_full_rgba.tif",
740+
overlay="../gcore/data/stefan_full_rgba.tif",
741+
output_format="stream",
742+
opacity=75,
743+
) as alg:
744+
out_ds = alg.Output()
745+
assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(4)] == [
746+
13392,
747+
59283,
748+
36167,
749+
12963,
750+
]

0 commit comments

Comments
 (0)