Skip to content

Commit e2f0db2

Browse files
committed
gdal raster blend: improve src-over performance for RGB and RGBA cases
1 parent ef9b05e commit e2f0db2

File tree

2 files changed

+223
-31
lines changed

2 files changed

+223
-31
lines changed

apps/gdalalg_raster_blend.cpp

Lines changed: 158 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "gdal_priv.h"
1818

1919
#include <algorithm>
20+
#include <array>
2021
#include <limits>
2122

2223
#if defined(__x86_64) || defined(_M_X64)
@@ -508,7 +509,9 @@ BlendDataset::BlendDataset(GDALDataset &oColorDS, GDALDataset &oOverlayDS,
508509
SetDescription(CPLSPrintf("Blend %s width %s", m_oColorDS.GetDescription(),
509510
m_oOverlayDS.GetDescription()));
510511
if (nBands > 1)
512+
{
511513
SetMetadataItem("INTERLEAVE", "PIXEL", "IMAGE_STRUCTURE");
514+
}
512515

513516
if (bCanCreateOvr)
514517
{
@@ -612,6 +615,26 @@ bool BlendDataset::AcquireSourcePixels(int nXOff, int nYOff, int nXSize,
612615
return bOK;
613616
}
614617

618+
/************************************************************************/
619+
/* gTabInvDstA */
620+
/************************************************************************/
621+
622+
// Number of fractional bits used by the fixed-point reciprocals stored in
// gTabInvDstA.
constexpr int SHIFT_DIV_DSTA = 8;

// Compile-time lookup table of fixed-point reciprocals used to replace an
// integer division by a destination alpha value with a multiply and a shift:
//   gTabInvDstA[k] = ((255 << SHIFT_DIV_DSTA) + k / 2) / k   for k in [1,255]
//   gTabInvDstA[0] = 0   (no reciprocal exists for a zero alpha)
// The largest entry is gTabInvDstA[1] = 65280, which fits in uint16_t.
//
// NOTE(review): entries are rounded to nearest, so k * gTabInvDstA[k] can be
// slightly larger than (255 << SHIFT_DIV_DSTA), e.g. 7 * 9326 = 65282.
// Callers that add a rounding offset before shifting right should confirm
// that the result cannot exceed 255 before narrowing it to a byte.
constexpr auto gTabInvDstA = []()
{
    // Value-initialization leaves every entry, including index 0, at zero.
    std::array<uint16_t, 256> arr{};

    constexpr unsigned NUMERATOR = 255U << SHIFT_DIV_DSTA;
    for (unsigned k = 1; k <= 255; ++k)
    {
        // Adding k / 2 before dividing rounds the quotient to nearest.
        arr[k] = static_cast<uint16_t>((NUMERATOR + (k / 2)) / k);
    }

    return arr;
}();
637+
615638
/************************************************************************/
616639
/* BlendDataset::IRasterIO() */
617640
/************************************************************************/
@@ -636,7 +659,7 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
636659
return eErr;
637660
}
638661

639-
GByte *const pabyDst = static_cast<GByte *>(pData);
662+
GByte *const CPL_RESTRICT pabyDst = static_cast<GByte *>(pData);
640663
const int nColorCount = m_oColorDS.GetRasterCount();
641664
const int nOverlayCount = m_oOverlayDS.GetRasterCount();
642665
if (nOverlayCount == 1 && m_opacity255Scale == 255 &&
@@ -693,6 +716,77 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
693716

694717
return CE_None;
695718
}
719+
else if (nOverlayCount == 4 && nColorCount == 4 && m_operator == SRC_OVER &&
720+
eRWFlag == GF_Read && eBufType == GDT_Byte &&
721+
nBandCount == nBands && IsAllBands(nBands, panBandMap) &&
722+
AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize, nBufXSize,
723+
nBufYSize, psExtraArg))
724+
{
725+
const int nOpacity = m_opacity255Scale;
726+
const size_t nPixelCount = static_cast<size_t>(nBufXSize) * nBufYSize;
727+
const GByte *CPL_RESTRICT pabyR = m_abyBuffer.data();
728+
const GByte *CPL_RESTRICT pabyG = m_abyBuffer.data() + nPixelCount;
729+
const GByte *CPL_RESTRICT pabyB = m_abyBuffer.data() + nPixelCount * 2;
730+
const GByte *CPL_RESTRICT pabyA = m_abyBuffer.data() + nPixelCount * 3;
731+
const GByte *CPL_RESTRICT pabyOverlayR =
732+
m_abyBuffer.data() + nPixelCount * nColorCount;
733+
const GByte *CPL_RESTRICT pabyOverlayG =
734+
m_abyBuffer.data() + nPixelCount * (nColorCount + 1);
735+
const GByte *CPL_RESTRICT pabyOverlayB =
736+
m_abyBuffer.data() + nPixelCount * (nColorCount + 2);
737+
const GByte *CPL_RESTRICT pabyOverlayA =
738+
m_abyBuffer.data() + nPixelCount * (nColorCount + 3);
739+
size_t nSrcIdx = 0;
740+
for (int j = 0; j < nBufYSize; ++j)
741+
{
742+
auto nDstOffset = j * nLineSpace;
743+
for (int i = 0; i < nBufXSize;
744+
++i, ++nSrcIdx, nDstOffset += nPixelSpace)
745+
{
746+
const int nOverlayR = pabyOverlayR[nSrcIdx];
747+
const int nOverlayG = pabyOverlayG[nSrcIdx];
748+
const int nOverlayB = pabyOverlayB[nSrcIdx];
749+
const int nOverlayA =
750+
(pabyOverlayA[nSrcIdx] * nOpacity + 255) / 256;
751+
const int nR = pabyR[nSrcIdx];
752+
const int nG = pabyG[nSrcIdx];
753+
const int nB = pabyB[nSrcIdx];
754+
const int nA = pabyA[nSrcIdx];
755+
const int nSrcAMul255MinusOverlayA =
756+
(nA * (255 - nOverlayA) + 255) / 256;
757+
const unsigned nDstA = nOverlayA + nSrcAMul255MinusOverlayA;
758+
unsigned nDstR = (nOverlayR * nOverlayA +
759+
nR * nSrcAMul255MinusOverlayA + 255) /
760+
256;
761+
unsigned nDstG = (nOverlayG * nOverlayA +
762+
nG * nSrcAMul255MinusOverlayA + 255) /
763+
256;
764+
unsigned nDstB = (nOverlayB * nOverlayA +
765+
nB * nSrcAMul255MinusOverlayA + 255) /
766+
256;
767+
const uint16_t nInvDstA =
768+
gTabInvDstA[nDstA &
769+
0xff]; // (255 << SHIFT_DIV_DSTA) / nDstA;
770+
constexpr unsigned ROUND_OFFSET_DIV_DSTA =
771+
((1 << SHIFT_DIV_DSTA) - 1);
772+
nDstR = (nDstR * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
773+
SHIFT_DIV_DSTA;
774+
nDstG = (nDstG * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
775+
SHIFT_DIV_DSTA;
776+
nDstB = (nDstB * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
777+
SHIFT_DIV_DSTA;
778+
pabyDst[nDstOffset + 0 * nBandSpace] =
779+
static_cast<GByte>(nDstR);
780+
pabyDst[nDstOffset + 1 * nBandSpace] =
781+
static_cast<GByte>(nDstG);
782+
pabyDst[nDstOffset + 2 * nBandSpace] =
783+
static_cast<GByte>(nDstB);
784+
pabyDst[nDstOffset + 3 * nBandSpace] =
785+
static_cast<GByte>(nDstA);
786+
}
787+
}
788+
return CE_None;
789+
}
696790
else if (m_ioError)
697791
{
698792
return CE_Failure;
@@ -708,6 +802,29 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
708802
}
709803
}
710804

805+
/************************************************************************/
806+
/*                            SrcOverRGB()                              */
807+
/************************************************************************/
808+
809+
// Blends one band of an overlay on top of one band of a source using a
// single opacity value for the whole run of pixels:
//   pabyDst[i] = (pabyOverlay[i] * nOpacity +
//                 pabySrc[i] * (255 - nOpacity) + 255) / 256
//
// pabyOverlay  input overlay values (N elements)
// pabySrc      input source values (N elements)
// pabyDst      output values (N elements)
// N            number of elements to process; 0 leaves pabyDst untouched
// nOpacity     overlay weight in [0,255]
//
// The three pointers are declared __restrict, so the buffers must not
// overlap.
//
// GCC and clang do a good job at auto-vectorizing the function below.
#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
static void
SrcOverRGB(const uint8_t *const __restrict pabyOverlay,
           const uint8_t *const __restrict pabySrc,
           uint8_t *const __restrict pabyDst, const size_t N,
           const uint8_t nOpacity)
{
    // Loop-invariant weights, promoted to int once so the per-pixel
    // arithmetic is done in int exactly as before.
    const int nOverlayWeight = nOpacity;
    const int nSrcWeight = 255 - nOverlayWeight;

    for (size_t i = 0; i < N; ++i)
    {
        const int nOverlay = pabyOverlay[i];
        const int nSrc = pabySrc[i];
        // + 255 followed by / 256 approximates a rounded-up division by 255
        // while keeping 0 -> 0 and 255 -> 255.
        pabyDst[i] = static_cast<uint8_t>(
            (nOverlay * nOverlayWeight + nSrc * nSrcWeight + 255) / 256);
    }
}
827+
711828
/************************************************************************/
712829
/* BlendBand::IRasterIO() */
713830
/************************************************************************/
@@ -754,6 +871,46 @@ CPLErr BlendBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
754871
nBufYSize, eBufType, nPixelSpace, nLineSpace, psExtraArg);
755872
}
756873
}
874+
else if (nOverlayCount == 3 && nColorCount == 3 &&
875+
m_oBlendDataset.m_operator == SRC_OVER && eRWFlag == GF_Read &&
876+
eBufType == GDT_Byte &&
877+
m_oBlendDataset.AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize,
878+
nBufXSize, nBufYSize,
879+
psExtraArg))
880+
{
881+
const int nOpacity = m_oBlendDataset.m_opacity255Scale;
882+
const GByte *const CPL_RESTRICT pabySrc =
883+
m_oBlendDataset.m_abyBuffer.data() + nPixelCount * (nBand - 1);
884+
const GByte *const CPL_RESTRICT pabyOverlay =
885+
m_oBlendDataset.m_abyBuffer.data() +
886+
nPixelCount * (nColorCount + nBand - 1);
887+
GByte *const CPL_RESTRICT pabyDst = static_cast<GByte *>(pData);
888+
size_t nSrcIdx = 0;
889+
for (int j = 0; j < nBufYSize; ++j)
890+
{
891+
auto nDstOffset = j * nLineSpace;
892+
if (nPixelSpace == 1)
893+
{
894+
SrcOverRGB(pabyOverlay + nSrcIdx, pabySrc + nSrcIdx,
895+
pabyDst + nDstOffset, nBufXSize,
896+
static_cast<uint8_t>(nOpacity));
897+
nSrcIdx += nBufXSize;
898+
}
899+
else
900+
{
901+
for (int i = 0; i < nBufXSize;
902+
++i, ++nSrcIdx, nDstOffset += nPixelSpace)
903+
{
904+
const int nOverlay = pabyOverlay[nSrcIdx];
905+
const int nSrc = pabySrc[nSrcIdx];
906+
pabyDst[nDstOffset] = static_cast<GByte>(
907+
(nOverlay * nOpacity + nSrc * (255 - nOpacity) + 255) /
908+
256);
909+
}
910+
}
911+
}
912+
return CE_None;
913+
}
757914
else if (eRWFlag == GF_Read && eBufType == GDT_Byte &&
758915
m_oBlendDataset.AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize,
759916
nBufXSize, nBufYSize,

0 commit comments

Comments
 (0)