1717#include " gdal_priv.h"
1818
1919#include < algorithm>
20+ #include < array>
2021#include < limits>
2122
2223#if defined(__x86_64) || defined(_M_X64)
@@ -508,7 +509,9 @@ BlendDataset::BlendDataset(GDALDataset &oColorDS, GDALDataset &oOverlayDS,
508509 SetDescription (CPLSPrintf (" Blend %s width %s" , m_oColorDS.GetDescription (),
509510 m_oOverlayDS.GetDescription ()));
510511 if (nBands > 1 )
512+ {
511513 SetMetadataItem (" INTERLEAVE" , " PIXEL" , " IMAGE_STRUCTURE" );
514+ }
512515
513516 if (bCanCreateOvr)
514517 {
@@ -612,6 +615,26 @@ bool BlendDataset::AcquireSourcePixels(int nXOff, int nYOff, int nXSize,
612615 return bOK;
613616}
614617
/************************************************************************/
/*                            gTabInvDstA                               */
/************************************************************************/

// Number of fractional bits carried by the entries of gTabInvDstA. A product
// with a table entry must be shifted right by this amount to get back to an
// integer result.
constexpr int SHIFT_DIV_DSTA = 8;

// Lookup table of rounded fixed-point reciprocals:
//   gTabInvDstA[k] = ((255 << SHIFT_DIV_DSTA) + k / 2) / k   for k in [1,255]
//   gTabInvDstA[0] = 0                                       (no division)
// The largest entry is gTabInvDstA[1] = 65280, which fits in a uint16_t.
constexpr auto gTabInvDstA = []()
{
    constexpr int NUMERATOR = 255 << SHIFT_DIV_DSTA;

    // Value-initialization zeroes every slot, so slot 0 already holds 0.
    std::array<uint16_t, 256> anInv{};

    for (int nDivisor = 255; nDivisor >= 1; --nDivisor)
    {
        // Adding half of the divisor rounds the quotient to nearest.
        const int nRounded = (NUMERATOR + nDivisor / 2) / nDivisor;
        anInv[nDivisor] = static_cast<uint16_t>(nRounded);
    }

    return anInv;
}();
637+
615638/* ***********************************************************************/
616639/* BlendDataset::IRasterIO() */
617640/* ***********************************************************************/
@@ -636,7 +659,7 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
636659 return eErr;
637660 }
638661
639- GByte *const pabyDst = static_cast <GByte *>(pData);
662+ GByte *const CPL_RESTRICT pabyDst = static_cast <GByte *>(pData);
640663 const int nColorCount = m_oColorDS.GetRasterCount ();
641664 const int nOverlayCount = m_oOverlayDS.GetRasterCount ();
642665 if (nOverlayCount == 1 && m_opacity255Scale == 255 &&
@@ -693,6 +716,77 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
693716
694717 return CE_None;
695718 }
719+ else if (nOverlayCount == 4 && nColorCount == 4 && m_operator == SRC_OVER &&
720+ eRWFlag == GF_Read && eBufType == GDT_Byte &&
721+ nBandCount == nBands && IsAllBands (nBands, panBandMap) &&
722+ AcquireSourcePixels (nXOff, nYOff, nXSize, nYSize, nBufXSize,
723+ nBufYSize, psExtraArg))
724+ {
725+ const int nOpacity = m_opacity255Scale;
726+ const size_t nPixelCount = static_cast <size_t >(nBufXSize) * nBufYSize;
727+ const GByte *CPL_RESTRICT pabyR = m_abyBuffer.data ();
728+ const GByte *CPL_RESTRICT pabyG = m_abyBuffer.data () + nPixelCount;
729+ const GByte *CPL_RESTRICT pabyB = m_abyBuffer.data () + nPixelCount * 2 ;
730+ const GByte *CPL_RESTRICT pabyA = m_abyBuffer.data () + nPixelCount * 3 ;
731+ const GByte *CPL_RESTRICT pabyOverlayR =
732+ m_abyBuffer.data () + nPixelCount * nColorCount;
733+ const GByte *CPL_RESTRICT pabyOverlayG =
734+ m_abyBuffer.data () + nPixelCount * (nColorCount + 1 );
735+ const GByte *CPL_RESTRICT pabyOverlayB =
736+ m_abyBuffer.data () + nPixelCount * (nColorCount + 2 );
737+ const GByte *CPL_RESTRICT pabyOverlayA =
738+ m_abyBuffer.data () + nPixelCount * (nColorCount + 3 );
739+ size_t nSrcIdx = 0 ;
740+ for (int j = 0 ; j < nBufYSize; ++j)
741+ {
742+ auto nDstOffset = j * nLineSpace;
743+ for (int i = 0 ; i < nBufXSize;
744+ ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
745+ {
746+ const int nOverlayR = pabyOverlayR[nSrcIdx];
747+ const int nOverlayG = pabyOverlayG[nSrcIdx];
748+ const int nOverlayB = pabyOverlayB[nSrcIdx];
749+ const int nOverlayA =
750+ (pabyOverlayA[nSrcIdx] * nOpacity + 255 ) / 256 ;
751+ const int nR = pabyR[nSrcIdx];
752+ const int nG = pabyG[nSrcIdx];
753+ const int nB = pabyB[nSrcIdx];
754+ const int nA = pabyA[nSrcIdx];
755+ const int nSrcAMul255MinusOverlayA =
756+ (nA * (255 - nOverlayA) + 255 ) / 256 ;
757+ const unsigned nDstA = nOverlayA + nSrcAMul255MinusOverlayA;
758+ unsigned nDstR = (nOverlayR * nOverlayA +
759+ nR * nSrcAMul255MinusOverlayA + 255 ) /
760+ 256 ;
761+ unsigned nDstG = (nOverlayG * nOverlayA +
762+ nG * nSrcAMul255MinusOverlayA + 255 ) /
763+ 256 ;
764+ unsigned nDstB = (nOverlayB * nOverlayA +
765+ nB * nSrcAMul255MinusOverlayA + 255 ) /
766+ 256 ;
767+ const uint16_t nInvDstA =
768+ gTabInvDstA [nDstA &
769+ 0xff ]; // (255 << SHIFT_DIV_DSTA) / nDstA;
770+ constexpr unsigned ROUND_OFFSET_DIV_DSTA =
771+ ((1 << SHIFT_DIV_DSTA) - 1 );
772+ nDstR = (nDstR * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
773+ SHIFT_DIV_DSTA;
774+ nDstG = (nDstG * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
775+ SHIFT_DIV_DSTA;
776+ nDstB = (nDstB * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
777+ SHIFT_DIV_DSTA;
778+ pabyDst[nDstOffset + 0 * nBandSpace] =
779+ static_cast <GByte>(nDstR);
780+ pabyDst[nDstOffset + 1 * nBandSpace] =
781+ static_cast <GByte>(nDstG);
782+ pabyDst[nDstOffset + 2 * nBandSpace] =
783+ static_cast <GByte>(nDstB);
784+ pabyDst[nDstOffset + 3 * nBandSpace] =
785+ static_cast <GByte>(nDstA);
786+ }
787+ }
788+ return CE_None;
789+ }
696790 else if (m_ioError)
697791 {
698792 return CE_Failure;
@@ -708,6 +802,29 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
708802 }
709803}
710804
/************************************************************************/
/*                            SrcOverRGB()                              */
/************************************************************************/

// Blends N bytes of one component of an overlay on top of a source, using a
// single opacity for the whole run:
//   pabyDst[i] = (pabyOverlay[i] * nOpacity +
//                 pabySrc[i] * (255 - nOpacity) + 255) / 256
//
// pabyOverlay: overlay component values (N bytes read).
// pabySrc:     source component values (N bytes read).
// pabyDst:     output buffer (N bytes written). The three pointers are
//              declared __restrict, so the buffers must not overlap.
// N:           number of values to process.
// nOpacity:    overlay weight in [0,255]; the source gets 255 - nOpacity.
//
// GCC and clang do a good job at auto-vectorizing the function below. With
// GCC (not clang), the attribute explicitly requests tree vectorization.
#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
static void
SrcOverRGB(const uint8_t *const __restrict pabyOverlay,
           const uint8_t *const __restrict pabySrc,
           uint8_t *const __restrict pabyDst, const size_t N,
           const uint8_t nOpacity)
{
    // Both weights are loop invariants; keep them as int so that the
    // products below are evaluated in int, as uint8_t promotion would do.
    const int nOverlayWeight = nOpacity;
    const int nSrcWeight = 255 - nOverlayWeight;

    for (size_t iPixel = 0; iPixel != N; ++iPixel)
    {
        // Weighted sum is at most 255 * 255 + 255, so the quotient fits in
        // a byte.
        const int nWeightedSum = pabyOverlay[iPixel] * nOverlayWeight +
                                 pabySrc[iPixel] * nSrcWeight + 255;
        pabyDst[iPixel] = static_cast<uint8_t>(nWeightedSum / 256);
    }
}
827+
711828/* ***********************************************************************/
712829/* BlendBand::IRasterIO() */
713830/* ***********************************************************************/
@@ -754,6 +871,46 @@ CPLErr BlendBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
754871 nBufYSize, eBufType, nPixelSpace, nLineSpace, psExtraArg);
755872 }
756873 }
874+ else if (nOverlayCount == 3 && nColorCount == 3 &&
875+ m_oBlendDataset.m_operator == SRC_OVER && eRWFlag == GF_Read &&
876+ eBufType == GDT_Byte &&
877+ m_oBlendDataset.AcquireSourcePixels (nXOff, nYOff, nXSize, nYSize,
878+ nBufXSize, nBufYSize,
879+ psExtraArg))
880+ {
881+ const int nOpacity = m_oBlendDataset.m_opacity255Scale ;
882+ const GByte *const CPL_RESTRICT pabySrc =
883+ m_oBlendDataset.m_abyBuffer .data () + nPixelCount * (nBand - 1 );
884+ const GByte *const CPL_RESTRICT pabyOverlay =
885+ m_oBlendDataset.m_abyBuffer .data () +
886+ nPixelCount * (nColorCount + nBand - 1 );
887+ GByte *const CPL_RESTRICT pabyDst = static_cast <GByte *>(pData);
888+ size_t nSrcIdx = 0 ;
889+ for (int j = 0 ; j < nBufYSize; ++j)
890+ {
891+ auto nDstOffset = j * nLineSpace;
892+ if (nPixelSpace == 1 )
893+ {
894+ SrcOverRGB (pabyOverlay + nSrcIdx, pabySrc + nSrcIdx,
895+ pabyDst + nDstOffset, nBufXSize,
896+ static_cast <uint8_t >(nOpacity));
897+ nSrcIdx += nBufXSize;
898+ }
899+ else
900+ {
901+ for (int i = 0 ; i < nBufXSize;
902+ ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
903+ {
904+ const int nOverlay = pabyOverlay[nSrcIdx];
905+ const int nSrc = pabySrc[nSrcIdx];
906+ pabyDst[nDstOffset] = static_cast <GByte>(
907+ (nOverlay * nOpacity + nSrc * (255 - nOpacity) + 255 ) /
908+ 256 );
909+ }
910+ }
911+ }
912+ return CE_None;
913+ }
757914 else if (eRWFlag == GF_Read && eBufType == GDT_Byte &&
758915 m_oBlendDataset.AcquireSourcePixels (nXOff, nYOff, nXSize, nYSize,
759916 nBufXSize, nBufYSize,
0 commit comments