|
19 | 19 | #include <algorithm> |
20 | 20 | #include <array> |
21 | 21 | #include <limits> |
| 22 | +#include <utility> |
22 | 23 |
|
23 | 24 | #if defined(__x86_64) || defined(_M_X64) |
24 | 25 | #define HAVE_SSE2 |
| 26 | +#include <immintrin.h> |
25 | 27 | #endif |
26 | 28 | #ifdef HAVE_SSE2 |
27 | 29 | #include "gdalsse_priv.h" |
@@ -635,6 +637,126 @@ constexpr auto gTabInvDstA = []() |
635 | 637 | return arr; |
636 | 638 | }(); |
637 | 639 |
|
| 640 | +/************************************************************************/ |
| 641 | +/* BlendSrcOverRGBA_SSE2() */ |
| 642 | +/************************************************************************/ |
| 643 | + |
| 644 | +#ifdef HAVE_SSE2 |
| 645 | +static int BlendSrcOverRGBA_SSE2(const GByte *CPL_RESTRICT pabyR, |
| 646 | + const GByte *CPL_RESTRICT pabyG, |
| 647 | + const GByte *CPL_RESTRICT pabyB, |
| 648 | + const GByte *CPL_RESTRICT pabyA, |
| 649 | + const GByte *CPL_RESTRICT pabyOverlayR, |
| 650 | + const GByte *CPL_RESTRICT pabyOverlayG, |
| 651 | + const GByte *CPL_RESTRICT pabyOverlayB, |
| 652 | + const GByte *CPL_RESTRICT pabyOverlayA, |
| 653 | + GByte *CPL_RESTRICT pabyDst, |
| 654 | + GPtrDiff_t nBandSpace, int N, int nOpacity) |
| 655 | +{ |
| 656 | + // See scalar code after call to BlendSrcOverRGBA_SSE2() below for the |
| 657 | + // non-obfuscated formulas... |
| 658 | + |
| 659 | + const auto load_and_unpack = [](const void *p) |
| 660 | + { |
| 661 | + const auto zero = _mm_setzero_si128(); |
| 662 | + auto overlayA = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p)); |
| 663 | + return std::make_pair(_mm_unpacklo_epi8(overlayA, zero), |
| 664 | + _mm_unpackhi_epi8(overlayA, zero)); |
| 665 | + }; |
| 666 | + |
| 667 | + const auto pack_and_store = [](void *p, __m128i lo, __m128i hi) { |
| 668 | + _mm_storeu_si128(reinterpret_cast<__m128i *>(p), |
| 669 | + _mm_packus_epi16(lo, hi)); |
| 670 | + }; |
| 671 | + |
| 672 | + const auto mul16bit_8bit_result = [](__m128i a, __m128i b) |
| 673 | + { |
| 674 | + const auto r255 = _mm_set1_epi16(255); |
| 675 | + return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(a, b), r255), 8); |
| 676 | + }; |
| 677 | + |
| 678 | + const auto opacity = _mm_set1_epi16(static_cast<int16_t>(nOpacity)); |
| 679 | + const auto r255 = _mm_set1_epi16(255); |
| 680 | + const int16_t *tabInvDstASigned = |
| 681 | + reinterpret_cast<const int16_t *>(gTabInvDstA.data()); |
| 682 | + constexpr int REG_WIDTH = static_cast<int>(sizeof(opacity)); |
| 683 | + |
| 684 | + int i = 0; |
| 685 | + for (; i <= N - REG_WIDTH; i += REG_WIDTH) |
| 686 | + { |
| 687 | + auto [overlayA_lo, overlayA_hi] = load_and_unpack(pabyOverlayA + i); |
| 688 | + auto [srcA_lo, srcA_hi] = load_and_unpack(pabyA + i); |
| 689 | + overlayA_lo = mul16bit_8bit_result(overlayA_lo, opacity); |
| 690 | + overlayA_hi = mul16bit_8bit_result(overlayA_hi, opacity); |
| 691 | + auto srcAMul255MinusOverlayA_lo = |
| 692 | + mul16bit_8bit_result(srcA_lo, _mm_sub_epi16(r255, overlayA_lo)); |
| 693 | + auto srcAMul255MinusOverlayA_hi = |
| 694 | + mul16bit_8bit_result(srcA_hi, _mm_sub_epi16(r255, overlayA_hi)); |
| 695 | + auto dstA_lo = _mm_add_epi16(overlayA_lo, srcAMul255MinusOverlayA_lo); |
| 696 | + auto dstA_hi = _mm_add_epi16(overlayA_hi, srcAMul255MinusOverlayA_hi); |
| 697 | + |
| 698 | + // The & 0xff should not be necessary. This is mostly a safety |
| 699 | + // belt if the above math yields a result outside [0, 255]... |
| 700 | + dstA_lo = _mm_and_si128(dstA_lo, r255); |
| 701 | + dstA_hi = _mm_and_si128(dstA_hi, r255); |
| 702 | + |
| 703 | + // This would be the equivalent of a "_mm_i16gather_epi16" operation |
| 704 | + // which does not exist... |
| 705 | + // invDstA_{i} = [tabInvDstASigned[dstA_{i}] for i in range(8)] |
| 706 | + auto invDstA_lo = _mm_undefined_si128(); |
| 707 | + auto invDstA_hi = _mm_undefined_si128(); |
| 708 | +#define SET_INVDSTA(k) \ |
| 709 | + do \ |
| 710 | + { \ |
| 711 | + const int idxLo = _mm_extract_epi16(dstA_lo, k); \ |
| 712 | + const int idxHi = _mm_extract_epi16(dstA_hi, k); \ |
| 713 | + invDstA_lo = _mm_insert_epi16(invDstA_lo, tabInvDstASigned[idxLo], k); \ |
| 714 | + invDstA_hi = _mm_insert_epi16(invDstA_hi, tabInvDstASigned[idxHi], k); \ |
| 715 | + } while (0) |
| 716 | + |
| 717 | + SET_INVDSTA(0); |
| 718 | + SET_INVDSTA(1); |
| 719 | + SET_INVDSTA(2); |
| 720 | + SET_INVDSTA(3); |
| 721 | + SET_INVDSTA(4); |
| 722 | + SET_INVDSTA(5); |
| 723 | + SET_INVDSTA(6); |
| 724 | + SET_INVDSTA(7); |
| 725 | + |
| 726 | + pack_and_store(pabyDst + i + 3 * nBandSpace, dstA_lo, dstA_hi); |
| 727 | + |
| 728 | +#define PROCESS_COMPONENT(pabySrc, pabyOverlay, iBand) \ |
| 729 | + do \ |
| 730 | + { \ |
| 731 | + auto [src_lo, src_hi] = load_and_unpack((pabySrc) + i); \ |
| 732 | + auto [overlay_lo, overlay_hi] = load_and_unpack((pabyOverlay) + i); \ |
| 733 | + auto dst_lo = _mm_srli_epi16( \ |
| 734 | + _mm_add_epi16( \ |
| 735 | + _mm_add_epi16( \ |
| 736 | + _mm_mullo_epi16(overlay_lo, overlayA_lo), \ |
| 737 | + _mm_mullo_epi16(src_lo, srcAMul255MinusOverlayA_lo)), \ |
| 738 | + r255), \ |
| 739 | + 8); \ |
| 740 | + auto dst_hi = _mm_srli_epi16( \ |
| 741 | + _mm_add_epi16( \ |
| 742 | + _mm_add_epi16( \ |
| 743 | + _mm_mullo_epi16(overlay_hi, overlayA_hi), \ |
| 744 | + _mm_mullo_epi16(src_hi, srcAMul255MinusOverlayA_hi)), \ |
| 745 | + r255), \ |
| 746 | + 8); \ |
| 747 | + dst_lo = mul16bit_8bit_result(dst_lo, invDstA_lo); \ |
| 748 | + dst_hi = mul16bit_8bit_result(dst_hi, invDstA_hi); \ |
| 749 | + pack_and_store(pabyDst + i + (iBand)*nBandSpace, dst_lo, dst_hi); \ |
| 750 | + } while (0) |
| 751 | + |
| 752 | + PROCESS_COMPONENT(pabyR, pabyOverlayR, 0); |
| 753 | + PROCESS_COMPONENT(pabyG, pabyOverlayG, 1); |
| 754 | + PROCESS_COMPONENT(pabyB, pabyOverlayB, 2); |
| 755 | + } |
| 756 | + return i; |
| 757 | +} |
| 758 | +#endif |
| 759 | + |
638 | 760 | /************************************************************************/ |
639 | 761 | /* BlendDataset::IRasterIO() */ |
640 | 762 | /************************************************************************/ |
@@ -740,8 +862,21 @@ CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff, |
740 | 862 | for (int j = 0; j < nBufYSize; ++j) |
741 | 863 | { |
742 | 864 | auto nDstOffset = j * nLineSpace; |
743 | | - for (int i = 0; i < nBufXSize; |
744 | | - ++i, ++nSrcIdx, nDstOffset += nPixelSpace) |
| 865 | + int i = 0; |
| 866 | +#ifdef HAVE_SSE2 |
| 867 | + if (nPixelSpace == 1) |
| 868 | + { |
| 869 | + i = BlendSrcOverRGBA_SSE2( |
| 870 | + pabyR + nSrcIdx, pabyG + nSrcIdx, pabyB + nSrcIdx, |
| 871 | + pabyA + nSrcIdx, pabyOverlayR + nSrcIdx, |
| 872 | + pabyOverlayG + nSrcIdx, pabyOverlayB + nSrcIdx, |
| 873 | + pabyOverlayA + nSrcIdx, pabyDst + nDstOffset, nBandSpace, |
| 874 | + nBufXSize, nOpacity); |
| 875 | + nSrcIdx += i; |
| 876 | + nDstOffset += i; |
| 877 | + } |
| 878 | +#endif |
| 879 | + for (; i < nBufXSize; ++i, ++nSrcIdx, nDstOffset += nPixelSpace) |
745 | 880 | { |
746 | 881 | const int nOverlayR = pabyOverlayR[nSrcIdx]; |
747 | 882 | const int nOverlayG = pabyOverlayG[nSrcIdx]; |
|
0 commit comments