@@ -855,11 +855,30 @@ MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* co
855855{
856856 const BYTE * const pStart = pIn ;
857857 const BYTE * const pInLoopLimit = pInLimit - (sizeof (size_t )-1 );
858+ #if defined(ZSTD_ARCH_X86_SSE2 )
859+ const BYTE * const pInLimit16 = pInLimit - (sizeof (__m128i )- 1 );
860+ #endif
858861
859862 if (pIn < pInLoopLimit ) {
860863 { size_t const diff = MEM_readST (pMatch ) ^ MEM_readST (pIn );
861864 if (diff ) return ZSTD_NbCommonBytes (diff ); }
862865 pIn += sizeof (size_t ); pMatch += sizeof (size_t );
866+ #if defined(ZSTD_ARCH_X86_SSE2 )
867+ if ((size_t )(pInLimit - pIn ) >= 32 ) {
868+ while (pIn < pInLimit16 ) {
869+ __m128i const matchVec = _mm_loadu_si128 ((const __m128i * )pMatch );
870+ __m128i const inVec = _mm_loadu_si128 ((const __m128i * )pIn );
871+ U32 const matchMask = (U32 )_mm_movemask_epi8 (_mm_cmpeq_epi8 (matchVec , inVec ));
872+ if (matchMask != 0xFFFF ) {
873+ U32 const diffMask = ~matchMask & 0xFFFF ;
874+ pIn += ZSTD_countTrailingZeros32 (diffMask );
875+ return (size_t )(pIn - pStart );
876+ }
877+ pIn += sizeof (__m128i );
878+ pMatch += sizeof (__m128i );
879+ }
880+ }
881+ #endif
863882 while (pIn < pInLoopLimit ) {
864883 size_t const diff = MEM_readST (pMatch ) ^ MEM_readST (pIn );
865884 if (!diff ) { pIn += sizeof (size_t ); pMatch += sizeof (size_t ); continue ; }
0 commit comments