From bcc4d60fa3dabb6841ce9ae3b1dfc16645d98c77 Mon Sep 17 00:00:00 2001 From: Sergey Nikolaev Date: Tue, 25 Nov 2025 17:24:18 +0700 Subject: [PATCH 1/3] feat: optimize infix hash builder with improved radix sort and increased arena size - Enhanced the infix hash builder by implementing an optimized radix sort for better performance. - Increased the arena size from 1M to 16M buckets to accommodate larger datasets. - Added a new operator for Infix_t to support comparisons. - Improved cache locality by pre-extracting key words during sorting. --- src/dict/infix/infix_builder.cpp | 121 ++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 3 deletions(-) diff --git a/src/dict/infix/infix_builder.cpp b/src/dict/infix/infix_builder.cpp index ec0b9fee67..f41f5578f5 100644 --- a/src/dict/infix/infix_builder.cpp +++ b/src/dict/infix/infix_builder.cpp @@ -12,17 +12,20 @@ #include "infix_builder.h" #include "sphinxint.h" +#include "sphinxutils.h" #include "std/crc32.h" #include "fileio.h" #include +#include +#include ////////////////////////////////////////////////////////////////////////// // KEYWORDS STORING DICTIONARY, INFIX HASH BUILDER ////////////////////////////////////////////////////////////////////////// -static constexpr int INFIX_ARENA_LENGTH = 1048576; +static constexpr int INFIX_ARENA_LENGTH = 1 << 24; // 16M buckets (was 1M) template struct Infix_t @@ -42,6 +45,11 @@ struct Infix_t { return m_Data == rhs.m_Data; } + + bool operator< ( const Infix_t& rhs ) const noexcept + { + return m_Data < rhs.m_Data; + } }; @@ -346,6 +354,111 @@ void InfixBuilder_c::AddWord ( const BYTE* pWord, int iWordLength, int iCh } } +// Optimized radix sort using 16-bit (2-byte) passes to reduce memory scans +// This reduces passes from 12 to 6 for SIZE=3, and from 20 to 10 for SIZE=5 +template +void RadixSortIndices ( CSphVector& dIndex, const CSphSwapVector>& dArena ) +{ + int iCount = dIndex.GetLength(); + if ( iCount < 2 ) + return; + + // Temporary arrays for radix 
sort + CSphVector dTemp; + dTemp.Resize ( iCount ); + int* pInput = dIndex.Begin(); + int* pOutput = dTemp.Begin(); + + if constexpr ( SIZE == 2 ) + { + // For SIZE=2: use std::sort fallback (SIZE=2 uses packed keys which need special handling) + // This optimization is mainly for SIZE=3 and SIZE=5 + dIndex.Sort ( Lesser ( [&dArena] ( int a, int b ) noexcept { + return dArena[a].m_tKey.m_Data < dArena[b].m_tKey.m_Data; + } ) ); + return; + } + else + { + // For SIZE=3 (12 bytes) or SIZE=5 (20 bytes): use optimized 16-bit passes + constexpr int KEY_SIZE = SIZE * sizeof(DWORD); + constexpr int WORD_COUNT = ( KEY_SIZE + 1 ) / 2; // Round up + + // Pre-extract all key words to improve cache locality + // This trades memory for better cache behavior by avoiding random dArena access + CSphVector dKeyWords; + dKeyWords.Resize ( iCount * WORD_COUNT ); + uint16_t* pKeyWords = dKeyWords.Begin(); + + // Extract all key words in one sequential pass + for ( int i = 0; i < iCount; ++i ) + { + int idx = pInput[i]; + if ( idx < 1 || idx >= dArena.GetLength() ) + { + // Zero out to prevent crash + memset ( pKeyWords + ( i * WORD_COUNT ), 0, KEY_SIZE ); + continue; + } + const uint16_t* pKeyData = reinterpret_cast( dArena[idx].m_tKey.m_Data.data() ); + uint16_t* pDst = pKeyWords + ( i * WORD_COUNT ); + memcpy ( pDst, pKeyData, KEY_SIZE ); + } + + // Use heap-allocated arrays to avoid stack overflow (256KB each) + CSphVector dCount, dPos; + dCount.Resize ( 65536 ); + dPos.Resize ( 65536 ); + int* aCount = dCount.Begin(); + int* aPos = dPos.Begin(); + + // Allocate temp key words buffer once outside the loop to avoid reallocation + CSphVector dKeyWordsTemp; + dKeyWordsTemp.Resize ( iCount * WORD_COUNT ); + + // Process 16-bit words from LSB to MSB + for ( int iWord = 0; iWord < WORD_COUNT; ++iWord ) + { + // Clear count array efficiently + memset ( aCount, 0, 65536 * sizeof(int) ); + + // Count phase - sequential access to pre-extracted keys + // After the first pass, pKeyWords points 
to reordered data, so we access it in order + for ( int i = 0; i < iCount; ++i ) + { + uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord]; + ++aCount[uWord]; + } + + // Convert to positions + aPos[0] = 0; + for ( int i = 1; i < 65536; ++i ) + aPos[i] = aPos[i-1] + aCount[i-1]; + + // Distribute phase - update indices and reorder key words + // Get pointer to temp buffer - this will be swapped with pKeyWords after distribution + // After first pass, pKeyWords points to reordered data, so we need to get the other buffer + uint16_t* pKeyWordsTemp = ( pKeyWords == dKeyWords.Begin() ) ? dKeyWordsTemp.Begin() : dKeyWords.Begin(); + + for ( int i = 0; i < iCount; ++i ) + { + uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord]; + int iNewPos = aPos[uWord]++; + pOutput[iNewPos] = pInput[i]; + // Copy key words to new position for next pass + memcpy ( pKeyWordsTemp + ( iNewPos * WORD_COUNT ), pKeyWords + ( i * WORD_COUNT ), KEY_SIZE ); + } + + std::swap ( pInput, pOutput ); + std::swap ( pKeyWords, pKeyWordsTemp ); + } + } + + // If final result is in temp buffer, copy back + if ( pInput == dTemp.Begin() ) + memcpy ( dIndex.Begin(), dTemp.Begin(), iCount * sizeof(int) ); +} + static inline int ZippedIntSize ( DWORD v ) noexcept { if ( v < ( 1UL << 7 ) ) @@ -369,9 +482,11 @@ void InfixBuilder_c::SaveEntries ( CSphWriter& wrDict ) wrDict.PutBlob ( g_sTagInfixEntries ); CSphVector dIndex; - dIndex.Resize ( m_dArena.GetLength() - 1 ); + int iTotalEntries = m_dArena.GetLength() - 1; + dIndex.Resize ( iTotalEntries ); dIndex.FillSeq(1); - dIndex.Sort ( Lesser ( [this] ( int a, int b ) noexcept { return m_dArena[a].m_tKey.m_Data < m_dArena[b].m_tKey.m_Data; } ) ); + // Use radix sort for O(n) performance instead of O(n log n) + RadixSortIndices ( dIndex, m_dArena ); m_dBlocksWords.Reserve ( m_dArena.GetLength() / INFIX_BLOCK_SIZE * sizeof ( DWORD ) * SIZE ); int iBlock = 0; From 656f5db934ad25a3c2a586a4490c1167e030ca27 Mon Sep 17 00:00:00 2001 From: Sergey Nikolaev Date: Wed, 26 Nov 
2025 11:14:06 +0700 Subject: [PATCH 2/3] fixes --- src/dict/infix/infix_builder.cpp | 85 ++++++++++++++++---------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/src/dict/infix/infix_builder.cpp b/src/dict/infix/infix_builder.cpp index f41f5578f5..05f43cac20 100644 --- a/src/dict/infix/infix_builder.cpp +++ b/src/dict/infix/infix_builder.cpp @@ -354,8 +354,9 @@ void InfixBuilder_c::AddWord ( const BYTE* pWord, int iWordLength, int iCh } } -// Optimized radix sort using 16-bit (2-byte) passes to reduce memory scans -// This reduces passes from 12 to 6 for SIZE=3, and from 20 to 10 for SIZE=5 +// Radix sort for fixed-size keys using LSD (Least Significant Digit) algorithm +// Processes bytes from least significant (last) to most significant (first) +// This ensures exact compatibility with std::array::operator< lexicographic comparison template void RadixSortIndices ( CSphVector& dIndex, const CSphSwapVector>& dArena ) { @@ -372,7 +373,6 @@ void RadixSortIndices ( CSphVector& dIndex, const CSphSwapVector& dIndex, const CSphSwapVector dKeyWords; - dKeyWords.Resize ( iCount * WORD_COUNT ); - uint16_t* pKeyWords = dKeyWords.Begin(); + CSphVector dKeyBytes; + dKeyBytes.Resize ( iCount * KEY_SIZE ); + BYTE* pKeyBytes = dKeyBytes.Begin(); - // Extract all key words in one sequential pass + // Extract all key bytes in one sequential pass for ( int i = 0; i < iCount; ++i ) { int idx = pInput[i]; - if ( idx < 1 || idx >= dArena.GetLength() ) + if ( idx >= 1 && idx < dArena.GetLength() ) { - // Zero out to prevent crash - memset ( pKeyWords + ( i * WORD_COUNT ), 0, KEY_SIZE ); - continue; + const BYTE* pSrc = dArena[idx].m_tKey.m_Data.data(); + memcpy ( pKeyBytes + ( i * KEY_SIZE ), pSrc, KEY_SIZE ); + } + else + { + memset ( pKeyBytes + ( i * KEY_SIZE ), 0, KEY_SIZE ); } - const uint16_t* pKeyData = reinterpret_cast( dArena[idx].m_tKey.m_Data.data() ); - uint16_t* pDst = pKeyWords + ( i * WORD_COUNT ); - memcpy ( pDst, pKeyData, KEY_SIZE ); } - // Use 
heap-allocated arrays to avoid stack overflow (256KB each) + // Use heap-allocated arrays to avoid stack overflow CSphVector dCount, dPos; - dCount.Resize ( 65536 ); - dPos.Resize ( 65536 ); + dCount.Resize ( 256 ); + dPos.Resize ( 256 ); int* aCount = dCount.Begin(); int* aPos = dPos.Begin(); - // Allocate temp key words buffer once outside the loop to avoid reallocation - CSphVector dKeyWordsTemp; - dKeyWordsTemp.Resize ( iCount * WORD_COUNT ); + // Allocate temp key bytes buffer once outside the loop + CSphVector dKeyBytesTemp; + dKeyBytesTemp.Resize ( iCount * KEY_SIZE ); - // Process 16-bit words from LSB to MSB - for ( int iWord = 0; iWord < WORD_COUNT; ++iWord ) + // Process bytes from last (least significant) to first (most significant) + // This is LSD radix sort: stable and produces correct lexicographic ordering + for ( int iByte = KEY_SIZE - 1; iByte >= 0; --iByte ) { - // Clear count array efficiently - memset ( aCount, 0, 65536 * sizeof(int) ); + // Clear count array + memset ( aCount, 0, 256 * sizeof(int) ); - // Count phase - sequential access to pre-extracted keys - // After the first pass, pKeyWords points to reordered data, so we access it in order + // Count phase: sequential access to pre-extracted keys for ( int i = 0; i < iCount; ++i ) { - uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord]; - ++aCount[uWord]; + BYTE uByte = pKeyBytes[i * KEY_SIZE + iByte]; + ++aCount[uByte]; } - // Convert to positions + // Convert counts to starting positions (cumulative sum) aPos[0] = 0; - for ( int i = 1; i < 65536; ++i ) + for ( int i = 1; i < 256; ++i ) aPos[i] = aPos[i-1] + aCount[i-1]; - // Distribute phase - update indices and reorder key words - // Get pointer to temp buffer - this will be swapped with pKeyWords after distribution - // After first pass, pKeyWords points to reordered data, so we need to get the other buffer - uint16_t* pKeyWordsTemp = ( pKeyWords == dKeyWords.Begin() ) ? 
dKeyWordsTemp.Begin() : dKeyWords.Begin(); + // Distribute phase: reorder indices and key bytes + // Get pointer to temp buffer - this will be swapped with pKeyBytes after distribution + BYTE* pKeyBytesTemp = ( pKeyBytes == dKeyBytes.Begin() ) ? dKeyBytesTemp.Begin() : dKeyBytes.Begin(); for ( int i = 0; i < iCount; ++i ) { - uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord]; - int iNewPos = aPos[uWord]++; + BYTE uByte = pKeyBytes[i * KEY_SIZE + iByte]; + int iNewPos = aPos[uByte]++; pOutput[iNewPos] = pInput[i]; - // Copy key words to new position for next pass - memcpy ( pKeyWordsTemp + ( iNewPos * WORD_COUNT ), pKeyWords + ( i * WORD_COUNT ), KEY_SIZE ); + // Copy key bytes to new position for next pass + memcpy ( pKeyBytesTemp + ( iNewPos * KEY_SIZE ), pKeyBytes + ( i * KEY_SIZE ), KEY_SIZE ); } + // Swap input/output buffers and key byte buffers for next iteration std::swap ( pInput, pOutput ); - std::swap ( pKeyWords, pKeyWordsTemp ); + std::swap ( pKeyBytes, pKeyBytesTemp ); } } @@ -646,4 +645,4 @@ std::unique_ptr sphCreateInfixBuilder ( int iCodepointBytes, C case 3: return std::make_unique>(); // upto 6x3 bytes, 5 dwords, utf-8 default: pError->SetSprintf ( "unhandled max infix codepoint size %d", iCodepointBytes ); return nullptr; } -} \ No newline at end of file +} From de6b0d45364f2b2db18d7eecafe2faa76c190ff2 Mon Sep 17 00:00:00 2001 From: Sergey Nikolaev Date: Wed, 26 Nov 2025 12:10:59 +0700 Subject: [PATCH 3/3] chore: minor changes --- src/dict/infix/infix_builder.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/dict/infix/infix_builder.cpp b/src/dict/infix/infix_builder.cpp index 05f43cac20..d670b227ba 100644 --- a/src/dict/infix/infix_builder.cpp +++ b/src/dict/infix/infix_builder.cpp @@ -25,7 +25,7 @@ // KEYWORDS STORING DICTIONARY, INFIX HASH BUILDER ////////////////////////////////////////////////////////////////////////// -static constexpr int INFIX_ARENA_LENGTH = 1 << 24; // 16M buckets (was 1M) +static 
constexpr int INFIX_ARENA_LENGTH = 1 << 24; // 16M buckets provide better performance than 1M which was used previously template struct Infix_t @@ -481,10 +481,9 @@ void InfixBuilder_c::SaveEntries ( CSphWriter& wrDict ) wrDict.PutBlob ( g_sTagInfixEntries ); CSphVector dIndex; - int iTotalEntries = m_dArena.GetLength() - 1; - dIndex.Resize ( iTotalEntries ); + dIndex.Resize ( m_dArena.GetLength() - 1 ); dIndex.FillSeq(1); - // Use radix sort for O(n) performance instead of O(n log n) + // Use radix sort for O(n) performance instead of O(n log n) which std::sort provides RadixSortIndices ( dIndex, m_dArena ); m_dBlocksWords.Reserve ( m_dArena.GetLength() / INFIX_BLOCK_SIZE * sizeof ( DWORD ) * SIZE );