From bcc4d60fa3dabb6841ce9ae3b1dfc16645d98c77 Mon Sep 17 00:00:00 2001
From: Sergey Nikolaev <prostuda@academ.org>
Date: Tue, 25 Nov 2025 17:24:18 +0700
Subject: [PATCH 1/3] feat: optimize infix hash builder with improved radix
 sort and increased arena size

- Replaced the comparison sort of arena indices in SaveEntries with a radix sort (RadixSortIndices) that uses 16-bit passes when SIZE != 2; SIZE == 2 keeps the comparison sort.
- Increased the arena size from 1M to 16M buckets to accommodate larger datasets.
- Added operator< to Infix_t, which compares m_Data.
- Improved cache locality by copying every key into one contiguous buffer before the sorting passes, instead of reading the arena at random.
---
 src/dict/infix/infix_builder.cpp | 121 ++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 3 deletions(-)
diff --git a/src/dict/infix/infix_builder.cpp b/src/dict/infix/infix_builder.cpp
index ec0b9fee67..f41f5578f5 100644
--- a/src/dict/infix/infix_builder.cpp
+++ b/src/dict/infix/infix_builder.cpp
@@ -12,17 +12,20 @@
 
 #include "infix_builder.h"
 #include "sphinxint.h"
+#include "sphinxutils.h"
 
 #include "std/crc32.h"
 #include "fileio.h"
 
 #include <array>
+#include <cstring>
+#include <cstdint>
 
 //////////////////////////////////////////////////////////////////////////
 // KEYWORDS STORING DICTIONARY, INFIX HASH BUILDER
 //////////////////////////////////////////////////////////////////////////
 
-static constexpr int INFIX_ARENA_LENGTH = 1048576;
+static constexpr int INFIX_ARENA_LENGTH = 1 << 24; // 16M buckets (was 1M)
 
 template<int SIZE>
 struct Infix_t
@@ -42,6 +45,11 @@ struct Infix_t
 	{
 		return m_Data == rhs.m_Data;
 	}
+
+	bool operator< ( const Infix_t<SIZE>& rhs ) const noexcept
+	{
+		return m_Data < rhs.m_Data;
+	}
 };
 
 
@@ -346,6 +354,111 @@ void InfixBuilder_c<SIZE>::AddWord ( const BYTE* pWord, int iWordLength, int iCh
 	}
 }
 
+// Optimized radix sort using 16-bit (2-byte) passes to reduce memory scans
+// This reduces passes from 12 to 6 for SIZE=3, and from 20 to 10 for SIZE=5
+template<int SIZE>
+void RadixSortIndices ( CSphVector<int>& dIndex, const CSphSwapVector<InfixHashEntry_t<SIZE>>& dArena )
+{
+	int iCount = dIndex.GetLength();
+	if ( iCount < 2 )
+		return;
+
+	// Temporary arrays for radix sort
+	CSphVector<int> dTemp;
+	dTemp.Resize ( iCount );
+	int* pInput = dIndex.Begin();
+	int* pOutput = dTemp.Begin();
+
+	if constexpr ( SIZE == 2 )
+	{
+		// For SIZE=2: use std::sort fallback (SIZE=2 uses packed keys which need special handling)
+		// This optimization is mainly for SIZE=3 and SIZE=5
+		dIndex.Sort ( Lesser ( [&dArena] ( int a, int b ) noexcept { 
+			return dArena[a].m_tKey.m_Data < dArena[b].m_tKey.m_Data; 
+		} ) );
+		return;
+	}
+	else
+	{
+		// For SIZE=3 (12 bytes) or SIZE=5 (20 bytes): use optimized 16-bit passes
+		constexpr int KEY_SIZE = SIZE * sizeof(DWORD);
+		constexpr int WORD_COUNT = ( KEY_SIZE + 1 ) / 2; // Round up
+		
+		// Pre-extract all key words to improve cache locality
+		// This trades memory for better cache behavior by avoiding random dArena access
+		CSphVector<uint16_t> dKeyWords;
+		dKeyWords.Resize ( iCount * WORD_COUNT );
+		uint16_t* pKeyWords = dKeyWords.Begin();
+		
+		// Extract all key words in one sequential pass
+		for ( int i = 0; i < iCount; ++i )
+		{
+			int idx = pInput[i];
+			if ( idx < 1 || idx >= dArena.GetLength() )
+			{
+				// Zero out to prevent crash
+				memset ( pKeyWords + ( i * WORD_COUNT ), 0, KEY_SIZE );
+				continue;
+			}
+			const uint16_t* pKeyData = reinterpret_cast<const uint16_t*>( dArena[idx].m_tKey.m_Data.data() );
+			uint16_t* pDst = pKeyWords + ( i * WORD_COUNT );
+			memcpy ( pDst, pKeyData, KEY_SIZE );
+		}
+		
+		// Use heap-allocated arrays to avoid stack overflow (256KB each)
+		CSphVector<int> dCount, dPos;
+		dCount.Resize ( 65536 );
+		dPos.Resize ( 65536 );
+		int* aCount = dCount.Begin();
+		int* aPos = dPos.Begin();
+		
+		// Allocate temp key words buffer once outside the loop to avoid reallocation
+		CSphVector<uint16_t> dKeyWordsTemp;
+		dKeyWordsTemp.Resize ( iCount * WORD_COUNT );
+		
+		// Process 16-bit words from LSB to MSB
+		for ( int iWord = 0; iWord < WORD_COUNT; ++iWord )
+		{
+			// Clear count array efficiently
+			memset ( aCount, 0, 65536 * sizeof(int) );
+			
+			// Count phase - sequential access to pre-extracted keys
+			// After the first pass, pKeyWords points to reordered data, so we access it in order
+			for ( int i = 0; i < iCount; ++i )
+			{
+				uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord];
+				++aCount[uWord];
+			}
+			
+			// Convert to positions
+			aPos[0] = 0;
+			for ( int i = 1; i < 65536; ++i )
+				aPos[i] = aPos[i-1] + aCount[i-1];
+			
+			// Distribute phase - update indices and reorder key words
+			// Get pointer to temp buffer - this will be swapped with pKeyWords after distribution
+			// After first pass, pKeyWords points to reordered data, so we need to get the other buffer
+			uint16_t* pKeyWordsTemp = ( pKeyWords == dKeyWords.Begin() ) ? dKeyWordsTemp.Begin() : dKeyWords.Begin();
+			
+			for ( int i = 0; i < iCount; ++i )
+			{
+				uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord];
+				int iNewPos = aPos[uWord]++;
+				pOutput[iNewPos] = pInput[i];
+				// Copy key words to new position for next pass
+				memcpy ( pKeyWordsTemp + ( iNewPos * WORD_COUNT ), pKeyWords + ( i * WORD_COUNT ), KEY_SIZE );
+			}
+			
+			std::swap ( pInput, pOutput );
+			std::swap ( pKeyWords, pKeyWordsTemp );
+		}
+	}
+
+	// If final result is in temp buffer, copy back
+	if ( pInput == dTemp.Begin() )
+		memcpy ( dIndex.Begin(), dTemp.Begin(), iCount * sizeof(int) );
+}
+
 static inline int ZippedIntSize ( DWORD v ) noexcept
 {
 	if ( v < ( 1UL << 7 ) )
@@ -369,9 +482,11 @@ void InfixBuilder_c<SIZE>::SaveEntries ( CSphWriter& wrDict )
 	wrDict.PutBlob ( g_sTagInfixEntries );
 
 	CSphVector<int> dIndex;
-	dIndex.Resize ( m_dArena.GetLength() - 1 );
+	int iTotalEntries = m_dArena.GetLength() - 1;
+	dIndex.Resize ( iTotalEntries );
 	dIndex.FillSeq(1);
-	dIndex.Sort ( Lesser ( [this] ( int a, int b ) noexcept { return m_dArena[a].m_tKey.m_Data < m_dArena[b].m_tKey.m_Data; } ) );
+	// Use radix sort for O(n) performance instead of O(n log n)
+	RadixSortIndices<SIZE> ( dIndex, m_dArena );
 
 	m_dBlocksWords.Reserve ( m_dArena.GetLength() / INFIX_BLOCK_SIZE * sizeof ( DWORD ) * SIZE );
 	int iBlock = 0;

From 656f5db934ad25a3c2a586a4490c1167e030ca27 Mon Sep 17 00:00:00 2001
From: Sergey Nikolaev <prostuda@academ.org>
Date: Wed, 26 Nov 2025 11:14:06 +0700
Subject: [PATCH 2/3] fix: radix-sort infix keys byte-wise, last byte first

---
 src/dict/infix/infix_builder.cpp | 85 ++++++++++++++++----------------
 1 file changed, 42 insertions(+), 43 deletions(-)

diff --git a/src/dict/infix/infix_builder.cpp b/src/dict/infix/infix_builder.cpp
index f41f5578f5..05f43cac20 100644
--- a/src/dict/infix/infix_builder.cpp
+++ b/src/dict/infix/infix_builder.cpp
@@ -354,8 +354,9 @@ void InfixBuilder_c<SIZE>::AddWord ( const BYTE* pWord, int iWordLength, int iCh
 	}
 }
 
-// Optimized radix sort using 16-bit (2-byte) passes to reduce memory scans
-// This reduces passes from 12 to 6 for SIZE=3, and from 20 to 10 for SIZE=5
+// Radix sort for fixed-size keys using LSD (Least Significant Digit) algorithm
+// Processes bytes from least significant (last) to most significant (first)
+// This ensures exact compatibility with std::array::operator< lexicographic comparison
 template<int SIZE>
 void RadixSortIndices ( CSphVector<int>& dIndex, const CSphSwapVector<InfixHashEntry_t<SIZE>>& dArena )
 {
@@ -372,7 +373,6 @@ void RadixSortIndices ( CSphVector<int>& dIndex, const CSphSwapVector<InfixHashE
 	if constexpr ( SIZE == 2 )
 	{
 		// For SIZE=2: use std::sort fallback (SIZE=2 uses packed keys which need special handling)
-		// This optimization is mainly for SIZE=3 and SIZE=5
 		dIndex.Sort ( Lesser ( [&dArena] ( int a, int b ) noexcept { 
 			return dArena[a].m_tKey.m_Data < dArena[b].m_tKey.m_Data; 
 		} ) );
@@ -380,77 +380,76 @@ void RadixSortIndices ( CSphVector<int>& dIndex, const CSphSwapVector<InfixHashE
 	}
 	else
 	{
-		// For SIZE=3 (12 bytes) or SIZE=5 (20 bytes): use optimized 16-bit passes
+		// For SIZE=3 (12 bytes) or SIZE=5 (20 bytes): use byte-by-byte LSD radix sort
 		constexpr int KEY_SIZE = SIZE * sizeof(DWORD);
-		constexpr int WORD_COUNT = ( KEY_SIZE + 1 ) / 2; // Round up
 		
-		// Pre-extract all key words to improve cache locality
+		// Pre-extract all key bytes to improve cache locality
 		// This trades memory for better cache behavior by avoiding random dArena access
-		CSphVector<uint16_t> dKeyWords;
-		dKeyWords.Resize ( iCount * WORD_COUNT );
-		uint16_t* pKeyWords = dKeyWords.Begin();
+		CSphVector<BYTE> dKeyBytes;
+		dKeyBytes.Resize ( iCount * KEY_SIZE );
+		BYTE* pKeyBytes = dKeyBytes.Begin();
 		
-		// Extract all key words in one sequential pass
+		// Extract all key bytes in one sequential pass
 		for ( int i = 0; i < iCount; ++i )
 		{
 			int idx = pInput[i];
-			if ( idx < 1 || idx >= dArena.GetLength() )
+			if ( idx >= 1 && idx < dArena.GetLength() )
 			{
-				// Zero out to prevent crash
-				memset ( pKeyWords + ( i * WORD_COUNT ), 0, KEY_SIZE );
-				continue;
+				const BYTE* pSrc = dArena[idx].m_tKey.m_Data.data();
+				memcpy ( pKeyBytes + ( i * KEY_SIZE ), pSrc, KEY_SIZE );
+			}
+			else
+			{
+				memset ( pKeyBytes + ( i * KEY_SIZE ), 0, KEY_SIZE );
 			}
-			const uint16_t* pKeyData = reinterpret_cast<const uint16_t*>( dArena[idx].m_tKey.m_Data.data() );
-			uint16_t* pDst = pKeyWords + ( i * WORD_COUNT );
-			memcpy ( pDst, pKeyData, KEY_SIZE );
 		}
 		
-		// Use heap-allocated arrays to avoid stack overflow (256KB each)
+		// Use heap-allocated arrays to avoid stack overflow
 		CSphVector<int> dCount, dPos;
-		dCount.Resize ( 65536 );
-		dPos.Resize ( 65536 );
+		dCount.Resize ( 256 );
+		dPos.Resize ( 256 );
 		int* aCount = dCount.Begin();
 		int* aPos = dPos.Begin();
 		
-		// Allocate temp key words buffer once outside the loop to avoid reallocation
-		CSphVector<uint16_t> dKeyWordsTemp;
-		dKeyWordsTemp.Resize ( iCount * WORD_COUNT );
+		// Allocate temp key bytes buffer once outside the loop
+		CSphVector<BYTE> dKeyBytesTemp;
+		dKeyBytesTemp.Resize ( iCount * KEY_SIZE );
 		
-		// Process 16-bit words from LSB to MSB
-		for ( int iWord = 0; iWord < WORD_COUNT; ++iWord )
+		// Process bytes from last (least significant) to first (most significant)
+		// This is LSD radix sort: stable and produces correct lexicographic ordering
+		for ( int iByte = KEY_SIZE - 1; iByte >= 0; --iByte )
 		{
-			// Clear count array efficiently
-			memset ( aCount, 0, 65536 * sizeof(int) );
+			// Clear count array
+			memset ( aCount, 0, 256 * sizeof(int) );
 			
-			// Count phase - sequential access to pre-extracted keys
-			// After the first pass, pKeyWords points to reordered data, so we access it in order
+			// Count phase: sequential access to pre-extracted keys
 			for ( int i = 0; i < iCount; ++i )
 			{
-				uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord];
-				++aCount[uWord];
+				BYTE uByte = pKeyBytes[i * KEY_SIZE + iByte];
+				++aCount[uByte];
 			}
 			
-			// Convert to positions
+			// Convert counts to starting positions (cumulative sum)
 			aPos[0] = 0;
-			for ( int i = 1; i < 65536; ++i )
+			for ( int i = 1; i < 256; ++i )
 				aPos[i] = aPos[i-1] + aCount[i-1];
 			
-			// Distribute phase - update indices and reorder key words
-			// Get pointer to temp buffer - this will be swapped with pKeyWords after distribution
-			// After first pass, pKeyWords points to reordered data, so we need to get the other buffer
-			uint16_t* pKeyWordsTemp = ( pKeyWords == dKeyWords.Begin() ) ? dKeyWordsTemp.Begin() : dKeyWords.Begin();
+			// Distribute phase: reorder indices and key bytes
+			// Get pointer to temp buffer - this will be swapped with pKeyBytes after distribution
+			BYTE* pKeyBytesTemp = ( pKeyBytes == dKeyBytes.Begin() ) ? dKeyBytesTemp.Begin() : dKeyBytes.Begin();
 			
 			for ( int i = 0; i < iCount; ++i )
 			{
-				uint16_t uWord = pKeyWords[i * WORD_COUNT + iWord];
-				int iNewPos = aPos[uWord]++;
+				BYTE uByte = pKeyBytes[i * KEY_SIZE + iByte];
+				int iNewPos = aPos[uByte]++;
 				pOutput[iNewPos] = pInput[i];
-				// Copy key words to new position for next pass
-				memcpy ( pKeyWordsTemp + ( iNewPos * WORD_COUNT ), pKeyWords + ( i * WORD_COUNT ), KEY_SIZE );
+				// Copy key bytes to new position for next pass
+				memcpy ( pKeyBytesTemp + ( iNewPos * KEY_SIZE ), pKeyBytes + ( i * KEY_SIZE ), KEY_SIZE );
 			}
 			
+			// Swap input/output buffers and key byte buffers for next iteration
 			std::swap ( pInput, pOutput );
-			std::swap ( pKeyWords, pKeyWordsTemp );
+			std::swap ( pKeyBytes, pKeyBytesTemp );
 		}
 	}
 
@@ -646,4 +645,4 @@ std::unique_ptr<ISphInfixBuilder> sphCreateInfixBuilder ( int iCodepointBytes, C
 	case 3: return std::make_unique<InfixBuilder_c<5>>(); // upto 6x3 bytes, 5 dwords, utf-8
 	default: pError->SetSprintf ( "unhandled max infix codepoint size %d", iCodepointBytes ); return nullptr;
 	}
-}
\ No newline at end of file
+}

From de6b0d45364f2b2db18d7eecafe2faa76c190ff2 Mon Sep 17 00:00:00 2001
From: Sergey Nikolaev <prostuda@academ.org>
Date: Wed, 26 Nov 2025 12:10:59 +0700
Subject: [PATCH 3/3] chore: reword comments, drop unused local in SaveEntries

---
 src/dict/infix/infix_builder.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/dict/infix/infix_builder.cpp b/src/dict/infix/infix_builder.cpp
index 05f43cac20..d670b227ba 100644
--- a/src/dict/infix/infix_builder.cpp
+++ b/src/dict/infix/infix_builder.cpp
@@ -25,7 +25,7 @@
 // KEYWORDS STORING DICTIONARY, INFIX HASH BUILDER
 //////////////////////////////////////////////////////////////////////////
 
-static constexpr int INFIX_ARENA_LENGTH = 1 << 24; // 16M buckets (was 1M)
+static constexpr int INFIX_ARENA_LENGTH = 1 << 24; // 16M buckets provides better performance than 1M which was used previously
 
 template<int SIZE>
 struct Infix_t
@@ -481,10 +481,9 @@ void InfixBuilder_c<SIZE>::SaveEntries ( CSphWriter& wrDict )
 	wrDict.PutBlob ( g_sTagInfixEntries );
 
 	CSphVector<int> dIndex;
-	int iTotalEntries = m_dArena.GetLength() - 1;
-	dIndex.Resize ( iTotalEntries );
+	dIndex.Resize ( m_dArena.GetLength() - 1 );
 	dIndex.FillSeq(1);
-	// Use radix sort for O(n) performance instead of O(n log n)
+	// Use radix sort for O(n) performance instead of O(n log n) which std::sort provides
 	RadixSortIndices<SIZE> ( dIndex, m_dArena );
 
 	m_dBlocksWords.Reserve ( m_dArena.GetLength() / INFIX_BLOCK_SIZE * sizeof ( DWORD ) * SIZE );