@@ -65,6 +65,7 @@ public actor BM25Index {
6565 private var documentFrequencies : [ String : Int ]
6666 private var documentLengths : [ UUID : Int ]
6767 private var documentTokens : [ UUID : [ String ] ]
68+ private var documentTermFrequencies : [ UUID : [ String : Int ] ]
6869 private var averageDocumentLength : Float
6970
7071 /// Creates a new BM25 index for the given documents
@@ -78,7 +79,14 @@ public actor BM25Index {
7879 self . b = b
7980 // Convert to lightweight BM25Document to reduce memory usage
8081 let lightweightDocs = documents. map { BM25Document ( from: $0) }
81- ( self . documents, self . documentFrequencies, self . documentLengths, self . documentTokens, self . averageDocumentLength) =
82+ (
83+ self . documents,
84+ self . documentFrequencies,
85+ self . documentLengths,
86+ self . documentTokens,
87+ self . documentTermFrequencies,
88+ self . averageDocumentLength
89+ ) =
8290 Self . buildIndex ( from: lightweightDocs)
8391 }
8492
@@ -100,16 +108,19 @@ public actor BM25Index {
100108 // Initialize empty dictionaries first
101109 var tempTokens : [ UUID : [ String ] ] = [ : ]
102110 var tempLengths : [ UUID : Int ] = [ : ]
111+ var tempTermFrequencies : [ UUID : [ String : Int ] ] = [ : ]
103112
104113 // Tokenize once per document and cache for later reuse
105114 for (id, document) in self . documents {
106115 let tokens = tokenize ( document. text)
107116 tempTokens [ id] = tokens
108117 tempLengths [ id] = tokens. count
118+ tempTermFrequencies [ id] = Self . termFrequencyMap ( tokens: tokens)
109119 }
110120
111121 self . documentTokens = tempTokens
112122 self . documentLengths = tempLengths
123+ self . documentTermFrequencies = tempTermFrequencies
113124
114125 // Guard against division by zero when documents array is empty
115126 if self . documents. isEmpty {
@@ -134,6 +145,7 @@ public actor BM25Index {
134145 [ String : Int ] ,
135146 [ UUID : Int ] ,
136147 [ UUID : [ String ] ] ,
148+ [ UUID : [ String : Int ] ] ,
137149 Float
138150 ) {
139151 // Use reduce to handle duplicate IDs gracefully (keep last occurrence)
@@ -143,12 +155,14 @@ public actor BM25Index {
143155 var documentFrequencies : [ String : Int ] = [ : ]
144156 var documentLengths : [ UUID : Int ] = [ : ]
145157 var documentTokens : [ UUID : [ String ] ] = [ : ]
158+ var documentTermFrequencies : [ UUID : [ String : Int ] ] = [ : ]
146159
147160 // Tokenize once per document and cache for later reuse
148161 for (id, document) in docsMap {
149162 let tokens = tokenize ( document. text)
150163 documentTokens [ id] = tokens
151164 documentLengths [ id] = tokens. count
165+ documentTermFrequencies [ id] = termFrequencyMap ( tokens: tokens)
152166 }
153167
154168 // Calculate average document length
@@ -167,7 +181,20 @@ public actor BM25Index {
167181 }
168182 }
169183
170- return ( docsMap, documentFrequencies, documentLengths, documentTokens, averageDocumentLength)
184+ return (
185+ docsMap,
186+ documentFrequencies,
187+ documentLengths,
188+ documentTokens,
189+ documentTermFrequencies,
190+ averageDocumentLength
191+ )
192+ }
193+
/// Counts how many times each distinct token occurs in `tokens`.
///
/// - Parameter tokens: The tokens to tally; may contain repeated entries.
/// - Returns: A dictionary mapping each token to its occurrence count.
///   The dictionary is empty when `tokens` is empty.
194+ private static func termFrequencyMap( tokens: [ String ] ) -> [ String : Int ] {
// Accumulate into a single dictionary that starts out empty.
195+ tokens. reduce ( into: [ : ] ) { counts, token in
// A token seen for the first time starts from the default of 0 before being incremented.
196+ counts [ token, default: 0 ] += 1
197+ }
171198 }
172199 // swiftlint:enable large_tuple
173200
@@ -182,29 +209,41 @@ public actor BM25Index {
182209 guard !queryTerms. isEmpty else {
183210 return [ ]
184211 }
212+ let queryTermFrequencies = queryTerms. reduce ( into: [ String: Int] ( ) ) { counts, term in
213+ counts [ term, default: 0 ] += 1
214+ }
215+ let avgDocLength = max ( averageDocumentLength, 1e-9 )
216+ let documentCount = Float ( documents. count)
217+ var queryIDFs : [ String : Float ] = [ : ]
218+ queryIDFs. reserveCapacity ( queryTermFrequencies. count)
219+
220+ for term in queryTermFrequencies. keys {
221+ let df = Float ( documentFrequencies [ term] ?? 0 )
222+ let idfArgument = ( documentCount - df + 0.5 ) / ( df + 0.5 )
223+ queryIDFs [ term] = log ( max ( idfArgument, 1e-9 ) )
224+ }
225+
185226 var scores : [ ( BM25Document , Float ) ] = [ ]
227+ scores. reserveCapacity ( documents. count)
186228
187229 for document in documents. values {
188230 let docLength = Float ( documentLengths [ document. id] ?? 0 )
189231 var score : Float = 0.0
190232
191- // Use cached tokens instead of re-tokenizing
192- let docTokens = documentTokens [ document. id] ?? [ ]
193- let docTokenCounts = Dictionary ( grouping: docTokens, by: { $0 } ) . mapValues { Float ( $0. count) }
233+ let docTermFrequencies = documentTermFrequencies [ document. id] ?? [ : ]
234+ let lengthNormalization = k1 * ( 1 - b + b * docLength / avgDocLength)
194235
195- for term in queryTerms {
196- let tf = docTokenCounts [ term] ?? 0
197- let df = Float ( documentFrequencies [ term] ?? 0 )
198-
199- // Ensure argument to log is positive for numerical stability
200- let idfArgument = ( Float ( documents. count) - df + 0.5 ) / ( df + 0.5 )
201- let idf = log ( max ( idfArgument, 1e-9 ) )
236+ for (term, queryTermCount) in queryTermFrequencies {
237+ let tf = Float ( docTermFrequencies [ term] ?? 0 )
238+ guard tf > 0 else {
239+ continue
240+ }
202241
242+ let idf = queryIDFs [ term] ?? 0
203243 let numerator = tf * ( k1 + 1 )
204- let avgDocLen = max ( averageDocumentLength, 1e-9 ) // Prevent division by zero
205- let denominator = tf + k1 * ( 1 - b + b * docLength / avgDocLen)
244+ let denominator = tf + lengthNormalization
206245
207- score += idf * ( numerator / denominator)
246+ score += Float ( queryTermCount ) * idf * ( numerator / denominator)
208247 }
209248
210249 scores. append ( ( document, score) )
@@ -246,6 +285,7 @@ public actor BM25Index {
246285
247286 documentLengths. removeValue ( forKey: documentID)
248287 documentTokens. removeValue ( forKey: documentID)
288+ documentTermFrequencies. removeValue ( forKey: documentID)
249289 updateAverageDocumentLength ( )
250290 }
251291
@@ -276,6 +316,7 @@ public actor BM25Index {
276316 // Tokenize once and cache for later reuse
277317 let tokens = tokenize ( document. text)
278318 documentTokens [ document. id] = tokens
319+ documentTermFrequencies [ document. id] = Self . termFrequencyMap ( tokens: tokens)
279320 let length = tokens. count
280321 documentLengths [ document. id] = length
281322
@@ -302,6 +343,7 @@ public actor BM25Index {
302343 documentFrequencies. removeAll ( )
303344 documentLengths. removeAll ( )
304345 documentTokens. removeAll ( )
346+ documentTermFrequencies. removeAll ( )
305347 averageDocumentLength = 0
306348 }
307349
0 commit comments