Skip to content

Commit 9613630

Browse files
authored
Merge pull request #74 from rryam/perf/core-speed-optimizations
Improve storage and BM25 search performance
2 parents 0b2beed + 7e33fcd commit 9613630

File tree

3 files changed

+76
-24
lines changed

3 files changed

+76
-24
lines changed

Sources/VecturaKit/Core/VecturaKit.swift

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -307,15 +307,8 @@ public actor VecturaKit {
307307
b: config.searchOptions.b
308308
)
309309

310-
// Determine if we should unload text index after search
311-
// This is enabled for indexed memory strategy to minimize memory footprint
312-
let shouldUnloadTextIndex: Bool
313-
switch config.memoryStrategy {
314-
case .indexed:
315-
shouldUnloadTextIndex = true
316-
case .automatic, .fullMemory:
317-
shouldUnloadTextIndex = false
318-
}
310+
// Keep BM25 index warm across queries to avoid rebuilding on every search.
311+
let shouldUnloadTextIndex = false
319312

320313
// Combine into hybrid search engine
321314
return HybridSearchEngine(

Sources/VecturaKit/SearchEngine/BM25Index.swift

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ public actor BM25Index {
6565
private var documentFrequencies: [String: Int]
6666
private var documentLengths: [UUID: Int]
6767
private var documentTokens: [UUID: [String]]
68+
private var documentTermFrequencies: [UUID: [String: Int]]
6869
private var averageDocumentLength: Float
6970

7071
/// Creates a new BM25 index for the given documents
@@ -78,7 +79,14 @@ public actor BM25Index {
7879
self.b = b
7980
// Convert to lightweight BM25Document to reduce memory usage
8081
let lightweightDocs = documents.map { BM25Document(from: $0) }
81-
(self.documents, self.documentFrequencies, self.documentLengths, self.documentTokens, self.averageDocumentLength) =
82+
(
83+
self.documents,
84+
self.documentFrequencies,
85+
self.documentLengths,
86+
self.documentTokens,
87+
self.documentTermFrequencies,
88+
self.averageDocumentLength
89+
) =
8290
Self.buildIndex(from: lightweightDocs)
8391
}
8492

@@ -100,16 +108,19 @@ public actor BM25Index {
100108
// Initialize empty dictionaries first
101109
var tempTokens: [UUID: [String]] = [:]
102110
var tempLengths: [UUID: Int] = [:]
111+
var tempTermFrequencies: [UUID: [String: Int]] = [:]
103112

104113
// Tokenize once per document and cache for later reuse
105114
for (id, document) in self.documents {
106115
let tokens = tokenize(document.text)
107116
tempTokens[id] = tokens
108117
tempLengths[id] = tokens.count
118+
tempTermFrequencies[id] = Self.termFrequencyMap(tokens: tokens)
109119
}
110120

111121
self.documentTokens = tempTokens
112122
self.documentLengths = tempLengths
123+
self.documentTermFrequencies = tempTermFrequencies
113124

114125
// Guard against division by zero when documents array is empty
115126
if self.documents.isEmpty {
@@ -134,6 +145,7 @@ public actor BM25Index {
134145
[String: Int],
135146
[UUID: Int],
136147
[UUID: [String]],
148+
[UUID: [String: Int]],
137149
Float
138150
) {
139151
// Use reduce to handle duplicate IDs gracefully (keep last occurrence)
@@ -143,12 +155,14 @@ public actor BM25Index {
143155
var documentFrequencies: [String: Int] = [:]
144156
var documentLengths: [UUID: Int] = [:]
145157
var documentTokens: [UUID: [String]] = [:]
158+
var documentTermFrequencies: [UUID: [String: Int]] = [:]
146159

147160
// Tokenize once per document and cache for later reuse
148161
for (id, document) in docsMap {
149162
let tokens = tokenize(document.text)
150163
documentTokens[id] = tokens
151164
documentLengths[id] = tokens.count
165+
documentTermFrequencies[id] = termFrequencyMap(tokens: tokens)
152166
}
153167

154168
// Calculate average document length
@@ -167,7 +181,20 @@ public actor BM25Index {
167181
}
168182
}
169183

170-
return (docsMap, documentFrequencies, documentLengths, documentTokens, averageDocumentLength)
184+
return (
185+
docsMap,
186+
documentFrequencies,
187+
documentLengths,
188+
documentTokens,
189+
documentTermFrequencies,
190+
averageDocumentLength
191+
)
192+
}
193+
194+
/// Counts how many times each token occurs in `tokens`.
///
/// - Parameter tokens: The token sequence to count; repeated tokens accumulate.
/// - Returns: A dictionary mapping each distinct token to its occurrence count.
///   An empty `tokens` array yields an empty dictionary.
private static func termFrequencyMap(tokens: [String]) -> [String: Int] {
195+
tokens.reduce(into: [:]) { counts, token in
196+
counts[token, default: 0] += 1
197+
}
171198
}
172199
// swiftlint:enable large_tuple
173200

@@ -182,29 +209,41 @@ public actor BM25Index {
182209
guard !queryTerms.isEmpty else {
183210
return []
184211
}
212+
let queryTermFrequencies = queryTerms.reduce(into: [String: Int]()) { counts, term in
213+
counts[term, default: 0] += 1
214+
}
215+
let avgDocLength = max(averageDocumentLength, 1e-9)
216+
let documentCount = Float(documents.count)
217+
var queryIDFs: [String: Float] = [:]
218+
queryIDFs.reserveCapacity(queryTermFrequencies.count)
219+
220+
for term in queryTermFrequencies.keys {
221+
let df = Float(documentFrequencies[term] ?? 0)
222+
let idfArgument = (documentCount - df + 0.5) / (df + 0.5)
223+
queryIDFs[term] = log(max(idfArgument, 1e-9))
224+
}
225+
185226
var scores: [(BM25Document, Float)] = []
227+
scores.reserveCapacity(documents.count)
186228

187229
for document in documents.values {
188230
let docLength = Float(documentLengths[document.id] ?? 0)
189231
var score: Float = 0.0
190232

191-
// Use cached tokens instead of re-tokenizing
192-
let docTokens = documentTokens[document.id] ?? []
193-
let docTokenCounts = Dictionary(grouping: docTokens, by: { $0 }).mapValues { Float($0.count) }
233+
let docTermFrequencies = documentTermFrequencies[document.id] ?? [:]
234+
let lengthNormalization = k1 * (1 - b + b * docLength / avgDocLength)
194235

195-
for term in queryTerms {
196-
let tf = docTokenCounts[term] ?? 0
197-
let df = Float(documentFrequencies[term] ?? 0)
198-
199-
// Ensure argument to log is positive for numerical stability
200-
let idfArgument = (Float(documents.count) - df + 0.5) / (df + 0.5)
201-
let idf = log(max(idfArgument, 1e-9))
236+
for (term, queryTermCount) in queryTermFrequencies {
237+
let tf = Float(docTermFrequencies[term] ?? 0)
238+
guard tf > 0 else {
239+
continue
240+
}
202241

242+
let idf = queryIDFs[term] ?? 0
203243
let numerator = tf * (k1 + 1)
204-
let avgDocLen = max(averageDocumentLength, 1e-9) // Prevent division by zero
205-
let denominator = tf + k1 * (1 - b + b * docLength / avgDocLen)
244+
let denominator = tf + lengthNormalization
206245

207-
score += idf * (numerator / denominator)
246+
score += Float(queryTermCount) * idf * (numerator / denominator)
208247
}
209248

210249
scores.append((document, score))
@@ -246,6 +285,7 @@ public actor BM25Index {
246285

247286
documentLengths.removeValue(forKey: documentID)
248287
documentTokens.removeValue(forKey: documentID)
288+
documentTermFrequencies.removeValue(forKey: documentID)
249289
updateAverageDocumentLength()
250290
}
251291

@@ -276,6 +316,7 @@ public actor BM25Index {
276316
// Tokenize once and cache for later reuse
277317
let tokens = tokenize(document.text)
278318
documentTokens[document.id] = tokens
319+
documentTermFrequencies[document.id] = Self.termFrequencyMap(tokens: tokens)
279320
let length = tokens.count
280321
documentLengths[document.id] = length
281322

@@ -302,6 +343,7 @@ public actor BM25Index {
302343
documentFrequencies.removeAll()
303344
documentLengths.removeAll()
304345
documentTokens.removeAll()
346+
documentTermFrequencies.removeAll()
305347
averageDocumentLength = 0
306348
}
307349

Sources/VecturaKit/Storage/FileStorageProvider.swift

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,23 @@ extension FileStorageProvider: VecturaStorage {
7373
return documents
7474
}
7575

76+
/// Returns the total document count without decoding any document files.
///
/// When caching is enabled and the cache is non-empty, the cache's entry count is
/// returned directly. Otherwise the storage directory is listed and every entry
/// whose path extension is `json` (compared case-insensitively) is counted.
///
/// - Returns: The number of cached documents, or the number of `.json` entries
///   found in `storageDirectory`.
/// - Throws: Any error thrown by
///   `FileManager.contentsOfDirectory(at:includingPropertiesForKeys:)`.
///
/// NOTE(review): the cached count is trusted whenever the cache is non-empty;
/// presumably the cache then mirrors everything on disk — confirm against how
/// the cache is populated.
77+
public func getTotalDocumentCount() async throws -> Int {
78+
// Fast path: answer from the in-memory cache without touching the file system.
if cacheEnabled && !cache.isEmpty {
79+
return cache.count
80+
}
81+
82+
// Fallback: list the directory; file contents are never read or decoded here.
let fileURLs = try FileManager.default.contentsOfDirectory(
83+
at: storageDirectory,
84+
includingPropertiesForKeys: nil
85+
)
86+
return fileURLs.reduce(into: 0) { count, fileURL in
87+
// Lowercasing makes the extension match case-insensitive (e.g. `.JSON`).
if fileURL.pathExtension.lowercased() == "json" {
88+
count += 1
89+
}
90+
}
91+
}
92+
7693
/// Saves a document and updates cache
7794
public func saveDocument(_ document: VecturaDocument) async throws {
7895
try await saveDocumentToStorage(document)

0 commit comments

Comments
 (0)