Skip to content

Commit a2cf070

Browse files
authored
Merge pull request #60 from rryam/audit-fix-hardening
Harden storage consistency and validation
2 parents 6588a07 + c8b499f commit a2cf070

21 files changed

+481
-121
lines changed

Docs/INDEXED_STORAGE_GUIDE.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ let vectura = try await VecturaKit(config: config)
4747
**Behavior:**
4848
- < 10,000 documents → Uses `fullMemory` mode
4949
- ≥ 10,000 documents → Uses `indexed` mode (if storage supports it) with default parameters:
50-
- `candidateMultiplier: 10`
50+
- `candidateMultiplier: 4`
5151
- `batchSize: 100`
5252
- `maxConcurrentBatches: 4`
5353

@@ -79,7 +79,7 @@ Use indexed mode for large datasets:
7979
let config = VecturaConfig(
8080
name: "my-database",
8181
memoryStrategy: .indexed(
82-
candidateMultiplier: 10 // Search 10× topK candidates
82+
candidateMultiplier: 4 // Search 4× topK candidates
8383
)
8484
)
8585

Package.resolved

Lines changed: 15 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Package.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ let package = Package(
3737
dependencies: [
3838
.package(url: "https://github.com/jkrukowski/swift-embeddings.git", from: "0.0.21"),
3939
.package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.4.0"),
40-
.package(url: "https://github.com/ml-explore/mlx-swift-lm/", from: "2.29.2"),
40+
.package(url: "https://github.com/ml-explore/mlx-swift-lm/", from: "2.30.3"),
4141
],
4242
targets: [
4343
.target(

Sources/VecturaCLI/VecturaCLI.swift

Lines changed: 15 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ struct VecturaCLI: AsyncParsableCommand {
6565
}
6666
}
6767

68-
static func setupDB(dbName: String, dimension: Int, numResults: Int, threshold: Float, modelId: String) async throws
68+
static func setupDB(dbName: String, dimension: Int?, numResults: Int, threshold: Float, modelId: String) async throws
6969
-> VecturaKit {
7070
let config = try VecturaConfig(
7171
name: dbName,
@@ -90,8 +90,8 @@ extension VecturaCLI {
9090
@Option(name: [.long, .customShort("d")], help: "Database name")
9191
var dbName: String = "vectura-cli-demo-db"
9292

93-
@Option(name: [.long, .customShort("v")], help: "Vector dimension")
94-
var dimension: Int = 512
93+
@Option(name: [.long, .customShort("v")], help: "Vector dimension (auto-detected if not specified)")
94+
var dimension: Int?
9595

9696
@Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold for searches")
9797
var threshold: Float = 0.5
@@ -252,7 +252,8 @@ extension VecturaCLI {
252252
print("Total search queries: \(searchQueries.count)")
253253
print("Average search time: \(String(format: "%.1f", avgSearchTime * 1000))ms")
254254
print("Model: \(modelId)")
255-
print("Vector dimension: \(dimension)")
255+
let dimensionDescription = dimension.map(String.init) ?? "auto-detected"
256+
print("Vector dimension: \(dimensionDescription)")
256257
}
257258

258259
private func loadMockDataset() throws -> MockDataset {
@@ -275,8 +276,8 @@ extension VecturaCLI {
275276
@Option(name: [.long, .customShort("d")], help: "Database name")
276277
var dbName: String = "vectura-cli-db"
277278

278-
@Option(name: [.long, .customShort("v")], help: "Vector dimension")
279-
var dimension: Int = 384
279+
@Option(name: [.long, .customShort("v")], help: "Vector dimension (auto-detected if not specified)")
280+
var dimension: Int?
280281

281282
@Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
282283
var modelId: String = "sentence-transformers/all-MiniLM-L6-v2"
@@ -310,8 +311,8 @@ extension VecturaCLI {
310311
@Option(name: [.long, .customShort("d")], help: "Database name")
311312
var dbName: String = "vectura-cli-db"
312313

313-
@Option(name: [.long, .customShort("v")], help: "Vector dimension")
314-
var dimension: Int = 384
314+
@Option(name: [.long, .customShort("v")], help: "Vector dimension (auto-detected if not specified)")
315+
var dimension: Int?
315316

316317
@Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold")
317318
var threshold: Float = 0.7
@@ -363,8 +364,8 @@ extension VecturaCLI {
363364
@Option(name: [.long, .customShort("d")], help: "Database name")
364365
var dbName: String = "vectura-cli-db"
365366

366-
@Option(name: [.long, .customShort("v")], help: "Vector dimension")
367-
var dimension: Int = 384
367+
@Option(name: [.long, .customShort("v")], help: "Vector dimension (auto-detected if not specified)")
368+
var dimension: Int?
368369

369370
@Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
370371
var modelId: String = "sentence-transformers/all-MiniLM-L6-v2"
@@ -396,8 +397,8 @@ extension VecturaCLI {
396397
@Option(name: [.long, .customShort("d")], help: "Database name")
397398
var dbName: String = "vectura-cli-db"
398399

399-
@Option(name: [.long, .customShort("v")], help: "Vector dimension")
400-
var dimension: Int = 384
400+
@Option(name: [.long, .customShort("v")], help: "Vector dimension (auto-detected if not specified)")
401+
var dimension: Int?
401402

402403
@Argument(help: "Document IDs to delete")
403404
var ids: [DocumentID]
@@ -423,8 +424,8 @@ extension VecturaCLI {
423424
@Option(name: [.long, .customShort("d")], help: "Database name")
424425
var dbName: String = "vectura-cli-db"
425426

426-
@Option(name: [.long, .customShort("v")], help: "Vector dimension")
427-
var dimension: Int = 384
427+
@Option(name: [.long, .customShort("v")], help: "Vector dimension (auto-detected if not specified)")
428+
var dimension: Int?
428429

429430
mutating func run() async throws {
430431
let db = try await VecturaCLI.setupDB(

Sources/VecturaKit/Core/VecturaConfig.swift

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,36 @@ public struct VecturaConfig: Sendable {
166166
searchOptions: SearchOptions = .init(),
167167
memoryStrategy: MemoryStrategy = .automatic()
168168
) throws {
169+
let trimmedName = name.trimmingCharacters(in: .whitespacesAndNewlines)
170+
guard !trimmedName.isEmpty else {
171+
throw VecturaError.invalidInput("Database name cannot be empty or whitespace")
172+
}
173+
guard trimmedName != "." && trimmedName != ".." else {
174+
throw VecturaError.invalidInput("Database name cannot be '.' or '..'")
175+
}
176+
guard !trimmedName.contains("/") && !trimmedName.contains("\\") else {
177+
throw VecturaError.invalidInput("Database name cannot contain path separators")
178+
}
179+
169180
// Validate search options
181+
guard searchOptions.defaultNumResults > 0 else {
182+
throw VecturaError.invalidInput(
183+
"defaultNumResults must be greater than 0, got \(searchOptions.defaultNumResults)"
184+
)
185+
}
186+
if let threshold = searchOptions.minThreshold {
187+
guard threshold >= 0.0 && threshold <= 1.0 else {
188+
throw VecturaError.invalidInput(
189+
"minThreshold must be between 0.0 and 1.0, got \(threshold)"
190+
)
191+
}
192+
}
193+
guard searchOptions.k1 > 0 else {
194+
throw VecturaError.invalidInput("k1 must be greater than 0, got \(searchOptions.k1)")
195+
}
196+
guard searchOptions.b >= 0.0 && searchOptions.b <= 1.0 else {
197+
throw VecturaError.invalidInput("b must be between 0.0 and 1.0, got \(searchOptions.b)")
198+
}
170199
guard searchOptions.bm25NormalizationFactor > 0 else {
171200
throw VecturaError.invalidInput(
172201
"bm25NormalizationFactor must be positive, got \(searchOptions.bm25NormalizationFactor)"
@@ -190,7 +219,7 @@ public struct VecturaConfig: Sendable {
190219
break
191220
}
192221

193-
self.name = name
222+
self.name = trimmedName
194223
self.directoryURL = directoryURL
195224
self.dimension = dimension
196225
self.searchOptions = searchOptions

Sources/VecturaKit/Core/VecturaKit.swift

Lines changed: 72 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -147,12 +147,82 @@ public actor VecturaKit {
147147
documentIds.append(docId)
148148
}
149149

150+
let existingDocumentsById: [UUID: VecturaDocument]
151+
let idsToRestore = Set(documentIds)
152+
if idsToRestore.isEmpty {
153+
existingDocumentsById = [:]
154+
} else if let indexedStorage = storageProvider as? IndexedVecturaStorage {
155+
existingDocumentsById = try await indexedStorage.loadDocuments(ids: Array(idsToRestore))
156+
} else if ids != nil {
157+
let existingDocs = try await storageProvider.loadDocuments()
158+
existingDocumentsById = existingDocs.reduce(into: [:]) { dict, doc in
159+
if idsToRestore.contains(doc.id) {
160+
dict[doc.id] = doc
161+
}
162+
}
163+
} else {
164+
existingDocumentsById = [:]
165+
}
166+
150167
// Save documents to storage (storage provider handles batch concurrency)
151168
try await storageProvider.saveDocuments(documentsToSave)
152169

153170
// Notify search engine to index documents
154-
for doc in documentsToSave {
155-
try await searchEngine.indexDocument(doc)
171+
var indexedDocumentIDs: [UUID] = []
172+
indexedDocumentIDs.reserveCapacity(documentsToSave.count)
173+
174+
do {
175+
for doc in documentsToSave {
176+
try await searchEngine.indexDocument(doc)
177+
indexedDocumentIDs.append(doc.id)
178+
}
179+
} catch {
180+
Self.logger.error("Indexing failed after saving documents: \(error.localizedDescription)")
181+
182+
for id in indexedDocumentIDs {
183+
do {
184+
try await searchEngine.removeDocument(id: id)
185+
} catch {
186+
Self.logger.warning(
187+
"Failed to rollback search index for \(id): \(error.localizedDescription)"
188+
)
189+
}
190+
}
191+
192+
for doc in documentsToSave {
193+
if let existingDoc = existingDocumentsById[doc.id] {
194+
do {
195+
try await storageProvider.updateDocument(existingDoc)
196+
} catch {
197+
Self.logger.warning(
198+
"Failed to restore stored document \(doc.id): \(error.localizedDescription)"
199+
)
200+
}
201+
} else {
202+
do {
203+
try await storageProvider.deleteDocument(withID: doc.id)
204+
} catch {
205+
Self.logger.warning(
206+
"Failed to rollback stored document \(doc.id): \(error.localizedDescription)"
207+
)
208+
}
209+
}
210+
}
211+
212+
for id in indexedDocumentIDs {
213+
guard let existingDoc = existingDocumentsById[id] else {
214+
continue
215+
}
216+
do {
217+
try await searchEngine.indexDocument(existingDoc)
218+
} catch {
219+
Self.logger.warning(
220+
"Failed to restore search index for \(id): \(error.localizedDescription)"
221+
)
222+
}
223+
}
224+
225+
throw error
156226
}
157227

158228
return documentIds

Sources/VecturaKit/SearchEngine/BM25Index.swift

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -179,6 +179,9 @@ public actor BM25Index {
179179
/// - Returns: Array of tuples containing lightweight documents and their BM25 scores
180180
public func search(query: String, topK: Int = 10) -> [(document: BM25Document, score: Float)] {
181181
let queryTerms = tokenize(query)
182+
guard !queryTerms.isEmpty else {
183+
return []
184+
}
182185
var scores: [(BM25Document, Float)] = []
183186

184187
for document in documents.values {
@@ -207,10 +210,12 @@ public actor BM25Index {
207210
scores.append((document, score))
208211
}
209212

210-
return scores
211-
.sorted { $0.1 > $1.1 }
212-
.prefix(topK)
213-
.filter { $0.1 > 0 }
213+
return Array(
214+
scores
215+
.sorted { $0.1 > $1.1 }
216+
.filter { $0.1 > 0 }
217+
.prefix(topK)
218+
)
214219
}
215220

216221
/// Add a new document to the index incrementally

0 commit comments

Comments
 (0)