Skip to content

Commit d6b515a

Browse files
committed
feat(core,cli): implement LanceDB upsert and compact command
- Update LanceDBVectorStore to use the mergeInsert API for upsert operations
- Add a scalar index on the 'id' column to speed up merge operations
- Implement an optimize() method for vector store compaction
- Add a new 'dev compact' CLI command for database optimization
- Prevent duplicate entries when re-indexing a repository

This resolves the duplicate search results issue by using LanceDB's merge_insert API instead of simple append operations.

The compact command allows users to optimize their vector store by merging fragments and updating indices for better query performance.
1 parent 8195010 commit d6b515a

File tree

5 files changed

+151
-3
lines changed

5 files changed

+151
-3
lines changed

packages/cli/src/cli.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import chalk from 'chalk';
44
import { Command } from 'commander';
55
import { cleanCommand } from './commands/clean.js';
6+
import { compactCommand } from './commands/compact.js';
67
import { exploreCommand } from './commands/explore.js';
78
import { ghCommand } from './commands/gh.js';
89
import { indexCommand } from './commands/index.js';
@@ -28,6 +29,7 @@ program.addCommand(planCommand);
2829
program.addCommand(ghCommand);
2930
program.addCommand(updateCommand);
3031
program.addCommand(statsCommand);
32+
program.addCommand(compactCommand);
3133
program.addCommand(cleanCommand);
3234

3335
// Show help if no command provided
packages/cli/src/commands/compact.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import { RepositoryIndexer } from '@lytics/dev-agent-core';
2+
import chalk from 'chalk';
3+
import { Command } from 'commander';
4+
import ora from 'ora';
5+
import { loadConfig } from '../utils/config.js';
6+
import { logger } from '../utils/logger.js';
7+
8+
/**
 * `compact` CLI command: optimizes the vector store behind the repository index.
 *
 * Flow: load config -> initialize a RepositoryIndexer -> read stats ->
 * call optimize() on the indexer's vector storage -> read stats again -> report.
 *
 * Exits with code 1 when no config is found, when no index stats are available,
 * or when any step throws. The indexer is closed on every path that opened it,
 * including the error path.
 *
 * Options:
 *   -v, --verbose  Also print before/after vector counts and, on failure, the stack trace.
 */
export const compactCommand = new Command('compact')
  .description('🗜️ Optimize and compact the vector store')
  .option('-v, --verbose', 'Show detailed optimization information', false)
  .action(async (options) => {
    const spinner = ora('Loading configuration...').start();

    // Tracks an indexer that still needs closing, so the catch block can
    // release it. Cleared *before* each explicit close() so a failing close()
    // is never retried from the error path.
    let openIndexer: RepositoryIndexer | undefined;

    try {
      // Load config
      const config = await loadConfig();
      if (!config) {
        spinner.fail('No config found');
        logger.error('Run "dev init" first to initialize the repository');
        process.exit(1);
        return;
      }

      spinner.text = 'Initializing indexer...';
      const indexer = new RepositoryIndexer(config);
      openIndexer = indexer;
      await indexer.initialize();

      // Get stats before optimization
      const statsBefore = await indexer.getStats();
      if (!statsBefore) {
        spinner.fail('No index found');
        logger.error('Run "dev index" first to index the repository');
        openIndexer = undefined;
        await indexer.close();
        process.exit(1);
        return;
      }

      spinner.text = 'Optimizing vector store...';
      const startTime = Date.now();

      // RepositoryIndexer exposes no public optimize hook here, so reach into
      // its private vectorStorage property.
      // @ts-expect-error - accessing private property for optimization
      await indexer.vectorStorage.optimize();

      const duration = ((Date.now() - startTime) / 1000).toFixed(2);

      // Get stats after optimization
      const statsAfter = await indexer.getStats();

      openIndexer = undefined;
      await indexer.close();

      spinner.succeed(chalk.green('Vector store optimized successfully!'));

      // Show results
      logger.log('');
      logger.log(chalk.bold('Optimization Results:'));
      logger.log(`  ${chalk.cyan('Duration:')}         ${duration}s`);
      logger.log(`  ${chalk.cyan('Total documents:')}  ${statsAfter?.vectorsStored ?? 0}`);

      if (options.verbose) {
        logger.log('');
        logger.log(chalk.bold('Before Optimization:'));
        logger.log(`  ${chalk.cyan('Storage size:')}     ${statsBefore.vectorsStored} vectors`);
        logger.log('');
        logger.log(chalk.bold('After Optimization:'));
        logger.log(`  ${chalk.cyan('Storage size:')}     ${statsAfter?.vectorsStored ?? 0} vectors`);
      }

      logger.log('');
      logger.log(
        chalk.gray(
          'Optimization merges small data fragments, updates indices, and improves query performance.'
        )
      );
    } catch (error) {
      spinner.fail('Failed to optimize vector store');
      logger.error(error instanceof Error ? error.message : String(error));
      if (options.verbose && error instanceof Error && error.stack) {
        logger.debug(error.stack);
      }

      // process.exit() ends the process immediately, so release the indexer
      // first. Best-effort: a failure to close must not mask the original error.
      if (openIndexer) {
        try {
          await openIndexer.close();
        } catch (closeError) {
          logger.debug(
            `Failed to close indexer: ${closeError instanceof Error ? closeError.message : String(closeError)}`
          );
        }
      }

      process.exit(1);
    }
  });

packages/core/src/vector/index.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,18 @@ export class VectorStorage {
132132
};
133133
}
134134

135+
/**
 * Run optimization on the underlying store.
 *
 * Delegates to `this.store.optimize()`. Intended to be called after bulk
 * indexing operations.
 *
 * @throws Error if the storage has not been initialized.
 */
async optimize(): Promise<void> {
  if (this.initialized) {
    await this.store.optimize();
    return;
  }

  throw new Error('VectorStorage not initialized. Call initialize() first.');
}
146+
135147
/**
136148
* Close the storage
137149
*/

packages/core/src/vector/store.ts

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ export class LanceDBVectorStore implements VectorStore {
4343
}
4444

4545
/**
46-
* Add documents to the store
46+
* Add documents to the store using upsert (prevents duplicates)
4747
*/
4848
async add(documents: EmbeddingDocument[], embeddings: number[][]): Promise<void> {
4949
if (!this.connection) {
@@ -70,9 +70,16 @@ export class LanceDBVectorStore implements VectorStore {
7070
if (!this.table) {
7171
// Create table on first add
7272
this.table = await this.connection.createTable(this.tableName, data);
73+
// Create scalar index on 'id' column for fast upsert operations
74+
await this.ensureIdIndex();
7375
} else {
74-
// Add to existing table
75-
await this.table.add(data);
76+
// Use mergeInsert to prevent duplicates (upsert operation)
77+
// This updates existing documents with the same ID or inserts new ones
78+
await this.table
79+
.mergeInsert('id')
80+
.whenMatchedUpdateAll()
81+
.whenNotMatchedInsertAll()
82+
.execute(data);
7683
}
7784
} catch (error) {
7885
throw new Error(
@@ -192,6 +199,44 @@ export class LanceDBVectorStore implements VectorStore {
192199
}
193200
}
194201

202+
/**
 * Optimize the backing table (compact fragments, update indices).
 *
 * Does nothing when no table is open.
 *
 * @throws Error prefixed with "Failed to optimize:" when the table's
 *   optimize() call rejects.
 */
async optimize(): Promise<void> {
  const table = this.table;
  if (!table) {
    return;
  }

  try {
    await table.optimize();
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to optimize: ${reason}`);
  }
}
218+
219+
/**
 * Best-effort creation of an index on the 'id' column, which the upsert
 * (mergeInsert) path keys on.
 *
 * Does nothing when no table is open. A failing createIndex() call is
 * reported via console.warn and never rethrown.
 */
private async ensureIdIndex(): Promise<void> {
  const table = this.table;
  if (!table) {
    return;
  }

  try {
    // No index type is specified; the choice is left to LanceDB.
    await table.createIndex('id');
  } catch (error) {
    // The index may already exist, or this LanceDB version may not support
    // it. Either way the store keeps working, so warn rather than fail.
    const reason = error instanceof Error ? error.message : String(error);
    console.warn(`Could not create index on 'id' column: ${reason}`);
  }
}
239+
195240
/**
196241
* Close the store
197242
*/

packages/core/src/vector/types.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ export interface VectorStore {
9090
*/
9191
count(): Promise<number>;
9292

93+
/**
94+
* Optimize the store (compact fragments, update indices)
95+
*/
96+
optimize(): Promise<void>;
97+
9398
/**
9499
* Close the store
95100
*/

0 commit comments

Comments
 (0)