Skip to content

Commit 6ae4588

Browse files
committed
feat(core,cli): implement LanceDB upsert and compact command
- Update LanceDBVectorStore to use mergeInsert API for upsert operations
- Add scalar index on 'id' column to speed up merge operations
- Implement optimize() method for vector store compaction
- Add new 'dev compact' CLI command for database optimization
- Prevent duplicate entries when re-indexing repository

This resolves the duplicate search results issue by using LanceDB's merge_insert API instead of simple append operations. The compact command allows users to optimize their vector store by merging fragments and updating indices for better query performance.
1 parent 10dc1e4 commit 6ae4588

File tree

7 files changed

+209
-22
lines changed

7 files changed

+209
-22
lines changed

packages/cli/src/cli.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import chalk from 'chalk';
44
import { Command } from 'commander';
55
import { cleanCommand } from './commands/clean.js';
6+
import { compactCommand } from './commands/compact.js';
67
import { exploreCommand } from './commands/explore.js';
78
import { ghCommand } from './commands/gh.js';
89
import { indexCommand } from './commands/index.js';
@@ -28,6 +29,7 @@ program.addCommand(planCommand);
2829
program.addCommand(ghCommand);
2930
program.addCommand(updateCommand);
3031
program.addCommand(statsCommand);
32+
program.addCommand(compactCommand);
3133
program.addCommand(cleanCommand);
3234

3335
// Show help if no command provided
packages/cli/src/commands/compact.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import { RepositoryIndexer } from '@lytics/dev-agent-core';
2+
import chalk from 'chalk';
3+
import { Command } from 'commander';
4+
import ora from 'ora';
5+
import { loadConfig } from '../utils/config.js';
6+
import { logger } from '../utils/logger.js';
7+
8+
/**
 * `compact` CLI command.
 *
 * Loads the project config, initializes a RepositoryIndexer, calls
 * `optimize()` on its vector storage, and prints the elapsed time plus the
 * stored-vector count. With `--verbose` the before/after counts are printed
 * too, and the stack trace is logged at debug level on failure.
 *
 * The process exits with code 1 when no config is found, when no index
 * exists, or when any step throws.
 */
export const compactCommand = new Command('compact')
  .description('🗜️ Optimize and compact the vector store')
  .option('-v, --verbose', 'Show detailed optimization information', false)
  .action(async (options) => {
    const spinner = ora('Loading configuration...').start();
    // Declared outside the try block so the failure path can still release it.
    let indexer: RepositoryIndexer | undefined;

    try {
      // Load config
      const config = await loadConfig();
      if (!config) {
        spinner.fail('No config found');
        logger.error('Run "dev init" first to initialize the repository');
        process.exit(1);
        return;
      }

      spinner.text = 'Initializing indexer...';
      indexer = new RepositoryIndexer(config);
      await indexer.initialize();

      // Get stats before optimization
      const statsBefore = await indexer.getStats();
      if (!statsBefore) {
        spinner.fail('No index found');
        logger.error('Run "dev index" first to index the repository');
        await indexer.close();
        // Already closed: make sure the catch handler does not close it again.
        indexer = undefined;
        process.exit(1);
        return;
      }

      spinner.text = 'Optimizing vector store...';
      const startTime = Date.now();

      // Access the internal vector storage and call optimize
      // We need to access the private vectorStorage property
      // @ts-expect-error - accessing private property for optimization
      await indexer.vectorStorage.optimize();

      const duration = ((Date.now() - startTime) / 1000).toFixed(2);

      // Get stats after optimization
      const statsAfter = await indexer.getStats();

      await indexer.close();
      // Already closed: make sure the catch handler does not close it again.
      indexer = undefined;

      spinner.succeed(chalk.green('Vector store optimized successfully!'));

      // Show results
      logger.log('');
      logger.log(chalk.bold('Optimization Results:'));
      logger.log(` ${chalk.cyan('Duration:')} ${duration}s`);
      logger.log(` ${chalk.cyan('Total documents:')} ${statsAfter?.vectorsStored ?? 0}`);

      if (options.verbose) {
        logger.log('');
        logger.log(chalk.bold('Before Optimization:'));
        logger.log(` ${chalk.cyan('Storage size:')} ${statsBefore.vectorsStored} vectors`);
        logger.log('');
        logger.log(chalk.bold('After Optimization:'));
        logger.log(` ${chalk.cyan('Storage size:')} ${statsAfter?.vectorsStored ?? 0} vectors`);
      }

      logger.log('');
      logger.log(
        chalk.gray(
          'Optimization merges small data fragments, updates indices, and improves query performance.'
        )
      );
    } catch (error) {
      spinner.fail('Failed to optimize vector store');
      logger.error(error instanceof Error ? error.message : String(error));
      if (options.verbose && error instanceof Error && error.stack) {
        logger.debug(error.stack);
      }

      // Release the indexer if a step failed while it was still open.
      // Best-effort: a failure to close must not mask the original error.
      if (indexer) {
        try {
          await indexer.close();
        } catch (closeError) {
          logger.debug(closeError instanceof Error ? closeError.message : String(closeError));
        }
      }

      process.exit(1);
    }
  });

packages/core/src/vector/index.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,18 @@ export class VectorStorage {
132132
};
133133
}
134134

135+
/**
 * Optimize the vector store (compact fragments, update indices).
 * Intended to be called after bulk indexing operations.
 *
 * Delegates to the underlying store's `optimize()`.
 *
 * @throws Error if the storage has not been initialized yet.
 */
async optimize(): Promise<void> {
  if (this.initialized) {
    await this.store.optimize();
    return;
  }

  throw new Error('VectorStorage not initialized. Call initialize() first.');
}
146+
135147
/**
136148
* Close the storage
137149
*/

packages/core/src/vector/store.ts

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ export class LanceDBVectorStore implements VectorStore {
4343
}
4444

4545
/**
46-
* Add documents to the store
46+
* Add documents to the store using upsert (prevents duplicates)
4747
*/
4848
async add(documents: EmbeddingDocument[], embeddings: number[][]): Promise<void> {
4949
if (!this.connection) {
@@ -70,9 +70,16 @@ export class LanceDBVectorStore implements VectorStore {
7070
if (!this.table) {
7171
// Create table on first add
7272
this.table = await this.connection.createTable(this.tableName, data);
73+
// Create scalar index on 'id' column for fast upsert operations
74+
await this.ensureIdIndex();
7375
} else {
74-
// Add to existing table
75-
await this.table.add(data);
76+
// Use mergeInsert to prevent duplicates (upsert operation)
77+
// This updates existing documents with the same ID or inserts new ones
78+
await this.table
79+
.mergeInsert('id')
80+
.whenMatchedUpdateAll()
81+
.whenNotMatchedInsertAll()
82+
.execute(data);
7683
}
7784
} catch (error) {
7885
throw new Error(
@@ -192,6 +199,44 @@ export class LanceDBVectorStore implements VectorStore {
192199
}
193200
}
194201

202+
/**
 * Optimize the vector store (compact fragments, update indices) by calling
 * `optimize()` on the current table.
 *
 * Does nothing when no table is set.
 *
 * @throws Error prefixed with "Failed to optimize: " when the table's
 *   optimize call rejects.
 */
async optimize(): Promise<void> {
  const table = this.table;
  if (table) {
    try {
      await table.optimize();
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to optimize: ${reason}`);
    }
  }
}
218+
219+
/**
 * Request an index on the 'id' column, which is the key used by the
 * mergeInsert upsert path.
 *
 * Best-effort: does nothing when no table is set, and a failure to create
 * the index (for example because it already exists or is unsupported) is
 * reported with console.warn instead of being thrown.
 */
private async ensureIdIndex(): Promise<void> {
  if (this.table) {
    try {
      // No index type is passed; the choice is left to LanceDB.
      await this.table.createIndex('id');
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Could not create index on 'id' column: ${reason}`);
    }
  }
}
239+
195240
/**
196241
* Close the store
197242
*/

packages/core/src/vector/types.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ export interface VectorStore {
9090
*/
9191
count(): Promise<number>;
9292

93+
/**
94+
* Optimize the store (compact fragments, update indices)
95+
*/
96+
optimize(): Promise<void>;
97+
9398
/**
9499
* Close the store
95100
*/

packages/mcp-server/bin/dev-agent-mcp.ts

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
*/
66

77
import { RepositoryIndexer } from '@lytics/dev-agent-core';
8-
import { GitHubIndexer } from '@lytics/dev-agent-subagents';
98
import {
109
ExploreAdapter,
1110
GitHubAdapter,
@@ -31,16 +30,6 @@ async function main() {
3130

3231
await indexer.initialize();
3332

34-
// Initialize GitHub indexer
35-
const githubIndexer = new GitHubIndexer({
36-
vectorStorePath: `${repositoryPath}/.dev-agent/github-vectors.lance`,
37-
statePath: `${repositoryPath}/.dev-agent/github-state.json`,
38-
autoUpdate: false, // Don't auto-update on server start
39-
});
40-
41-
// Initialize GitHub indexer (lazy - will be ready when first used)
42-
await githubIndexer.initialize();
43-
4433
// Create and register adapters
4534
const searchAdapter = new SearchAdapter({
4635
repositoryIndexer: indexer,
@@ -72,7 +61,9 @@ async function main() {
7261

7362
const githubAdapter = new GitHubAdapter({
7463
repositoryPath,
75-
githubIndexer,
64+
// GitHubIndexer will be lazily initialized on first use
65+
vectorStorePath: `${repositoryPath}/.dev-agent/github-vectors.lance`,
66+
statePath: `${repositoryPath}/.dev-agent/github-state.json`,
7667
defaultLimit: 10,
7768
defaultFormat: 'compact',
7869
});
@@ -95,7 +86,10 @@ async function main() {
9586
const shutdown = async () => {
9687
await server.stop();
9788
await indexer.close();
98-
await githubIndexer.close();
89+
// Close GitHub adapter if initialized
90+
if (githubAdapter.githubIndexer) {
91+
await githubAdapter.githubIndexer.close();
92+
}
9993
process.exit(0);
10094
};
10195

packages/mcp-server/src/adapters/built-in/github-adapter.ts

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@ import type { AdapterContext, ToolDefinition, ToolExecutionContext, ToolResult }
1414

1515
export interface GitHubAdapterConfig {
1616
repositoryPath: string;
17-
githubIndexer: GitHubIndexer;
17+
// Either pass an initialized indexer OR paths for lazy initialization
18+
githubIndexer?: GitHubIndexer;
19+
vectorStorePath?: string;
20+
statePath?: string;
1821
defaultLimit?: number;
1922
defaultFormat?: 'compact' | 'verbose';
2023
}
@@ -33,26 +36,65 @@ export class GitHubAdapter extends ToolAdapter {
3336
};
3437

3538
private repositoryPath: string;
36-
private githubIndexer: GitHubIndexer;
39+
public githubIndexer?: GitHubIndexer; // Public for cleanup in shutdown
40+
private vectorStorePath?: string;
41+
private statePath?: string;
3742
private defaultLimit: number;
3843
private defaultFormat: 'compact' | 'verbose';
3944

4045
/**
 * Build the adapter from its configuration.
 *
 * Accepts either an already created `githubIndexer`, or both
 * `vectorStorePath` and `statePath` so the indexer can be created later.
 * `defaultLimit` falls back to 10 and `defaultFormat` to 'compact'.
 *
 * @throws Error when neither an indexer nor both paths are supplied.
 */
constructor(config: GitHubAdapterConfig) {
  super();

  const { repositoryPath, githubIndexer, vectorStorePath, statePath } = config;
  this.repositoryPath = repositoryPath;
  this.githubIndexer = githubIndexer;
  this.vectorStorePath = vectorStorePath;
  this.statePath = statePath;
  this.defaultLimit = config.defaultLimit ?? 10;
  this.defaultFormat = config.defaultFormat ?? 'compact';

  // Validate: either githubIndexer OR both paths must be provided
  const hasBothPaths = Boolean(vectorStorePath && statePath);
  if (!githubIndexer && !hasBothPaths) {
    throw new Error(
      'GitHubAdapter requires either githubIndexer or both vectorStorePath and statePath'
    );
  }
}
4761

4862
/**
 * Adapter lifecycle hook: logs the adapter's settings.
 * `lazyInit` is true when no GitHubIndexer instance is set yet.
 */
async initialize(context: AdapterContext): Promise<void> {
  const lazyInit = !this.githubIndexer;
  const { repositoryPath, defaultLimit, defaultFormat } = this;

  context.logger.info('GitHubAdapter initialized', {
    repositoryPath,
    defaultLimit,
    defaultFormat,
    lazyInit,
  });
}
5570

71+
/**
 * Lazy initialization of GitHubIndexer.
 * Only creates the indexer when first needed; later calls return the cached
 * instance.
 *
 * @returns The cached indexer, or a newly created and initialized one.
 * @throws Error when no indexer is set and the paths needed for lazy
 *   initialization are missing. Errors from the indexer's `initialize()`
 *   propagate to the caller.
 */
private async ensureGitHubIndexer(): Promise<GitHubIndexer> {
  if (this.githubIndexer) {
    return this.githubIndexer;
  }

  // Validate paths are available
  if (!this.vectorStorePath || !this.statePath) {
    throw new Error('GitHubAdapter not configured for lazy initialization');
  }

  // Lazy initialization
  const { GitHubIndexer: GitHubIndexerClass } = await import('@lytics/dev-agent-subagents');

  const indexer = new GitHubIndexerClass({
    vectorStorePath: this.vectorStorePath,
    statePath: this.statePath,
    autoUpdate: false,
  });

  // Cache the instance only after initialize() has resolved. If it were
  // cached first, a failed initialization would leave an unusable indexer
  // behind, and the early return above would hand it out on every later call.
  await indexer.initialize();
  this.githubIndexer = indexer;
  return indexer;
}
97+
5698
getToolDefinition(): ToolDefinition {
5799
return {
58100
name: 'dev_gh',
@@ -260,7 +302,8 @@ export class GitHubAdapter extends ToolAdapter {
260302
options: GitHubSearchOptions,
261303
format: string
262304
): Promise<string> {
263-
const results = await this.githubIndexer.search(query, options);
305+
const indexer = await this.ensureGitHubIndexer();
306+
const results = await indexer.search(query, options);
264307

265308
if (results.length === 0) {
266309
return '## GitHub Search Results\n\nNo matching issues or PRs found. Try:\n- Using different keywords\n- Removing filters (type, state, labels)\n- Re-indexing GitHub data with "dev gh index"';
@@ -278,7 +321,8 @@ export class GitHubAdapter extends ToolAdapter {
278321
*/
279322
private async getContext(number: number, format: string): Promise<string> {
280323
// Search for the specific issue/PR
281-
const results = await this.githubIndexer.search(`#${number}`, { limit: 1 });
324+
const indexer = await this.ensureGitHubIndexer();
325+
const results = await indexer.search(`#${number}`, { limit: 1 });
282326

283327
if (results.length === 0) {
284328
throw new Error(`Issue/PR #${number} not found`);
@@ -298,7 +342,8 @@ export class GitHubAdapter extends ToolAdapter {
298342
*/
299343
private async getRelated(number: number, limit: number, format: string): Promise<string> {
300344
// First get the main issue/PR
301-
const mainResults = await this.githubIndexer.search(`#${number}`, { limit: 1 });
345+
const indexer = await this.ensureGitHubIndexer();
346+
const mainResults = await indexer.search(`#${number}`, { limit: 1 });
302347

303348
if (mainResults.length === 0) {
304349
throw new Error(`Issue/PR #${number} not found`);
@@ -307,7 +352,7 @@ export class GitHubAdapter extends ToolAdapter {
307352
const mainDoc = mainResults[0].document;
308353

309354
// Search for related items using the title
310-
const relatedResults = await this.githubIndexer.search(mainDoc.title, { limit: limit + 1 });
355+
const relatedResults = await indexer.search(mainDoc.title, { limit: limit + 1 });
311356

312357
// Filter out the main item itself
313358
const related = relatedResults.filter((r) => r.document.number !== number).slice(0, limit);

0 commit comments

Comments
 (0)