// diff --git a/packages/core/src/git/__tests__/indexer.test.ts b/packages/core/src/git/__tests__/indexer.test.ts
// new file mode 100644
// index 0000000..8c2dfed
// --- /dev/null
// +++ b/packages/core/src/git/__tests__/indexer.test.ts
// @@ -0,0 +1,318 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import type { VectorStorage } from '../../vector';
import type { SearchResult } from '../../vector/types';
import type { GitExtractor } from '../extractor';
import { GitIndexer } from '../indexer';
import type { GitCommit } from '../types';

/**
 * Builds a fully populated commit fixture.
 *
 * @param overrides - Fields that replace the defaults. The spread is shallow,
 *   so a nested object such as `refs` must be supplied whole.
 * @returns A commit touching two files and referencing issue 123 unless overridden.
 */
const createMockCommit = (overrides: Partial<GitCommit> = {}): GitCommit => ({
  hash: 'abc123def456789012345678901234567890abcd',
  shortHash: 'abc123d',
  message: 'feat: add new feature\n\nThis adds a great new feature.',
  subject: 'feat: add new feature',
  body: 'This adds a great new feature.',
  author: {
    name: 'Test User',
    email: 'test@example.com',
    date: '2025-01-15T10:00:00Z',
  },
  committer: {
    name: 'Test User',
    email: 'test@example.com',
    date: '2025-01-15T10:00:00Z',
  },
  files: [
    { path: 'src/feature.ts', status: 'added', additions: 50, deletions: 0 },
    { path: 'src/index.ts', status: 'modified', additions: 5, deletions: 2 },
  ],
  stats: {
    additions: 55,
    deletions: 2,
    filesChanged: 2,
  },
  refs: {
    branches: [],
    tags: [],
    issueRefs: [123],
    prRefs: [],
  },
  parents: ['parent123'],
  ...overrides,
});

/**
 * Unit tests for GitIndexer. The extractor and the vector storage are replaced
 * by vi.fn() mocks, so no git repository or embedding backend is touched.
 */
describe('GitIndexer', () => {
  let mockExtractor: GitExtractor;
  let mockVectorStorage: VectorStorage;
  let indexer: GitIndexer;

  beforeEach(() => {
    // Extractor mock: returns two commits by default; individual tests override it.
    mockExtractor = {
      getCommits: vi.fn().mockResolvedValue([
        createMockCommit(),
        createMockCommit({
          hash: 'def456abc789012345678901234567890abcdef',
          shortHash: 'def456a',
          subject: 'fix: resolve bug #456',
          body: 'Fixes the critical bug.',
          refs: { branches: [], tags: [], issueRefs: [456], prRefs: [] },
        }),
      ]),
      getCommit: vi.fn(),
      getBlame: vi.fn(),
      getRepositoryInfo: vi.fn(),
    };

    // Vector storage mock: writes succeed and searches return nothing by default.
    mockVectorStorage = {
      initialize: vi.fn().mockResolvedValue(undefined),
      addDocuments: vi.fn().mockResolvedValue(undefined),
      search: vi.fn().mockResolvedValue([]),
      getDocument: vi.fn(),
      deleteDocuments: vi.fn(),
      getStats: vi.fn(),
      optimize: vi.fn(),
      close: vi.fn(),
    } as unknown as VectorStorage;

    // batchSize 10 is relied upon by the batching test below.
    indexer = new GitIndexer({
      extractor: mockExtractor,
      vectorStorage: mockVectorStorage,
      commitLimit: 100,
      batchSize: 10,
    });
  });

  describe('index', () => {
    it('should extract and index commits', async () => {
      const result = await indexer.index();

      expect(mockExtractor.getCommits).toHaveBeenCalledWith({
        limit: 100,
        since: undefined,
        until: undefined,
        author: undefined,
        noMerges: true,
      });

      expect(mockVectorStorage.addDocuments).toHaveBeenCalled();
      expect(result.commitsIndexed).toBe(2);
      expect(result.errors).toHaveLength(0);
    });

    it('should respect limit option', async () => {
      await indexer.index({ limit: 50 });

      expect(mockExtractor.getCommits).toHaveBeenCalledWith(expect.objectContaining({ limit: 50 }));
    });

    it('should pass date filters to extractor', async () => {
      await indexer.index({
        since: '2025-01-01',
        until: '2025-01-31',
      });

      expect(mockExtractor.getCommits).toHaveBeenCalledWith(
        expect.objectContaining({
          since: '2025-01-01',
          until: '2025-01-31',
        })
      );
    });

    it('should pass author filter to extractor', async () => {
      await indexer.index({ author: 'test@example.com' });

      expect(mockExtractor.getCommits).toHaveBeenCalledWith(
        expect.objectContaining({ author: 'test@example.com' })
      );
    });

    it('should handle empty repository', async () => {
      vi.mocked(mockExtractor.getCommits).mockResolvedValue([]);

      const result = await indexer.index();

      expect(result.commitsIndexed).toBe(0);
      expect(mockVectorStorage.addDocuments).not.toHaveBeenCalled();
    });

    it('should handle extraction errors', async () => {
      vi.mocked(mockExtractor.getCommits).mockRejectedValue(new Error('Git error'));

      const result = await indexer.index();

      expect(result.commitsIndexed).toBe(0);
      expect(result.errors).toHaveLength(1);
      expect(result.errors[0]).toContain('Git error');
    });

    it('should handle storage errors gracefully', async () => {
      vi.mocked(mockVectorStorage.addDocuments).mockRejectedValue(new Error('Storage error'));

      const result = await indexer.index();

      // Two commits fit in a single batch of 10, hence exactly one failure.
      expect(result.errors).toHaveLength(1);
      expect(result.errors[0]).toContain('Storage error');
    });

    it('should report progress', async () => {
      const progressUpdates: Array<{ phase: string; percentComplete: number }> = [];

      await indexer.index({
        onProgress: (progress) => {
          progressUpdates.push({
            phase: progress.phase,
            percentComplete: progress.percentComplete,
          });
        },
      });

      expect(progressUpdates).toContainEqual(expect.objectContaining({ phase: 'extracting' }));
      expect(progressUpdates).toContainEqual(expect.objectContaining({ phase: 'embedding' }));
      expect(progressUpdates).toContainEqual(expect.objectContaining({ phase: 'storing' }));
      expect(progressUpdates).toContainEqual(
        expect.objectContaining({ phase: 'complete', percentComplete: 100 })
      );
    });

    it('should batch documents correctly', async () => {
      // Create many commits
      const manyCommits = Array.from({ length: 25 }, (_, i) =>
        createMockCommit({
          hash: `hash${i.toString().padStart(38, '0')}`,
          shortHash: `h${i}`,
          subject: `Commit ${i}`,
        })
      );
      vi.mocked(mockExtractor.getCommits).mockResolvedValue(manyCommits);

      await indexer.index();

      // With batchSize 10, 25 commits should result in 3 batches
      expect(mockVectorStorage.addDocuments).toHaveBeenCalledTimes(3);
    });
  });

  describe('search', () => {
    it('should search for commits by semantic query', async () => {
      const mockCommit = createMockCommit();
      vi.mocked(mockVectorStorage.search).mockResolvedValue([
        {
          id: `commit:${mockCommit.hash}`,
          score: 0.9,
          metadata: {
            type: 'commit',
            hash: mockCommit.hash,
            _commit: mockCommit,
          },
        } as SearchResult,
      ]);

      const results = await indexer.search('add new feature');

      expect(mockVectorStorage.search).toHaveBeenCalledWith('add new feature', {
        limit: 10,
        scoreThreshold: 0,
        filter: { type: 'commit' },
      });
      expect(results).toHaveLength(1);
      expect(results[0].hash).toBe(mockCommit.hash);
    });

    it('should respect limit option', async () => {
      await indexer.search('query', { limit: 5 });

      expect(mockVectorStorage.search).toHaveBeenCalledWith(
        'query',
        expect.objectContaining({ limit: 5 })
      );
    });

    it('should filter out results without commit metadata', async () => {
      vi.mocked(mockVectorStorage.search).mockResolvedValue([
        {
          id: 'commit:abc',
          score: 0.9,
          metadata: { type: 'commit' }, // Missing _commit
        } as SearchResult,
      ]);

      const results = await indexer.search('query');

      expect(results).toHaveLength(0);
    });
  });

  describe('getFileHistory', () => {
    it('should get history for a specific file', async () => {
      const mockCommits = [createMockCommit()];
      vi.mocked(mockExtractor.getCommits).mockResolvedValue(mockCommits);

      const results = await indexer.getFileHistory('src/feature.ts');

      expect(mockExtractor.getCommits).toHaveBeenCalledWith({
        path: 'src/feature.ts',
        limit: 20,
        follow: true,
        noMerges: true,
      });
      expect(results).toEqual(mockCommits);
    });

    it('should respect limit option', async () => {
      await indexer.getFileHistory('src/file.ts', { limit: 5 });

      expect(mockExtractor.getCommits).toHaveBeenCalledWith(expect.objectContaining({ limit: 5 }));
    });
  });

  describe('document preparation', () => {
    it('should create proper document structure', async () => {
      await indexer.index();

      // Inspect the first batch handed to the storage mock.
      const addCall = vi.mocked(mockVectorStorage.addDocuments).mock.calls[0];
      const documents = addCall[0];

      expect(documents[0]).toMatchObject({
        id: expect.stringMatching(/^commit:/),
        text: expect.stringContaining('feat: add new feature'),
        metadata: expect.objectContaining({
          type: 'commit',
          hash: expect.any(String),
          shortHash: expect.any(String),
          subject: expect.any(String),
          author: expect.any(String),
          authorEmail: expect.any(String),
          date: expect.any(String),
          filesChanged: expect.any(Number),
          additions: expect.any(Number),
          deletions: expect.any(Number),
          issueRefs: expect.any(Array),
          prRefs: expect.any(Array),
          _commit: expect.any(Object),
        }),
      });
    });

    it('should include file paths in text for better search', async () => {
      await indexer.index();

      const addCall = vi.mocked(mockVectorStorage.addDocuments).mock.calls[0];
      const documents = addCall[0];

      expect(documents[0].text).toContain('src/feature.ts');
      expect(documents[0].text).toContain('src/index.ts');
    });

    it('should include issue refs in metadata', async () => {
      await indexer.index();

      const addCall = vi.mocked(mockVectorStorage.addDocuments).mock.calls[0];
      const documents = addCall[0];

      expect(documents[0].metadata.issueRefs).toContain(123);
    });
  });
});

// diff --git a/packages/core/src/git/index.ts b/packages/core/src/git/index.ts
// index 69f3140..4eb34c5 100644
// --- a/packages/core/src/git/index.ts
// +++ b/packages/core/src/git/index.ts
// @@ -1,8 +1,9 @@
// Removed by this hunk:
// - * Provides git history extraction and types for semantic search.
/**
 * Git Module
 *
 * Provides git history extraction, indexing, and types for semantic search.
 */

export * from './extractor';
export * from './indexer';
export * from './types';

// diff --git a/packages/core/src/git/indexer.ts b/packages/core/src/git/indexer.ts
// new file mode 100644
// index 0000000..7027f9a
// --- /dev/null
// +++ b/packages/core/src/git/indexer.ts
// @@ -0,0 +1,294 @@
/**
 * Git Indexer
 *
 * Indexes git commits into the vector store for semantic search.
/**
 * Git indexer module: extracts commits through a GitExtractor, converts them
 * into embedding documents, and writes them to a VectorStorage in batches so
 * they can be found by semantic search.
 */

import type { VectorStorage } from '../vector';
import type { EmbeddingDocument } from '../vector/types';
import type { GitExtractor } from './extractor';
import type { GetCommitsOptions, GitCommit, GitIndexResult } from './types';

/**
 * Configuration for the git indexer
 */
export interface GitIndexerConfig {
  /** Git extractor instance */
  extractor: GitExtractor;
  /** Vector storage instance */
  vectorStorage: VectorStorage;
  /** Maximum commits to index (default: 1000) */
  commitLimit?: number;
  /**
   * Batch size for embedding (default: 32). Fractional values are floored;
   * values below 1 (or NaN) fall back to the default.
   */
  batchSize?: number;
}

/**
 * Options for indexing git commits
 */
export interface GitIndexOptions {
  /** Maximum commits to index (overrides config) */
  limit?: number;
  /** Only index commits after this date */
  since?: string;
  /** Only index commits before this date */
  until?: string;
  /** Filter by author email */
  author?: string;
  /** Exclude merge commits (default: true) */
  noMerges?: boolean;
  /** Progress callback */
  onProgress?: (progress: GitIndexProgress) => void;
}

/**
 * Progress information for git indexing
 */
export interface GitIndexProgress {
  phase: 'extracting' | 'embedding' | 'storing' | 'complete';
  commitsProcessed: number;
  totalCommits: number;
  percentComplete: number;
}

/**
 * Document type marker for commits
 */
const COMMIT_DOC_TYPE = 'commit';

/** Commit limit used when the config does not provide one. */
const DEFAULT_COMMIT_LIMIT = 1000;

/** Batch size used when the config provides none or an unusable one. */
const DEFAULT_BATCH_SIZE = 32;

/**
 * Turns a caught value into a message: `Error#message` for Error instances,
 * `String(value)` for anything else.
 */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Git Indexer - indexes commits for semantic search
 */
export class GitIndexer {
  private readonly extractor: GitExtractor;
  private readonly vectorStorage: VectorStorage;
  private readonly commitLimit: number;
  private readonly batchSize: number;

  /**
   * @param config - Extractor, storage, and optional limits. `commitLimit`
   *   defaults to 1000 and `batchSize` to 32.
   */
  constructor(config: GitIndexerConfig) {
    this.extractor = config.extractor;
    this.vectorStorage = config.vectorStorage;
    this.commitLimit = config.commitLimit ?? DEFAULT_COMMIT_LIMIT;

    // The batching loop in index() advances by batchSize, so 0, a negative
    // number or NaN would keep it from ever terminating. Normalise once here.
    const requestedBatchSize = config.batchSize ?? DEFAULT_BATCH_SIZE;
    this.batchSize =
      requestedBatchSize >= 1 ? Math.floor(requestedBatchSize) : DEFAULT_BATCH_SIZE;
  }

  /**
   * Index git commits into the vector store.
   *
   * Failures are collected in the result's `errors` rather than thrown: an
   * extraction failure ends the run with zero commits indexed, while a failed
   * storage batch is recorded and the remaining batches are still attempted.
   *
   * @param options - Filters, limit override, and optional progress callback.
   * @returns Number of commits stored, elapsed milliseconds, and error messages.
   */
  async index(options: GitIndexOptions = {}): Promise<GitIndexResult> {
    const startTime = Date.now();
    const errors: string[] = [];

    const limit = options.limit ?? this.commitLimit;
    const onProgress = options.onProgress;

    // Phase 1: Extract commits
    onProgress?.({
      phase: 'extracting',
      commitsProcessed: 0,
      totalCommits: 0,
      percentComplete: 0,
    });

    const extractOptions: GetCommitsOptions = {
      limit,
      since: options.since,
      until: options.until,
      author: options.author,
      noMerges: options.noMerges ?? true,
    };

    let commits: GitCommit[];
    try {
      commits = await this.extractor.getCommits(extractOptions);
    } catch (error) {
      errors.push(`Failed to extract commits: ${describeError(error)}`);
      return {
        commitsIndexed: 0,
        durationMs: Date.now() - startTime,
        errors,
      };
    }

    // Nothing to index: report completion without touching the storage.
    if (commits.length === 0) {
      onProgress?.({
        phase: 'complete',
        commitsProcessed: 0,
        totalCommits: 0,
        percentComplete: 100,
      });
      return {
        commitsIndexed: 0,
        durationMs: Date.now() - startTime,
        errors,
      };
    }

    // Phase 2: Prepare documents for embedding
    onProgress?.({
      phase: 'embedding',
      commitsProcessed: 0,
      totalCommits: commits.length,
      percentComplete: 25,
    });

    const documents = this.prepareCommitDocuments(commits);

    // Phase 3: Store in batches
    onProgress?.({
      phase: 'storing',
      commitsProcessed: 0,
      totalCommits: commits.length,
      percentComplete: 50,
    });

    let commitsIndexed = 0;
    for (let i = 0; i < documents.length; i += this.batchSize) {
      const batch = documents.slice(i, i + this.batchSize);

      try {
        await this.vectorStorage.addDocuments(batch);
        commitsIndexed += batch.length;

        // Storing covers the 50-100% range of the overall progress.
        onProgress?.({
          phase: 'storing',
          commitsProcessed: commitsIndexed,
          totalCommits: commits.length,
          percentComplete: 50 + (commitsIndexed / commits.length) * 50,
        });
      } catch (error) {
        // Zero-based batch number; later batches are still attempted.
        errors.push(`Failed to store batch ${i / this.batchSize}: ${describeError(error)}`);
      }
    }

    // Phase 4: Complete
    onProgress?.({
      phase: 'complete',
      commitsProcessed: commitsIndexed,
      totalCommits: commits.length,
      percentComplete: 100,
    });

    return {
      commitsIndexed,
      durationMs: Date.now() - startTime,
      errors,
    };
  }

  /**
   * Search for commits by semantic query.
   *
   * @param query - Text passed to the vector storage search.
   * @param options - `limit` (default 10) and `scoreThreshold` (default 0).
   * @returns The commits stored under `metadata._commit` of each hit; hits
   *   without that field are dropped.
   */
  async search(
    query: string,
    options: { limit?: number; scoreThreshold?: number } = {}
  ): Promise<GitCommit[]> {
    const results = await this.vectorStorage.search(query, {
      limit: options.limit ?? 10,
      scoreThreshold: options.scoreThreshold ?? 0,
      filter: { type: COMMIT_DOC_TYPE },
    });

    // Extract commits from metadata
    return results.flatMap((result) => {
      const commit = result.metadata._commit as GitCommit | undefined;
      return commit ? [commit] : [];
    });
  }

  /**
   * Get commits for a specific file.
   *
   * Queries the extractor directly (the vector index is not consulted), with
   * `follow: true` and merge commits excluded.
   *
   * @param filePath - Path handed to the extractor as `path`.
   * @param options - `limit` (default 20).
   */
  async getFileHistory(
    filePath: string,
    options: { limit?: number } = {}
  ): Promise<GitCommit[]> {
    return this.extractor.getCommits({
      path: filePath,
      limit: options.limit ?? 20,
      follow: true,
      noMerges: true,
    });
  }

  /**
   * Get commit count in the index.
   *
   * Approximate: it counts the hits of a search for the word 'commit' limited
   * to 10000 results, so it can never report more than 10000. A dedicated
   * count method on the storage would be the accurate way to do this.
   */
  async getIndexedCommitCount(): Promise<number> {
    const results = await this.vectorStorage.search('commit', {
      limit: 10000,
      filter: { type: COMMIT_DOC_TYPE },
    });
    return results.length;
  }

  /**
   * Prepare commit documents for embedding.
   *
   * The text is subject, body, and space-joined file paths separated by blank
   * lines (empty parts are skipped). The id is `commit:<hash>`, and the full
   * commit is kept in `metadata._commit` for search() to return.
   */
  private prepareCommitDocuments(commits: GitCommit[]): EmbeddingDocument[] {
    return commits.map((commit) => {
      // Create a rich text representation for embedding
      const textParts = [
        commit.subject,
        commit.body,
        // Include file paths for context
        commit.files.map((f) => f.path).join(' '),
      ].filter(Boolean);

      const text = textParts.join('\n\n');

      // Create unique ID from commit hash
      const id = `commit:${commit.hash}`;

      return {
        id,
        text,
        metadata: {
          type: COMMIT_DOC_TYPE,
          hash: commit.hash,
          shortHash: commit.shortHash,
          subject: commit.subject,
          author: commit.author.name,
          authorEmail: commit.author.email,
          date: commit.author.date,
          filesChanged: commit.stats.filesChanged,
          additions: commit.stats.additions,
          deletions: commit.stats.deletions,
          issueRefs: commit.refs.issueRefs,
          prRefs: commit.refs.prRefs,
          // Store full commit for retrieval
          _commit: commit,
        },
      };
    });
  }
}

/**
 * Create a git indexer with default configuration.
 *
 * @param repositoryPath - Passed to the `LocalGitExtractor` constructor.
 * @param vectorStorage - Storage the indexer writes to and searches.
 * @param options - Optional overrides spread over the defaults; because they
 *   are spread last, an `extractor` or `vectorStorage` given here wins.
 */
export function createGitIndexer(
  repositoryPath: string,
  vectorStorage: VectorStorage,
  options: Partial<GitIndexerConfig> = {}
): GitIndexer {
  // Loaded lazily to avoid a circular dependency.
  // NOTE(review): `require` does not exist in native ES modules - confirm the
  // build emits CommonJS, or switch to a static import if the cycle is gone.
  const { LocalGitExtractor } = require('./extractor') as {
    LocalGitExtractor: typeof import('./extractor').LocalGitExtractor;
  };

  const extractor = new LocalGitExtractor(repositoryPath);

  return new GitIndexer({
    extractor,
    vectorStorage,
    ...options,
  });
}