diff --git a/packages/core/src/git/__tests__/extractor.test.ts b/packages/core/src/git/__tests__/extractor.test.ts
new file mode 100644
index 0000000..7c89858
--- /dev/null
+++ b/packages/core/src/git/__tests__/extractor.test.ts
@@ -0,0 +1,249 @@
+import { execSync } from 'node:child_process';
+import * as fs from 'node:fs';
+import * as os from 'node:os';
+import * as path from 'node:path';
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+import { LocalGitExtractor } from '../extractor';
+
+describe('LocalGitExtractor', () => {
+  let testRepoPath: string;
+  let extractor: LocalGitExtractor;
+
+  beforeAll(() => {
+    // Create a temporary git repository for testing
+    testRepoPath = fs.mkdtempSync(path.join(os.tmpdir(), 'git-extractor-test-'));
+
+    // Initialize git repo
+    execSync('git init', { cwd: testRepoPath, stdio: 'pipe' });
+    execSync('git config user.email "test@example.com"', { cwd: testRepoPath, stdio: 'pipe' });
+    execSync('git config user.name "Test User"', { cwd: testRepoPath, stdio: 'pipe' });
+
+    // Create initial commit
+    fs.writeFileSync(path.join(testRepoPath, 'README.md'), '# Test Repo\n');
+    execSync('git add README.md', { cwd: testRepoPath, stdio: 'pipe' });
+    execSync('git commit -m "Initial commit"', { cwd: testRepoPath, stdio: 'pipe' });
+
+    // Create a second commit with issue reference
+    fs.writeFileSync(path.join(testRepoPath, 'file1.ts'), 'export const x = 1;\n');
+    execSync('git add file1.ts', { cwd: testRepoPath, stdio: 'pipe' });
+    execSync('git commit -m "feat: add file1 #123"', { cwd: testRepoPath, stdio: 'pipe' });
+
+    // Create a third commit with PR reference
+    fs.writeFileSync(path.join(testRepoPath, 'file2.ts'), 'export const y = 2;\n');
+    execSync('git add file2.ts', { cwd: testRepoPath, stdio: 'pipe' });
+    execSync('git commit -m "fix: bug fix PR #456"', { cwd: testRepoPath, stdio: 'pipe' });
+
+    // Create a fourth commit modifying existing file
+    fs.appendFileSync(path.join(testRepoPath, 'file1.ts'), 'export const z = 3;\n');
+    execSync('git add file1.ts', { cwd: testRepoPath, stdio: 'pipe' });
+    execSync('git commit -m "refactor: update file1"', { cwd: testRepoPath, stdio: 'pipe' });
+
+    extractor = new LocalGitExtractor(testRepoPath);
+  });
+
+  afterAll(() => {
+    // Cleanup
+    fs.rmSync(testRepoPath, { recursive: true, force: true });
+  });
+
+  describe('getCommits', () => {
+    it('should return commits in reverse chronological order', async () => {
+      const commits = await extractor.getCommits();
+
+      expect(commits.length).toBe(4);
+      expect(commits[0].subject).toBe('refactor: update file1');
+      expect(commits[3].subject).toBe('Initial commit');
+    });
+
+    it('should respect limit option', async () => {
+      const commits = await extractor.getCommits({ limit: 2 });
+
+      expect(commits.length).toBe(2);
+      expect(commits[0].subject).toBe('refactor: update file1');
+      expect(commits[1].subject).toBe('fix: bug fix PR #456');
+    });
+
+    it('should include author information', async () => {
+      const commits = await extractor.getCommits({ limit: 1 });
+
+      expect(commits[0].author.name).toBe('Test User');
+      expect(commits[0].author.email).toBe('test@example.com');
+      expect(commits[0].author.date).toMatch(/^\d{4}-\d{2}-\d{2}T/);
+    });
+
+    it('should include file changes', async () => {
+      const commits = await extractor.getCommits({ limit: 1 });
+
+      expect(commits[0].files.length).toBeGreaterThan(0);
+      expect(commits[0].files[0].path).toBe('file1.ts');
+      expect(commits[0].stats.filesChanged).toBe(1);
+    });
+
+    it('should extract issue references from message', async () => {
+      const commits = await extractor.getCommits();
+      const issueCommit = commits.find((c) => c.subject.includes('#123'));
+
+      expect(issueCommit).toBeDefined();
+      expect(issueCommit?.refs.issueRefs).toContain(123);
+    });
+
+    it('should extract PR references from message', async () => {
+      const commits = await extractor.getCommits();
+      const prCommit = commits.find((c) => c.subject.includes('PR #456'));
+
+      expect(prCommit).toBeDefined();
+      expect(prCommit?.refs.prRefs).toContain(456);
+    });
+
+    it('should filter by path', async () => {
+      const commits = await extractor.getCommits({ path: 'file1.ts' });
+
+      expect(commits.length).toBe(2); // Initial add and update
+      expect(commits.every((c) => c.files.some((f) => f.path === 'file1.ts'))).toBe(true);
+    });
+
+    it('should handle empty repository gracefully', async () => {
+      const emptyRepoPath = fs.mkdtempSync(path.join(os.tmpdir(), 'git-empty-test-'));
+
+      try {
+        execSync('git init', { cwd: emptyRepoPath, stdio: 'pipe' });
+        const emptyExtractor = new LocalGitExtractor(emptyRepoPath);
+
+        // Should not throw, just return empty array
+        const commits = await emptyExtractor.getCommits();
+        expect(commits).toEqual([]);
+      } finally {
+        // Remove the temp repo even when an assertion above fails
+        fs.rmSync(emptyRepoPath, { recursive: true, force: true });
+      }
+    });
+  });
+
+  describe('getCommit', () => {
+    it('should return a single commit by hash', async () => {
+      const commits = await extractor.getCommits({ limit: 1 });
+      const hash = commits[0].hash;
+
+      const commit = await extractor.getCommit(hash);
+
+      expect(commit).not.toBeNull();
+      expect(commit?.hash).toBe(hash);
+      expect(commit?.subject).toBe('refactor: update file1');
+    });
+
+    it('should return null for non-existent hash', async () => {
+      const commit = await extractor.getCommit('0000000000000000000000000000000000000000');
+
+      expect(commit).toBeNull();
+    });
+
+    it('should work with short hash', async () => {
+      const commits = await extractor.getCommits({ limit: 1 });
+      const shortHash = commits[0].shortHash;
+
+      const commit = await extractor.getCommit(shortHash);
+
+      expect(commit).not.toBeNull();
+      expect(commit?.shortHash).toBe(shortHash);
+    });
+  });
+
+  describe('getRepositoryInfo', () => {
+    it('should return repository information', async () => {
+      const info = await extractor.getRepositoryInfo();
+
+      expect(info.branch).toBeDefined();
+      expect(info.head).toMatch(/^[0-9a-f]{40}$/);
+      expect(info.dirty).toBe(false);
+    });
+
+    it('should detect dirty state', async () => {
+      // Create uncommitted change
+      const dirtyFile = path.join(testRepoPath, 'uncommitted.txt');
+      fs.writeFileSync(dirtyFile, 'dirty');
+
+      try {
+        const info = await extractor.getRepositoryInfo();
+        expect(info.dirty).toBe(true);
+      } finally {
+        // Cleanup so later tests see a clean work tree even if the assertion fails
+        fs.unlinkSync(dirtyFile);
+      }
+    });
+  });
+
+  describe('getBlame', () => {
+    it('should return blame information for a file', async () => {
+      const blame = await extractor.getBlame('file1.ts');
+
+      expect(blame.file).toBe('file1.ts');
+      expect(blame.lines.length).toBe(2); // Two lines in file
+      expect(blame.lines[0].lineNumber).toBe(1);
+      expect(blame.lines[0].content).toBe('export const x = 1;');
+      expect(blame.lines[0].commit.author).toBe('Test User');
+    });
+
+    it('should support line range', async () => {
+      const blame = await extractor.getBlame('file1.ts', { startLine: 1, endLine: 1 });
+
+      expect(blame.lines.length).toBe(1);
+      expect(blame.lines[0].lineNumber).toBe(1);
+    });
+  });
+
+  describe('reference extraction', () => {
+    it('should extract multiple issue references', async () => {
+      // Create commit with multiple refs
+      fs.writeFileSync(path.join(testRepoPath, 'multi.ts'), 'multi');
+      execSync('git add multi.ts', { cwd: testRepoPath, stdio: 'pipe' });
+      execSync('git commit -m "fix: resolve #1, #2, and #3"', { cwd: testRepoPath, stdio: 'pipe' });
+
+      const commits = await extractor.getCommits({ limit: 1 });
+
+      expect(commits[0].refs.issueRefs).toContain(1);
+      expect(commits[0].refs.issueRefs).toContain(2);
+      expect(commits[0].refs.issueRefs).toContain(3);
+    });
+
+    it('should not confuse PR refs with issue refs', async () => {
+      fs.writeFileSync(path.join(testRepoPath, 'pr-test.ts'), 'pr');
+      execSync('git add pr-test.ts', { cwd: testRepoPath, stdio: 'pipe' });
+      execSync('git commit -m "Merge pull request #999 from branch"', {
+        cwd: testRepoPath,
+        stdio: 'pipe',
+      });
+
+      const commits = await extractor.getCommits({ limit: 1 });
+
+      expect(commits[0].refs.prRefs).toContain(999);
+      expect(commits[0].refs.issueRefs).not.toContain(999);
+    });
+  });
+
+  describe('file change parsing', () => {
+    it('should track additions and deletions', async () => {
+      const commits = await extractor.getCommits();
+      const updateCommit = commits.find((c) => c.subject === 'refactor: update file1');
+
+      expect(updateCommit).toBeDefined();
+      expect(updateCommit?.stats.additions).toBeGreaterThan(0);
+    });
+
+    it('should handle file renames', async () => {
+      // Create and rename a file
+      fs.writeFileSync(path.join(testRepoPath, 'old-name.ts'), 'content');
+      execSync('git add old-name.ts', { cwd: testRepoPath, stdio: 'pipe' });
+      execSync('git commit -m "add file to rename"', { cwd: testRepoPath, stdio: 'pipe' });
+
+      fs.renameSync(path.join(testRepoPath, 'old-name.ts'), path.join(testRepoPath, 'new-name.ts'));
+      execSync('git add -A', { cwd: testRepoPath, stdio: 'pipe' });
+      execSync('git commit -m "rename file"', { cwd: testRepoPath, stdio: 'pipe' });
+
+      const commits = await extractor.getCommits({ limit: 1 });
+
+      // Note: git may or may not detect this as a rename depending on similarity
+      // Just verify there are file changes
+      expect(commits[0].files.length).toBeGreaterThan(0);
+    });
+  });
+});
diff --git a/packages/core/src/git/extractor.ts b/packages/core/src/git/extractor.ts
new file mode 100644
index 0000000..5b1efcf
--- /dev/null
+++ b/packages/core/src/git/extractor.ts
@@ -0,0 +1,491 @@
+/**
+ * Git Extractor
+ *
+ * Extracts git history data by shelling out to git commands.
+ * Designed as an interface for future pluggability (GitHub API, etc.)
+ */
+
+import { execFileSync, execSync } from 'node:child_process';
+import { basename } from 'node:path';
+import type {
+  BlameOptions,
+  GetCommitsOptions,
+  GitBlame,
+  GitBlameLine,
+  GitCommit,
+  GitFileChange,
+  GitPerson,
+  GitRefs,
+  GitRepositoryInfo,
+} from './types';
+
+/**
+ * Abstract interface for git data extraction.
+ * Allows swapping local git for GitHub API in the future.
+ */
+export interface GitExtractor {
+  /** Get commits matching options */
+  getCommits(options?: GetCommitsOptions): Promise<GitCommit[]>;
+
+  /** Get a single commit by hash */
+  getCommit(hash: string): Promise<GitCommit | null>;
+
+  /** Get blame for a file, optionally limited to a line range */
+  getBlame(file: string, options?: BlameOptions): Promise<GitBlame>;
+
+  /** Get repository info */
+  getRepositoryInfo(): Promise<GitRepositoryInfo>;
+}
+
+/** Field separator for git log parsing */
+const FIELD_SEP = '␞'; // U+241E SYMBOL FOR RECORD SEPARATOR (printable stand-in, not the ASCII control char)
+/** Record separator for git log parsing */
+const RECORD_SEP = '␟'; // U+241F SYMBOL FOR UNIT SEPARATOR (printable stand-in, not the ASCII control char)
+
+/**
+ * Git log format string
+ * Fields: hash, short hash, author name, author email, author date,
+ * committer name, committer email, committer date, subject, body, parents
+ *
+ * We use COMMIT_START marker to reliably split commits since body can contain newlines
+ */
+const COMMIT_START = '::COMMIT_START::';
+const LOG_FORMAT = [
+  `${COMMIT_START}%H`, // hash (with marker)
+  '%h', // short hash
+  '%an', // author name
+  '%ae', // author email
+  '%aI', // author date (ISO)
+  '%cn', // committer name
+  '%ce', // committer email
+  '%cI', // committer date (ISO)
+  '%s', // subject
+  '%b', // body
+  '%P', // parent hashes
+].join(FIELD_SEP);
+
+/**
+ * Local git implementation using shell commands
+ */
+export class LocalGitExtractor implements GitExtractor {
+  constructor(private repositoryPath: string) {}
+
+  /**
+   * Get commits matching the given options
+   */
+  async getCommits(options: GetCommitsOptions = {}): Promise<GitCommit[]> {
+    const {
+      limit = 100,
+      since,
+      until,
+      author,
+      path,
+      follow = true,
+      noMerges = true,
+      startFrom,
+    } = options;
+
+    // Build git log command
+    const args: string[] = [
+      'log',
+      `--format=${LOG_FORMAT}${RECORD_SEP}`,
+      '--numstat',
+      `-n${limit}`,
+    ];
+
+    if (noMerges) args.push('--no-merges');
+    if (since) args.push(`--since=${since}`);
+    if (until) args.push(`--until=${until}`);
+    if (author) args.push(`--author=${author}`);
+    if (startFrom) args.push(startFrom);
+    if (path) {
+      if (follow) args.push('--follow');
+      args.push('--', path);
+    }
+
+    const output = this.execGit(args);
+    if (!output.trim()) {
+      return [];
+    }
+
+    return this.parseLogOutput(output);
+  }
+
+  /**
+   * Get a single commit by hash
+   */
+  async getCommit(hash: string): Promise<GitCommit | null> {
+    try {
+      const args = ['show', `--format=${LOG_FORMAT}${RECORD_SEP}`, '--numstat', hash];
+
+      const output = this.execGit(args);
+      if (!output.trim()) {
+        return null;
+      }
+
+      const commits = this.parseLogOutput(output);
+      return commits[0] || null;
+    } catch {
+      return null;
+    }
+  }
+
+  /**
+   * Get blame for a file
+   * @throws Error - If the underlying `git blame` command fails
+   */
+  async getBlame(file: string, options?: BlameOptions): Promise<GitBlame> {
+    const args = ['blame', '-l', '-t', '--line-porcelain'];
+
+    if (options?.startLine && options?.endLine) {
+      args.push(`-L${options.startLine},${options.endLine}`);
+    }
+
+    args.push('--', file);
+
+    const output = this.execGit(args);
+    return this.parseBlameOutput(file, output);
+  }
+
+  /**
+   * Get repository information
+   */
+  async getRepositoryInfo(): Promise<GitRepositoryInfo> {
+    // Get remote URL
+    let remote: string | null = null;
+    let owner: string | null = null;
+    let name = '';
+
+    try {
+      remote = this.execGit(['remote', 'get-url', 'origin']).trim();
+      const parsed = this.parseRemoteUrl(remote);
+      owner = parsed.owner;
+      name = parsed.name;
+    } catch {
+      // No remote configured
+      name = basename(this.repositoryPath) || 'unknown';
+    }
+
+    // Get current branch
+    let branch = 'HEAD';
+    try {
+      branch = this.execGit(['rev-parse', '--abbrev-ref', 'HEAD']).trim();
+    } catch {
+      // Detached HEAD or other issue
+    }
+
+    // Get HEAD commit
+    let head = '';
+    try {
+      head = this.execGit(['rev-parse', 'HEAD']).trim();
+    } catch {
+      // Empty repo
+    }
+
+    // Check for uncommitted changes
+    let dirty = false;
+    try {
+      const status = this.execGit(['status', '--porcelain']);
+      dirty = status.trim().length > 0;
+    } catch {
+      // Ignore
+    }
+
+    return { name, remote, owner, branch, head, dirty };
+  }
+
+  /**
+   * Execute a git command and return stdout.
+   *
+   * Arguments are handed to git as an argv array (no shell), so paths with
+   * spaces and caller-supplied values cannot be re-interpreted by a shell.
+   */
+  private execGit(args: string[]): string {
+    try {
+      return execFileSync('git', args, {
+        cwd: this.repositoryPath,
+        encoding: 'utf-8',
+        maxBuffer: 50 * 1024 * 1024, // 50MB for large repos
+        stdio: ['pipe', 'pipe', 'pipe'],
+      });
+    } catch (error) {
+      // Check for empty repo or other expected errors
+      const message = error instanceof Error ? error.message : String(error);
+      if (message.includes('does not have any commits yet')) {
+        return '';
+      }
+      throw error;
+    }
+  }
+
+  /**
+   * Parse git log output into commits
+   */
+  private parseLogOutput(output: string): GitCommit[] {
+    const commits: GitCommit[] = [];
+
+    // Split by commit start marker
+    const records = output.split(COMMIT_START).filter((r) => r.trim());
+
+    for (const record of records) {
+      const commit = this.parseCommitRecord(record);
+      if (commit) {
+        commits.push(commit);
+      }
+    }
+
+    return commits;
+  }
+
+  /**
+   * Parse a single commit record
+   */
+  private parseCommitRecord(record: string): GitCommit | null {
+    // Record format: HASH␞shortHash␞...␞parents␟\n\nnumstat lines
+    // The record separator (␟) marks end of metadata, then numstat follows
+
+    // Split on record separator to separate metadata from numstat
+    const [metadataPart, numstatPart] = record.split(RECORD_SEP);
+    if (!metadataPart) return null;
+
+    const fields = metadataPart.split(FIELD_SEP);
+
+    if (fields.length < 11) {
+      // Invalid format
+      return null;
+    }
+
+    const [
+      hash,
+      shortHash,
+      authorName,
+      authorEmail,
+      authorDate,
+      committerName,
+      committerEmail,
+      committerDate,
+      subject,
+      body,
+      parentStr,
+    ] = fields;
+
+    // Parse file changes from numstat
+    const files: GitFileChange[] = [];
+    let additions = 0;
+    let deletions = 0;
+
+    if (numstatPart) {
+      const numstatLines = numstatPart.trim().split('\n');
+      for (const line of numstatLines) {
+        const trimmed = line.trim();
+        if (!trimmed) continue;
+
+        const fileChange = this.parseNumstatLine(trimmed);
+        if (fileChange) {
+          files.push(fileChange);
+          additions += fileChange.additions;
+          deletions += fileChange.deletions;
+        }
+      }
+    }
+
+    // Extract references from message
+    const fullMessage = body ? `${subject}\n\n${body}` : subject;
+    const refs = this.extractRefs(fullMessage);
+
+    const author: GitPerson = {
+      name: authorName,
+      email: authorEmail,
+      date: authorDate,
+    };
+
+    const committer: GitPerson = {
+      name: committerName,
+      email: committerEmail,
+      date: committerDate,
+    };
+
+    return {
+      hash,
+      shortHash,
+      message: fullMessage,
+      subject,
+      body: body || '',
+      author,
+      committer,
+      files,
+      stats: {
+        additions,
+        deletions,
+        filesChanged: files.length,
+      },
+      refs,
+      parents: parentStr ? parentStr.split(' ').filter(Boolean) : [],
+    };
+  }
+
+  /**
+   * Parse a numstat line (additions, deletions, path)
+   */
+  private parseNumstatLine(line: string): GitFileChange | null {
+    // Format: "10\t5\tpath/to/file" or "10\t5\told => new" for renames
+    const parts = line.split('\t');
+    if (parts.length < 3) return null;
+
+    const [addStr, delStr, ...pathParts] = parts;
+    const pathStr = pathParts.join('\t');
+
+    // Handle binary files (shown as -)
+    const additions = addStr === '-' ? 0 : parseInt(addStr, 10) || 0;
+    const deletions = delStr === '-' ? 0 : parseInt(delStr, 10) || 0;
+
+    // Check for rename (old => new format)
+    const renameMatch = pathStr.match(/^(.+?)\s*=>\s*(.+)$/);
+    if (renameMatch) {
+      // Handle renames with {} notation: path/{old => new}/file
+      // Either side may be empty ("dir/{ => sub}/file"), so match with `*`
+      const braceMatch = pathStr.match(/^(.*?)\{(.*?)\s*=>\s*(.*?)\}(.*)$/);
+
+      if (braceMatch) {
+        const [, prefix, oldPart, newPart, suffix] = braceMatch;
+        // An empty side leaves a doubled or leading "/" behind; numstat paths are repo-relative
+        const join = (middle: string) =>
+          `${prefix}${middle}${suffix}`.replace(/\/+/g, '/').replace(/^\//, '');
+        return {
+          path: join(newPart),
+          previousPath: join(oldPart),
+          status: 'renamed',
+          additions,
+          deletions,
+        };
+      }
+
+      return {
+        path: renameMatch[2].trim(),
+        previousPath: renameMatch[1].trim(),
+        status: 'renamed',
+        additions,
+        deletions,
+      };
+    }
+
+    // numstat alone cannot distinguish an added file from a modified one,
+    // so every non-rename entry is reported as 'modified'.
+    return {
+      path: pathStr,
+      status: 'modified',
+      additions,
+      deletions,
+    };
+  }
+
+  /**
+   * Extract issue and PR references from commit message
+   */
+  private extractRefs(message: string): GitRefs {
+    const issueRefs: number[] = [];
+    const prRefs: number[] = [];
+
+    // Match PR references: "PR #123", "pull request #123", "Merge pull request #123"
+    const prMatches = message.matchAll(/(?:PR\s*#|pull\s+request\s*#|Merge pull request #)(\d+)/gi);
+    for (const match of prMatches) {
+      const num = parseInt(match[1], 10);
+      if (!prRefs.includes(num)) {
+        prRefs.push(num);
+      }
+    }
+
+    // Match issue references: #123 (but not PR #123)
+    // Use negative lookbehind to exclude PR references
+    const issueMatches = message.matchAll(
+      /(?;
+  /** First commit date */
+  firstCommit: string;
+  /** Last commit date */
+  lastCommit: string;
+}
+
+/**
+ * Result of git indexing
+ */
+export interface GitIndexResult {
+  /** Number of commits indexed */
+  commitsIndexed: number;
+  /** Time taken in ms */
+  durationMs: number;
+  /** Any errors encountered */
+  errors: string[];
+}
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index e51be77..2647a1b 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -3,6 +3,7 @@
 export * from './api';
 export * from './context';
 export * from './events';
+export * from './git';
 export * from './github';
 export * from './indexer';
 export * from './map';