diff --git a/PLAN.md b/PLAN.md index b77868a..7b0007e 100644 --- a/PLAN.md +++ b/PLAN.md @@ -240,6 +240,40 @@ Git history is valuable context that LLMs can't easily access. We add intelligen --- +## Current: Performance & Reliability (v0.6.x - v0.7.x) + +> Critical high-impact improvements for production readiness and user experience. + +**Epic:** #104 (Progress: 6/9 complete) + +### Completed Improvements ✅ + +| Feature | Status | Version | Impact | +|---------|--------|---------|--------| +| Index size reporting | ✅ Done | v0.4.3 | Track disk usage growth | +| Adaptive concurrency | ✅ Done | v0.6.0 | Auto-detect optimal batch size by CPU/memory | +| Incremental indexing | ✅ Done | v0.5.1 | <30s updates for single file changes (#122) | +| Progress indicators | ✅ Done | v0.1.0 | Real-time feedback for long operations | +| Error handling | ✅ Done | v0.3.0 | Graceful degradation | +| Basic validation | ✅ Done | v0.2.0 | Git repo and path checks | + +### Remaining Work 🔄 + +| Issue | Priority | Impact | Status | +|-------|----------|--------|--------| +| #152 - MCP lazy initialization | P0 | Reduce startup from 2-5s to <500ms | 🔲 Todo | +| #153 - GitHub history in planner | P0 | Add commit context to AI plans | 🔲 Todo | +| #154 - Memory monitoring | P1 | Prevent leaks, maintain <500MB usage | 🔲 Todo | + +**Success Metrics:** +- ✅ Large repo indexing: <5min for 50k files +- ✅ Incremental updates: <30s for single file changes +- 🔲 MCP server startup: <500ms (currently 2-5s) +- 🔲 Memory usage: <500MB steady state +- 🔲 Planner quality: Include git history context + +--- + ## Next: Extended Git Intelligence (v0.5.0) > Building on git history with deeper insights. @@ -277,7 +311,195 @@ Git history is valuable context that LLMs can't easily access. We add intelligen --- -## Future: Extended Intelligence (v0.6+) +## Next: Dashboard & Visualization (v0.7.1) + +> Making codebase insights visible and accessible. 
+ +**Epic:** #145 + +### Philosophy + +Dev-agent provides rich context about codebases, but it's currently text-only. A dashboard makes insights: +- **Visible** - See language breakdown, component types, health status at a glance +- **Interactive** - Explore relationships, drill into packages +- **Actionable** - Identify areas needing attention + +### Goals + +1. **Enhanced CLI** (`dev dashboard`) - Terminal-based stats with rich formatting +2. **Web Dashboard** - Next.js app with real-time insights +3. **Data Infrastructure** - Aggregate stats during indexing for efficient display + +### Components + +| Component | Status | Priority | +|-----------|--------|----------| +| **CLI Enhancements** | | | +| Language breakdown display | 🔲 Todo | 🔴 High | +| Component type statistics | 🔲 Todo | 🔴 High | +| Package-level stats (monorepo) | 🔲 Todo | 🔴 High | +| Rich formatting (tables, colors) | 🔲 Todo | 🔴 High | +| **Core Data Collection** | | | +| Track language metrics in indexer | 🔲 Todo | 🔴 High | +| Aggregate component type counts | 🔲 Todo | 🔴 High | +| Package-level aggregation | 🔲 Todo | 🟡 Medium | +| Change frequency tracking | 🔲 Todo | 🟡 Medium | +| **Web Dashboard** | | | +| Next.js app setup (`apps/dashboard/`) | 🔲 Todo | 🔴 High | +| Tremor component library | 🔲 Todo | 🔴 High | +| API routes (stats, health) | 🔲 Todo | 🔴 High | +| Real-time stats display | 🔲 Todo | 🔴 High | +| Language distribution charts | 🔲 Todo | 🟡 Medium | +| Component type visualizations | 🔲 Todo | 🟡 Medium | +| Health status indicators | 🔲 Todo | 🟡 Medium | +| Vector index metrics (simple) | 🔲 Todo | 🟡 Medium | +| Basic package list (monorepo) | 🔲 Todo | 🟡 Medium | + +### Architecture + +``` +apps/ +└── dashboard/ # Next.js 16 + React 19 + Tremor + ├── app/ + │ ├── page.tsx # Main dashboard + │ └── api/ + │ └── stats/ # Next.js API routes + └── components/ + └── tremor/ # Tremor dashboard components + +packages/core/ +└── src/ + └── indexer/ + └── stats-aggregator.ts # New: Collect detailed 
stats +``` + +### Implementation Plan + +The work is delivered in four phases: + +**Phase 1: Data Foundation** +- Enhance IndexStats with language/component breakdowns +- Aggregate stats during indexing (minimal overhead) +- Foundation for all visualizations + +**Phase 2: CLI Enhancements** +- Rich terminal output with tables and colors +- Package-level breakdown for monorepos +- Immediate user value + +**Phase 3: Web Dashboard** +- Next.js 16 app in `apps/dashboard/` +- Tremor component setup +- Basic stats display with charts + +**Phase 4: Advanced Features** +- Interactive exploration +- Package explorer (monorepo support) +- Real-time updates + +--- + +## Next: Advanced LanceDB Visualizations (v0.7.2) + +> Making vector embeddings visible and explorable. + +### Philosophy + +LanceDB stores 384-dimensional embeddings for semantic search, but these are invisible to users. Advanced visualizations reveal: +- **Where code lives** in semantic space (2D projections) +- **What's related** beyond imports (similarity networks) +- **How embeddings evolve** over time (drift tracking) +- **Search quality** insights (what works, what doesn't) + +### Goals + +1. **Semantic Code Map** - 2D/3D projection of vector space +2. **Similarity Explorer** - Interactive component relationship graph +3. **Search Quality Dashboard** - Analyze search performance +4. 
**Embedding Health** - Coverage and quality metrics per directory + +### Components + +| Component | Description | Priority | +|-----------|-------------|----------| +| **Semantic Code Map** | | | +| t-SNE/UMAP projection to 2D | Visualize embedding space | 🔴 High | +| Interactive scatter plot | Click to see code snippet | 🔴 High | +| Color by language/type | Visual code categorization | 🟡 Medium | +| Cluster detection | Auto-identify code groups | 🟡 Medium | +| **Similarity Network** | | | +| Component relationship graph | Force-directed layout | 🔴 High | +| Semantic similarity edges | Show hidden relationships | 🔴 High | +| Interactive exploration | Zoom, pan, filter | 🟡 Medium | +| Duplication detection | High similarity alerts | 🟡 Medium | +| **Search Quality** | | | +| Search metrics dashboard | Track performance over time | 🔴 High | +| Query similarity heatmap | Understand search patterns | 🟡 Medium | +| "Dead zone" detection | Queries with poor results | 🟡 Medium | +| Recommendation engine | Suggest better queries | 🟢 Low | +| **Embedding Health** | | | +| Coverage heatmap by directory | Identify blind spots | 🔴 High | +| Quality scoring per file | Flag low-quality embeddings | 🟡 Medium | +| Drift tracking over time | Monitor embedding changes | 🟡 Medium | +| Re-index recommendations | Suggest what needs updating | 🟢 Low | + +### Architecture + +``` +Dashboard UI + ↓ +Advanced Viz Components (D3.js, Plotly, or similar) + ↓ +New API Routes + ├─ GET /api/embeddings/projection (t-SNE/UMAP data) + ├─ GET /api/embeddings/similarity (network graph) + ├─ GET /api/embeddings/quality (coverage metrics) + └─ GET /api/embeddings/search-history (query analysis) + ↓ +LanceDB + Vector Analysis + └─ Dimensionality reduction, similarity queries, metrics +``` + +### Dependencies + +**New:** +- `umap-js` or `tsne-js` - Dimensionality reduction +- `d3` or `@visx/visx` - Advanced visualizations +- `react-force-graph` - Network graphs (or `sigma.js`) +- `@tensorflow/tfjs` 
(optional) - Advanced vector operations + +### Implementation Phases + +**Phase 1: Semantic Code Map** +- Implement t-SNE/UMAP projection +- Create 2D scatter plot visualization +- Add basic interactivity (hover, click) + +**Phase 2: Similarity Network** +- Build component similarity graph +- Implement force-directed layout +- Add filtering and exploration + +**Phase 3: Search Quality** +- Track search queries and results +- Build metrics dashboard +- Implement quality scoring + +**Phase 4: Embedding Health** +- Coverage analysis by directory +- Quality scoring per file +- Drift detection system + +### Success Metrics + +- Developers can visually explore codebase semantics +- Identify code duplication without running analysis tools +- Understand which areas need re-indexing +- Improve search query formulation based on insights + +--- + +## Future: Extended Intelligence (v0.8+) ### Multi-Language Support diff --git a/packages/core/src/indexer.ts b/packages/core/src/indexer.ts new file mode 100644 index 0000000..2bd6a56 --- /dev/null +++ b/packages/core/src/indexer.ts @@ -0,0 +1,8 @@ +/** + * Repository Indexer module exports + */ + +export { RepositoryIndexer } from './indexer/index'; +export { StatsAggregator } from './indexer/stats-aggregator'; +export * from './indexer/types'; +export * from './indexer/utils'; diff --git a/packages/core/src/indexer/__tests__/detailed-stats.integration.test.ts b/packages/core/src/indexer/__tests__/detailed-stats.integration.test.ts new file mode 100644 index 0000000..90c3db7 --- /dev/null +++ b/packages/core/src/indexer/__tests__/detailed-stats.integration.test.ts @@ -0,0 +1,274 @@ +import * as fs from 'node:fs/promises'; +import * as os from 'node:os'; +import * as path from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { RepositoryIndexer } from '../index'; +import type { DetailedIndexStats } from '../types'; + +describe('Detailed Stats Integration', () => { + let testDir: string; + 
let vectorStorePath: string; + + beforeEach(async () => { + // Create a unique temp directory; mkdtemp appends a random suffix, so runs started in the same millisecond cannot collide + const tmpPrefix = path.join(os.tmpdir(), 'test-detailed-stats-'); + testDir = await fs.mkdtemp(tmpPrefix); + vectorStorePath = path.join(testDir, 'vectors.lance'); + }); + + afterEach(async () => { + // Cleanup + await fs.rm(testDir, { recursive: true, force: true }); + }); + + it('should collect detailed language stats', async () => { + // Create test files with different languages + const srcDir = path.join(testDir, 'src'); + await fs.mkdir(srcDir, { recursive: true }); + + await fs.writeFile( + path.join(srcDir, 'test.ts'), + ` + export function hello(): string { + return "Hello from TypeScript"; + } + + export class Greeter { + greet(): string { + return "Hello"; + } + } + ` + ); + + await fs.writeFile( + path.join(srcDir, 'test.js'), + ` + function goodbye() { + return "Goodbye from JavaScript"; + } + + module.exports = { goodbye }; + ` + ); + + // Index the repository + const indexer = new RepositoryIndexer({ + repositoryPath: testDir, + vectorStorePath, + }); + + await indexer.initialize(); + const stats = (await indexer.index()) as DetailedIndexStats; + await indexer.close(); + + // Verify language stats + expect(stats.byLanguage).toBeDefined(); + expect(stats.byLanguage?.typescript).toBeDefined(); + expect(stats.byLanguage?.javascript).toBeDefined(); + + // TypeScript should have 2 components (function + class) + expect(stats.byLanguage?.typescript.files).toBe(1); + expect(stats.byLanguage?.typescript.components).toBeGreaterThanOrEqual(2); + + // JavaScript should have 1 component (function) + expect(stats.byLanguage?.javascript.files).toBe(1); + expect(stats.byLanguage?.javascript.components).toBeGreaterThanOrEqual(1); + }); + + it('should collect component type stats', async () => { + const srcDir = path.join(testDir, 'src'); + await fs.mkdir(srcDir, { recursive: true }); + + await fs.writeFile( + path.join(srcDir, 'components.ts'), + ` + export function
myFunction(): void {} + export class MyClass {} + export interface MyInterface { + prop: string; + } + export type MyType = string | number; + export const useCustomHook = () => { + return { value: 42 }; + }; + ` + ); + + const indexer = new RepositoryIndexer({ + repositoryPath: testDir, + vectorStorePath, + }); + + await indexer.initialize(); + const stats = (await indexer.index()) as DetailedIndexStats; + await indexer.close(); + + // Verify component type stats + expect(stats.byComponentType).toBeDefined(); + expect(stats.byComponentType?.function).toBeGreaterThanOrEqual(1); + expect(stats.byComponentType?.class).toBeGreaterThanOrEqual(1); + expect(stats.byComponentType?.interface).toBeGreaterThanOrEqual(1); + expect(stats.byComponentType?.type).toBeGreaterThanOrEqual(1); + // Variable type might be present if arrow function is detected + if (stats.byComponentType?.variable) { + expect(stats.byComponentType?.variable).toBeGreaterThanOrEqual(1); + } + }); + + it('should collect stats for mixed language repository', async () => { + const srcDir = path.join(testDir, 'src'); + await fs.mkdir(srcDir, { recursive: true }); + + // TypeScript file + await fs.writeFile( + path.join(srcDir, 'utils.ts'), + ` + export function add(a: number, b: number): number { + return a + b; + } + ` + ); + + // JavaScript file + await fs.writeFile( + path.join(srcDir, 'legacy.js'), + ` + function multiply(a, b) { + return a * b; + } + ` + ); + + // Markdown file + await fs.writeFile( + path.join(testDir, 'README.md'), + ` +# Test Project + +This is a test project. 
+ ` + ); + + const indexer = new RepositoryIndexer({ + repositoryPath: testDir, + vectorStorePath, + }); + + await indexer.initialize(); + const stats = (await indexer.index()) as DetailedIndexStats; + await indexer.close(); + + // Should have stats for all three languages + expect(stats.byLanguage).toBeDefined(); + if (stats.byLanguage) { + expect(Object.keys(stats.byLanguage).length).toBeGreaterThanOrEqual(3); + } + + // Verify each language has file count + expect(stats.byLanguage?.typescript?.files).toBeGreaterThanOrEqual(1); + expect(stats.byLanguage?.javascript?.files).toBeGreaterThanOrEqual(1); + expect(stats.byLanguage?.markdown?.files).toBeGreaterThanOrEqual(1); + }); + + it('should handle incremental updates with stats', async () => { + const srcDir = path.join(testDir, 'src'); + await fs.mkdir(srcDir, { recursive: true }); + + // Initial file + await fs.writeFile( + path.join(srcDir, 'initial.ts'), + ` + export function initial(): string { + return "initial"; + } + ` + ); + + const indexer = new RepositoryIndexer({ + repositoryPath: testDir, + vectorStorePath, + }); + + await indexer.initialize(); + const _initialStats = (await indexer.index()) as DetailedIndexStats; + + // Add new file + await fs.writeFile( + path.join(srcDir, 'added.js'), + ` + function added() { + return "added"; + } + ` + ); + + // Wait a bit to ensure timestamp difference + await new Promise((resolve) => setTimeout(resolve, 100)); + + // Update index + const updateStats = (await indexer.update({ + since: new Date(Date.now() - 1000), + })) as DetailedIndexStats; + + await indexer.close(); + + // Verify update stats show the new JavaScript file + expect(updateStats.byLanguage).toBeDefined(); + expect(updateStats.byLanguage?.javascript).toBeDefined(); + expect(updateStats.byLanguage?.javascript.files).toBeGreaterThanOrEqual(1); + }); + + it('should calculate line counts correctly', async () => { + const srcDir = path.join(testDir, 'src'); + await fs.mkdir(srcDir, { recursive: true }); 
+ + await fs.writeFile( + path.join(srcDir, 'multiline.ts'), + ` + export function longFunction(): void { + // Line 1 + // Line 2 + // Line 3 + // Line 4 + // Line 5 + console.log("This is a long function"); + } + ` + ); + + const indexer = new RepositoryIndexer({ + repositoryPath: testDir, + vectorStorePath, + }); + + await indexer.initialize(); + const stats = (await indexer.index()) as DetailedIndexStats; + await indexer.close(); + + // Verify line count is captured + expect(stats.byLanguage?.typescript.lines).toBeGreaterThan(0); + }); + + it('should handle empty repository gracefully', async () => { + // Empty directory - no source files + const indexer = new RepositoryIndexer({ + repositoryPath: testDir, + vectorStorePath, + }); + + await indexer.initialize(); + const stats = (await indexer.index()) as DetailedIndexStats; + await indexer.close(); + + // Should have empty stats + expect(stats.byLanguage).toBeDefined(); + if (stats.byLanguage) { + expect(Object.keys(stats.byLanguage).length).toBe(0); + } + expect(stats.byComponentType).toBeDefined(); + if (stats.byComponentType) { + expect(Object.keys(stats.byComponentType).length).toBe(0); + } + }); +}); diff --git a/packages/core/src/indexer/__tests__/stats-aggregator.test.ts b/packages/core/src/indexer/__tests__/stats-aggregator.test.ts new file mode 100644 index 0000000..7b71eb3 --- /dev/null +++ b/packages/core/src/indexer/__tests__/stats-aggregator.test.ts @@ -0,0 +1,466 @@ +import { beforeEach, describe, expect, it } from 'vitest'; +import type { Document } from '../../scanner/types'; +import { StatsAggregator } from '../stats-aggregator'; + +describe('StatsAggregator', () => { + let aggregator: StatsAggregator; + + beforeEach(() => { + aggregator = new StatsAggregator(); + }); + + describe('Basic Aggregation', () => { + it('should aggregate language stats', () => { + const tsDoc: Document = { + id: 'test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { 
+ file: 'test.ts', + startLine: 1, + endLine: 3, + name: 'test', + exported: true, + }, + }; + + const jsDoc: Document = { + id: 'test.js:func:1', + text: 'function test() {}', + type: 'function', + language: 'javascript', + metadata: { + file: 'test.js', + startLine: 1, + endLine: 3, + name: 'test', + exported: true, + }, + }; + + aggregator.addDocument(tsDoc); + aggregator.addDocument(jsDoc); + + const stats = aggregator.getDetailedStats(); + + expect(stats.byLanguage.typescript).toEqual({ + files: 1, + components: 1, + lines: 3, + }); + + expect(stats.byLanguage.javascript).toEqual({ + files: 1, + components: 1, + lines: 3, + }); + }); + + it('should aggregate component type stats', () => { + const functionDoc: Document = { + id: 'test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + const classDoc: Document = { + id: 'test.ts:class:5', + text: 'class Test {}', + type: 'class', + language: 'typescript', + metadata: { + file: 'test.ts', + startLine: 5, + endLine: 7, + name: 'Test', + exported: true, + }, + }; + + aggregator.addDocument(functionDoc); + aggregator.addDocument(classDoc); + + const stats = aggregator.getDetailedStats(); + + expect(stats.byComponentType).toEqual({ + function: 1, + class: 1, + }); + }); + + it('should count multiple documents from same file correctly', () => { + const doc1: Document = { + id: 'test.ts:func1:1', + text: 'function one() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + const doc2: Document = { + id: 'test.ts:func2:5', + text: 'function two() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'test.ts', + startLine: 5, + endLine: 7, + exported: true, + }, + }; + + aggregator.addDocument(doc1); + aggregator.addDocument(doc2); + + const stats = aggregator.getDetailedStats(); + 
+ expect(stats.byLanguage.typescript).toEqual({ + files: 1, // Same file + components: 2, // Two components + lines: 6, // 3 + 3 lines + }); + }); + }); + + describe('Package Aggregation', () => { + it('should aggregate package stats', () => { + aggregator.registerPackage('packages/core', '@lytics/dev-agent-core'); + + const doc: Document = { + id: 'packages/core/src/test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'packages/core/src/test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + aggregator.addDocument(doc); + + const stats = aggregator.getDetailedStats(); + + expect(stats.byPackage['packages/core']).toEqual({ + name: '@lytics/dev-agent-core', + path: 'packages/core', + files: 1, + components: 1, + languages: { + typescript: 1, + }, + }); + }); + + it('should handle multiple packages', () => { + aggregator.registerPackage('packages/core', '@lytics/dev-agent-core'); + aggregator.registerPackage('packages/cli', '@lytics/dev-agent-cli'); + + const coreDoc: Document = { + id: 'packages/core/src/test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'packages/core/src/test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + const cliDoc: Document = { + id: 'packages/cli/src/main.ts:func:1', + text: 'function main() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'packages/cli/src/main.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + aggregator.addDocument(coreDoc); + aggregator.addDocument(cliDoc); + + const stats = aggregator.getDetailedStats(); + + expect(stats.byPackage['packages/core'].components).toBe(1); + expect(stats.byPackage['packages/cli'].components).toBe(1); + }); + + it('should match most specific package for nested paths', () => { + aggregator.registerPackage('packages', 'root-package'); + aggregator.registerPackage('packages/core', 
'@lytics/dev-agent-core'); + + const doc: Document = { + id: 'packages/core/src/test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'packages/core/src/test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + aggregator.addDocument(doc); + + const stats = aggregator.getDetailedStats(); + + // Should match the more specific package + expect(stats.byPackage['packages/core'].components).toBe(1); + expect(stats.byPackage['packages'].components).toBe(0); + }); + + it('should handle mixed languages in a package', () => { + aggregator.registerPackage('packages/core', '@lytics/dev-agent-core'); + + const tsDoc: Document = { + id: 'packages/core/src/test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'packages/core/src/test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + const jsDoc: Document = { + id: 'packages/core/src/util.js:func:1', + text: 'function util() {}', + type: 'function', + language: 'javascript', + metadata: { + file: 'packages/core/src/util.js', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + aggregator.addDocument(tsDoc); + aggregator.addDocument(jsDoc); + + const stats = aggregator.getDetailedStats(); + + expect(stats.byPackage['packages/core'].languages).toEqual({ + typescript: 1, + javascript: 1, + }); + }); + }); + + describe('Multiple Languages', () => { + it('should handle all supported languages', () => { + const docs: Document[] = [ + { + id: 'test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { file: 'test.ts', startLine: 1, endLine: 3, exported: true }, + }, + { + id: 'test.js:func:1', + text: 'function test() {}', + type: 'function', + language: 'javascript', + metadata: { file: 'test.js', startLine: 1, endLine: 3, exported: true }, + }, + { + id: 'test.go:func:1', + text: 'func test() {}', + type: 'function', + language: 
'go', + metadata: { file: 'test.go', startLine: 1, endLine: 3, exported: true }, + }, + { + id: 'README.md:doc:1', + text: '# Documentation', + type: 'documentation', + language: 'markdown', + metadata: { file: 'README.md', startLine: 1, endLine: 10, exported: false }, + }, + ]; + + for (const doc of docs) { + aggregator.addDocument(doc); + } + + const stats = aggregator.getDetailedStats(); + + expect(stats.byLanguage.typescript).toBeDefined(); + expect(stats.byLanguage.javascript).toBeDefined(); + expect(stats.byLanguage.go).toBeDefined(); + expect(stats.byLanguage.markdown).toBeDefined(); + }); + }); + + describe('Performance', () => { + it('should handle large numbers of documents efficiently', () => { + const start = Date.now(); + + // Simulate 10,000 documents + for (let i = 0; i < 10000; i++) { + const doc: Document = { + id: `file${i}.ts:func:${i}`, + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: `file${i}.ts`, + startLine: 1, + endLine: 3, + exported: true, + }, + }; + aggregator.addDocument(doc); + } + + const duration = Date.now() - start; + const stats = aggregator.getDetailedStats(); + + // Should complete in reasonable time (<100ms for 10k docs) + expect(duration).toBeLessThan(100); + expect(stats.byLanguage.typescript.files).toBe(10000); + expect(stats.byLanguage.typescript.components).toBe(10000); + }); + }); + + describe('Utility Methods', () => { + it('should reset all stats', () => { + const doc: Document = { + id: 'test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + aggregator.addDocument(doc); + aggregator.reset(); + + const stats = aggregator.getDetailedStats(); + const counts = aggregator.getCounts(); + + expect(Object.keys(stats.byLanguage).length).toBe(0); + expect(Object.keys(stats.byComponentType).length).toBe(0); + expect(counts.files).toBe(0); + }); + + 
it('should provide accurate counts', () => { + aggregator.registerPackage('packages/core', '@lytics/dev-agent-core'); + + const docs: Document[] = [ + { + id: 'test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { file: 'test.ts', startLine: 1, endLine: 3, exported: true }, + }, + { + id: 'test.ts:class:5', + text: 'class Test {}', + type: 'class', + language: 'typescript', + metadata: { file: 'test.ts', startLine: 5, endLine: 7, exported: true }, + }, + ]; + + for (const doc of docs) { + aggregator.addDocument(doc); + } + + const counts = aggregator.getCounts(); + + expect(counts.languages).toBe(1); // Only TypeScript + expect(counts.componentTypes).toBe(2); // function and class + expect(counts.packages).toBe(1); // One registered package + expect(counts.files).toBe(1); // One unique file + }); + }); + + describe('Edge Cases', () => { + it('should handle empty aggregation', () => { + const stats = aggregator.getDetailedStats(); + + expect(stats.byLanguage).toEqual({}); + expect(stats.byComponentType).toEqual({}); + expect(stats.byPackage).toEqual({}); + }); + + it('should handle documents without package', () => { + const doc: Document = { + id: 'src/test.ts:func:1', + text: 'function test() {}', + type: 'function', + language: 'typescript', + metadata: { + file: 'src/test.ts', + startLine: 1, + endLine: 3, + exported: true, + }, + }; + + // No packages registered + aggregator.addDocument(doc); + + const stats = aggregator.getDetailedStats(); + + expect(stats.byLanguage.typescript.files).toBe(1); + expect(Object.keys(stats.byPackage).length).toBe(0); + }); + + it('should handle single-line components', () => { + const doc: Document = { + id: 'test.ts:var:1', + text: 'export const x = 42;', + type: 'variable', + language: 'typescript', + metadata: { + file: 'test.ts', + startLine: 1, + endLine: 1, + exported: true, + }, + }; + + aggregator.addDocument(doc); + + const stats = aggregator.getDetailedStats(); + + 
expect(stats.byLanguage.typescript.lines).toBe(1); + }); + }); +}); diff --git a/packages/core/src/indexer/index.ts b/packages/core/src/indexer/index.ts index 43805f2..f1a8403 100644 --- a/packages/core/src/indexer/index.ts +++ b/packages/core/src/indexer/index.ts @@ -10,7 +10,9 @@ import type { Document } from '../scanner/types'; import { getCurrentSystemResources, getOptimalConcurrency } from '../utils/concurrency'; import { VectorStorage } from '../vector'; import type { EmbeddingDocument, SearchOptions, SearchResult } from '../vector/types'; +import { StatsAggregator } from './stats-aggregator'; import type { + DetailedIndexStats, FileMetadata, IndexError, IndexerConfig, @@ -94,6 +96,12 @@ export class RepositoryIndexer { filesScanned = scanResult.stats.filesScanned; documentsExtracted = scanResult.documents.length; + // Aggregate detailed statistics + const statsAggregator = new StatsAggregator(); + for (const doc of scanResult.documents) { + statsAggregator.addDocument(doc); + } + // Phase 2: Prepare documents for embedding const logger = options.logger?.child({ component: 'indexer' }); logger?.info({ documents: documentsExtracted }, 'Preparing documents for embedding'); @@ -229,7 +237,10 @@ export class RepositoryIndexer { percentComplete: 100, }); - const stats: IndexStats = { + // Get detailed stats from aggregator + const detailedStats = statsAggregator.getDetailedStats(); + + const stats: DetailedIndexStats = { filesScanned, documentsExtracted, documentsIndexed, @@ -239,6 +250,7 @@ export class RepositoryIndexer { startTime, endTime, repositoryPath: this.config.repositoryPath, + ...detailedStats, }; return stats; @@ -326,6 +338,7 @@ export class RepositoryIndexer { // Scan and index changed + added files let documentsExtracted = 0; let documentsIndexed = 0; + const statsAggregator = new StatsAggregator(); if (filesToReindex.length > 0) { const scanResult = await scanRepository({ @@ -337,6 +350,11 @@ export class RepositoryIndexer { documentsExtracted = 
scanResult.documents.length; + // Aggregate detailed statistics + for (const doc of scanResult.documents) { + statsAggregator.addDocument(doc); + } + // Index new documents const embeddingDocuments = prepareDocumentsForEmbedding(scanResult.documents); await this.vectorStorage.addDocuments(embeddingDocuments); @@ -350,6 +368,8 @@ export class RepositoryIndexer { } const endTime = new Date(); + const detailedStats = statsAggregator.getDetailedStats(); + return { filesScanned: filesToReindex.length, documentsExtracted, @@ -360,6 +380,7 @@ export class RepositoryIndexer { startTime, endTime, repositoryPath: this.config.repositoryPath, + ...detailedStats, }; } diff --git a/packages/core/src/indexer/stats-aggregator.ts b/packages/core/src/indexer/stats-aggregator.ts new file mode 100644 index 0000000..c503a78 --- /dev/null +++ b/packages/core/src/indexer/stats-aggregator.ts @@ -0,0 +1,175 @@ +/** + * Statistics aggregator for efficient incremental stats collection during indexing + */ + +import type { Document, DocumentType } from '../scanner/types'; +import type { LanguageStats, PackageStats, SupportedLanguage } from './types'; + +/** + * Efficiently aggregates statistics during indexing with O(1) operations + * Uses streaming aggregation to avoid post-processing overhead + */ +export class StatsAggregator { + private languageStats = new Map(); + private componentTypeStats = new Map(); + private packageStats = new Map(); + private fileToPackage = new Map(); // Cache for package lookups + private processedFiles = new Set(); + + /** + * Add a document to the aggregation + * O(1) operation - just increments counters + */ + addDocument(doc: Document): void { + const language = doc.language as SupportedLanguage; + const file = doc.metadata.file; + const type = doc.type; + + // Track unique files per language + const isNewFile = !this.processedFiles.has(file); + if (isNewFile) { + this.processedFiles.add(file); + } + + // Increment language stats + 
this.incrementLanguage(language, doc, isNewFile); + + // Increment component type stats + this.incrementComponentType(type); + + // Increment package stats (if in monorepo) + this.incrementPackage(file, language, isNewFile); + } + + /** + * Register a package for monorepo support + */ + registerPackage(packagePath: string, packageName: string): void { + if (!this.packageStats.has(packagePath)) { + this.packageStats.set(packagePath, { + name: packageName, + path: packagePath, + files: 0, + components: 0, + languages: {}, + }); + } + } + + /** + * Get aggregated statistics + */ + getDetailedStats(): { + byLanguage: Record; + byComponentType: Record; + byPackage: Record; + } { + return { + byLanguage: Object.fromEntries(this.languageStats) as Record< + SupportedLanguage, + LanguageStats + >, + byComponentType: Object.fromEntries(this.componentTypeStats), + byPackage: Object.fromEntries(this.packageStats), + }; + } + + /** + * Reset all stats (useful for testing) + */ + reset(): void { + this.languageStats.clear(); + this.componentTypeStats.clear(); + this.packageStats.clear(); + this.fileToPackage.clear(); + this.processedFiles.clear(); + } + + /** + * Get current counts for monitoring + */ + getCounts(): { + languages: number; + componentTypes: number; + packages: number; + files: number; + } { + return { + languages: this.languageStats.size, + componentTypes: this.componentTypeStats.size, + packages: this.packageStats.size, + files: this.processedFiles.size, + }; + } + + // Private helpers + + private incrementLanguage(language: SupportedLanguage, doc: Document, isNewFile: boolean): void { + if (!this.languageStats.has(language)) { + this.languageStats.set(language, { + files: 0, + components: 0, + lines: 0, + }); + } + + const stats = this.languageStats.get(language); + if (!stats) return; // Should never happen, but guard for type safety + + if (isNewFile) { + stats.files++; + } + stats.components++; + + // Approximate lines from component range + const lines = 
doc.metadata.endLine - doc.metadata.startLine + 1; + stats.lines += lines; + } + + private incrementComponentType(type: DocumentType | string): void { + const current = this.componentTypeStats.get(type) || 0; + this.componentTypeStats.set(type, current + 1); + } + + private incrementPackage(file: string, language: SupportedLanguage, isNewFile: boolean): void { + // Find package for this file (cached lookup) + let packagePath = this.fileToPackage.get(file); + + if (!packagePath) { + // Find nearest parent package by checking registered packages + packagePath = this.findPackageForFile(file); + if (packagePath) { + this.fileToPackage.set(file, packagePath); + } + } + + if (packagePath) { + const pkg = this.packageStats.get(packagePath); + if (pkg) { + if (isNewFile) { + pkg.files++; + } + pkg.components++; + pkg.languages[language] = (pkg.languages[language] || 0) + 1; + } + } + } + + private findPackageForFile(file: string): string | undefined { + // Find the longest matching package path (most specific) + let bestMatch: string | undefined; + let bestMatchLength = 0; + + for (const packagePath of this.packageStats.keys()) { + // Match only on a path-segment boundary so "packages/core" does not claim "packages/core-utils/..." + if (file === packagePath || file.startsWith(packagePath.endsWith('/') ? packagePath : `${packagePath}/`)) { + if (packagePath.length > bestMatchLength) { + bestMatch = packagePath; + bestMatchLength = packagePath.length; + } + } + } + + return bestMatch; + } +} diff --git a/packages/core/src/indexer/types.ts b/packages/core/src/indexer/types.ts index 79195da..4465faa 100644 --- a/packages/core/src/indexer/types.ts +++ b/packages/core/src/indexer/types.ts @@ -113,6 +113,60 @@ export interface IndexError { timestamp: Date; } +/** + * Supported languages for detailed statistics + */ +export type SupportedLanguage = 'typescript' | 'javascript' | 'go' | 'markdown'; + +/** + * Statistics for a specific language + */ +export interface LanguageStats { + /** Number of files in this language */ + files: number; + + /** Number of components
extracted from this language */ + components: number; + + /** Total lines of code (approximate from component ranges) */ + lines: number; +} + +/** + * Statistics for a package/module in a monorepo + */ +export interface PackageStats { + /** Package name (from package.json or go.mod) */ + name: string; + + /** Package path relative to repository root */ + path: string; + + /** Number of files in this package */ + files: number; + + /** Number of components in this package */ + components: number; + + /** Component count per language within this package */ + languages: Partial<Record<SupportedLanguage, number>>; +} + +/** + * Detailed statistics with language, component type, and package breakdowns + * Extends IndexStats with optional detailed information for backward compatibility + */ +export interface DetailedIndexStats extends IndexStats { + /** Statistics broken down by language */ + byLanguage?: Record<SupportedLanguage, LanguageStats>; + + /** Statistics broken down by component type */ + byComponentType?: Partial<Record<string, number>>; + + /** Statistics broken down by package (for monorepos) */ + byPackage?: Record<string, PackageStats>; +} + /** * Metadata tracked for each indexed file */ diff --git a/packages/core/src/scanner/typescript.ts b/packages/core/src/scanner/typescript.ts index 3a768c2..3ab8a35 100644 --- a/packages/core/src/scanner/typescript.ts +++ b/packages/core/src/scanner/typescript.ts @@ -49,6 +49,16 @@ export class TypeScriptScanner implements Scanner { ); } + /** + * Detect actual language based on file extension + * TypeScript files: .ts, .tsx + * Everything else (.js, .jsx, .mjs, .cjs, ...) is reported as JavaScript + */ + private detectLanguage(filePath: string): 'typescript' | 'javascript' { + const ext = path.extname(filePath).toLowerCase(); + return ext === '.ts' || ext === '.tsx' ?
'typescript' : 'javascript'; + } + /** * Get optimal concurrency level for TypeScript processing */ @@ -439,6 +449,7 @@ export class TypeScriptScanner implements Scanner { const isExported = fn.isExported(); const snippet = this.truncateSnippet(fullText); const callees = this.extractCallees(fn, sourceFile); + const language = this.detectLanguage(file); // Build text for embedding const text = this.buildEmbeddingText({ @@ -446,14 +457,14 @@ export class TypeScriptScanner implements Scanner { name, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${name}:${startLine}`, text, type: 'function', - language: 'typescript', + language, metadata: { file, startLine, @@ -479,6 +490,7 @@ export class TypeScriptScanner implements Scanner { const docComment = this.getDocComment(cls); const isExported = cls.isExported(); const snippet = this.truncateSnippet(fullText); + const language = this.detectLanguage(file); // Get class signature (class name + extends + implements) const extendsClause = cls.getExtends()?.getText() || ''; @@ -493,14 +505,14 @@ export class TypeScriptScanner implements Scanner { name, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${name}:${startLine}`, text, type: 'class', - language: 'typescript', + language, metadata: { file, startLine, @@ -533,20 +545,21 @@ export class TypeScriptScanner implements Scanner { const isPublic = !method.hasModifier(SyntaxKind.PrivateKeyword); const snippet = this.truncateSnippet(fullText); const callees = this.extractCallees(method, sourceFile); + const language = this.detectLanguage(file); const text = this.buildEmbeddingText({ type: 'method', name: `${className}.${name}`, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${className}.${name}:${startLine}`, text, type: 'method', - language: 'typescript', + language, metadata: { file, startLine, @@ -574,6 +587,7 @@ export class TypeScriptScanner implements 
Scanner { const docComment = this.getDocComment(iface); const isExported = iface.isExported(); const snippet = this.truncateSnippet(fullText); + const language = this.detectLanguage(file); // Get interface signature const extendsClause = iface @@ -587,14 +601,14 @@ export class TypeScriptScanner implements Scanner { name, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${name}:${startLine}`, text, type: 'interface', - language: 'typescript', + language, metadata: { file, startLine, @@ -623,20 +637,21 @@ export class TypeScriptScanner implements Scanner { // For type aliases, the full text IS the signature (no body) const signature = fullText; const snippet = this.truncateSnippet(fullText); + const language = this.detectLanguage(file); const text = this.buildEmbeddingText({ type: 'type', name, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${name}:${startLine}`, text, type: 'type', - language: 'typescript', + language, metadata: { file, startLine, @@ -678,6 +693,7 @@ export class TypeScriptScanner implements Scanner { const isExported = varStmt.isExported(); const snippet = this.truncateSnippet(fullText); const callees = this.extractCallees(funcNode, sourceFile); + const language = this.detectLanguage(file); // Check if async const isAsync = funcNode.isAsync?.() ?? false; @@ -701,14 +717,14 @@ export class TypeScriptScanner implements Scanner { name, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${name}:${startLine}`, text, type: 'variable', - language: 'typescript', + language, metadata: { file, startLine, @@ -765,20 +781,21 @@ export class TypeScriptScanner implements Scanner { const signature = typeAnnotation ? 
`export const ${name}: ${typeAnnotation}` : `export const ${name}`; + const language = this.detectLanguage(file); const text = this.buildEmbeddingText({ type: 'constant', name, signature, docComment, - language: 'typescript', + language, }); return { id: `${file}:${name}:${startLine}`, text, type: 'variable', - language: 'typescript', + language, metadata: { file, startLine,