diff --git a/.nvmrc b/.nvmrc new file mode 100644 index 0000000..3fe3b15 --- /dev/null +++ b/.nvmrc @@ -0,0 +1 @@ +24.13.0 diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index e98a6d0..236e55d 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -24,6 +24,7 @@ type CommandName = | 'purge-comments' | 'embed' | 'cluster' + | 'diff' | 'clusters' | 'cluster-detail' | 'search' @@ -51,6 +52,7 @@ function usage(devMode = false): string { ' close-cluster --id ', ' embed [--number ]', ' cluster [--k ] [--threshold ] [--heap-snapshot-dir ] [--heap-log-interval-ms ]', + ' diff ', ' clusters [--min-size ] [--limit ] [--sort recent|size] [--search ] [--include-closed]', ' cluster-detail --id [--member-limit ] [--body-chars ] [--include-closed]', ' search --query [--mode keyword|semantic|hybrid]', @@ -495,6 +497,12 @@ export async function run(argv: string[], stdout: NodeJS.WritableStream = proces heapDiagnostics?.dispose(); } } + case 'diff': { + const { owner, repo } = parseRepoFlags(rest); + const result = getService().diffClusters({ owner, repo }); + stdout.write(`${JSON.stringify(result, null, 2)}\n`); + return; + } case 'clusters': { const { owner, repo, values } = parseRepoFlags(rest); const sort = values.sort === 'recent' || values.sort === 'size' ? 
values.sort : undefined; diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 0fd8226..af73cb2 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -8,6 +8,7 @@ import { fileURLToPath } from 'node:url'; import blessed from 'neo-blessed'; import type { + ClusterDiffResponse, GHCrawlService, TuiClusterDetail, TuiClusterSortMode, @@ -175,6 +176,7 @@ export async function startTui(params: StartTuiParams): Promise { let activeJob: BackgroundRefreshJob | null = null; let modalOpen = false; let exitRequested = false; + let diffOverlay: ClusterDiffResponse | null = null; const clearCaches = (): void => { clusterDetailCache.clear(); @@ -390,7 +392,7 @@ export async function startTui(params: StartTuiParams): Promise { widgets.members.select(memberIndex); } - widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane)); + widgets.detail.setContent(diffOverlay ? renderDiffPane(diffOverlay) : renderDetailPane(threadDetail, clusterDetail, focusPane)); updatePaneStyles(widgets, focusPane); const activeJobs = [syncJobRunning ? 'sync' : null, embedJobRunning ? 'embed' : null, clusterJobRunning ? 'cluster' : null] .filter(Boolean) @@ -401,7 +403,7 @@ export async function startTui(params: StartTuiParams): Promise { footerLines.unshift(''); } footerLines.push( - `${status} | jobs:${activeJobs} | h/? help # jump g update p repos u author / filter s sort f min l layout x closed`, + `${status} | jobs:${activeJobs} | h/? 
help # jump d diff g update p repos u author / filter s sort f min l layout x closed`, ); footerLines.push( `Tab focus arrows move-or-scroll PgUp/PgDn page r refresh o open q quit`, @@ -1123,6 +1125,23 @@ export async function startTui(params: StartTuiParams): Promise { if (modalOpen) return; promptAuthorThreads(); }); + widgets.screen.key(['d'], () => { + if (modalOpen) return; + if (diffOverlay) { + diffOverlay = null; + render(); + return; + } + if (!currentRepository.owner || !currentRepository.repo) return; + try { + diffOverlay = params.service.diffClusters({ owner: currentRepository.owner, repo: currentRepository.repo }); + status = 'Showing cluster diff'; + } catch { + diffOverlay = null; + status = 'No diff data (run cluster twice to generate transitions)'; + } + render(); + }); widgets.screen.on('resize', () => render()); widgets.screen.on('destroy', () => { @@ -1215,6 +1234,55 @@ function updatePaneStyles(widgets: Widgets, focus: TuiFocusPane): void { widgets.detail.style.border = { fg: focus === 'detail' ? 
'white' : '#fde74c' }; } +export function renderDiffPane(diff: ClusterDiffResponse): string { + const s = diff.summary; + const total = s.continuing + s.growing + s.shrinking + s.splitting + s.merging + s.forming + s.dissolving; + const lines: string[] = [ + '{bold}Cluster Diff{/bold} (press d to close)', + `Run ${diff.fromRunId} -> ${diff.toRunId} | ${total} transitions`, + '', + '{bold}Summary{/bold}', + ` {green-fg}continuing{/green-fg} ${s.continuing}`, + ` {green-fg}growing{/green-fg} ${s.growing}`, + ` {yellow-fg}shrinking{/yellow-fg} ${s.shrinking}`, + ` {yellow-fg}splitting{/yellow-fg} ${s.splitting}`, + ` {cyan-fg}merging{/cyan-fg} ${s.merging}`, + ` {blue-fg}forming{/blue-fg} ${s.forming}`, + ` {red-fg}dissolving{/red-fg} ${s.dissolving}`, + '', + '{bold}Transitions{/bold}', + '', + ]; + + const colorTag = (t: string): string => { + switch (t) { + case 'continuing': case 'growing': return 'green-fg'; + case 'shrinking': case 'splitting': return 'yellow-fg'; + case 'merging': return 'cyan-fg'; + case 'forming': return 'blue-fg'; + case 'dissolving': return 'red-fg'; + default: return 'white-fg'; + } + }; + + for (const t of diff.transitions) { + const from = t.fromClusterId !== null ? `#${t.fromClusterId}` : '(new)'; + const to = t.toClusterId !== null ? `#${t.toClusterId}` : '(gone)'; + const jaccard = t.jaccardScore !== null ? `J=${(t.jaccardScore * 100).toFixed(0)}%` : ''; + const delta = t.membersAdded > 0 || t.membersRemoved > 0 + ? ` +${t.membersAdded}/-${t.membersRemoved} (${t.membersRetained} kept)` + : ` (${t.membersRetained} members)`; + const tag = colorTag(t.transition); + lines.push(` ${from} -> ${to} {${tag}}${t.transition}{/${tag}} ${jaccard}${delta}`); + } + + if (diff.transitions.length === 0) { + lines.push(' No transitions recorded. 
Run `ghcrawl cluster` twice to generate diff data.'); + } + + return lines.join('\n'); +} + export function renderDetailPane( threadDetail: TuiThreadDetail | null, clusterDetail: TuiClusterDetail | null, diff --git a/docs/screenshots/ghcrawl-clusters-real.png b/docs/screenshots/ghcrawl-clusters-real.png new file mode 100644 index 0000000..f823ec4 Binary files /dev/null and b/docs/screenshots/ghcrawl-clusters-real.png differ diff --git a/docs/screenshots/ghcrawl-diff-bug.png b/docs/screenshots/ghcrawl-diff-bug.png new file mode 100644 index 0000000..d4955b2 Binary files /dev/null and b/docs/screenshots/ghcrawl-diff-bug.png differ diff --git a/docs/screenshots/ghcrawl-diff-view.png b/docs/screenshots/ghcrawl-diff-view.png new file mode 100644 index 0000000..4ce05b3 Binary files /dev/null and b/docs/screenshots/ghcrawl-diff-view.png differ diff --git a/docs/screenshots/ghcrawl-perf-real.png b/docs/screenshots/ghcrawl-perf-real.png new file mode 100644 index 0000000..48c60e0 Binary files /dev/null and b/docs/screenshots/ghcrawl-perf-real.png differ diff --git a/docs/screenshots/ghcrawl-perf-tests.png b/docs/screenshots/ghcrawl-perf-tests.png new file mode 100644 index 0000000..0d2889f Binary files /dev/null and b/docs/screenshots/ghcrawl-perf-tests.png differ diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 1771e66..1f1f1a5 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -211,6 +211,44 @@ export const clusterResultSchema = z.object({ }); export type ClusterResultDto = z.infer; +export const transitionTypeSchema = z.enum(['continuing', 'growing', 'shrinking', 'splitting', 'merging', 'forming', 'dissolving']); +export type TransitionType = z.infer; + +export const clusterTransitionSchema = z.object({ + fromClusterId: z.number().int().nullable(), + toClusterId: z.number().int().nullable(), + transition: transitionTypeSchema, + jaccardScore: z.number().nullable(), + 
membersAdded: z.number().int().nonnegative(), + membersRemoved: z.number().int().nonnegative(), + membersRetained: z.number().int().nonnegative(), +}); +export type ClusterTransitionDto = z.infer; + +export const clusterDiffResponseSchema = z.object({ + repository: repositorySchema, + fromRunId: z.number().int().positive(), + toRunId: z.number().int().positive(), + transitions: z.array(clusterTransitionSchema), + summary: z.object({ + continuing: z.number().int().nonnegative(), + growing: z.number().int().nonnegative(), + shrinking: z.number().int().nonnegative(), + splitting: z.number().int().nonnegative(), + merging: z.number().int().nonnegative(), + forming: z.number().int().nonnegative(), + dissolving: z.number().int().nonnegative(), + }), +}); +export type ClusterDiffResponse = z.infer; + +export const diffResultSchema = z.object({ + fromRunId: z.number().int().positive(), + toRunId: z.number().int().positive(), + transitionCount: z.number().int().nonnegative(), +}); +export type DiffResultDto = z.infer; + export const refreshRequestSchema = z.object({ owner: z.string(), repo: z.string(), diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 79032c8..fd1952a 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -115,6 +115,12 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'GET' && url.pathname === '/diff') { + const params = parseRepoParams(url); + sendJson(res, 200, service.diffClusters(params)); + return; + } + if (req.method === 'GET' && url.pathname === '/cluster-summaries') { const params = parseRepoParams(url); const sortParam = url.searchParams.get('sort'); diff --git a/packages/api-core/src/cluster/lineage-perf.test.ts b/packages/api-core/src/cluster/lineage-perf.test.ts new file mode 100644 index 0000000..066e237 --- /dev/null +++ b/packages/api-core/src/cluster/lineage-perf.test.ts @@ -0,0 +1,123 @@ +import assert 
from 'node:assert/strict'; +import { performance } from 'node:perf_hooks'; +import test from 'node:test'; + +import { computeClusterTransitions, type ClusterSnapshot } from './lineage.js'; + +/** + * Generate synthetic cluster data at a given scale. + * + * Creates `clusterCount` clusters each with `membersPerCluster` members. + * The "new" run shares ~80% of members with the old run (simulating realistic + * churn between consecutive clustering runs). + */ +function generateScenario(clusterCount: number, membersPerCluster: number): { + oldClusters: ClusterSnapshot[]; + newClusters: ClusterSnapshot[]; + totalMembers: number; +} { + const oldClusters: ClusterSnapshot[] = []; + const newClusters: ClusterSnapshot[] = []; + let nextMemberId = 1; + + for (let i = 0; i < clusterCount; i++) { + const oldMembers = new Set(); + const newMembers = new Set(); + + // 80% overlap: shared members + const sharedCount = Math.floor(membersPerCluster * 0.8); + for (let j = 0; j < sharedCount; j++) { + const id = nextMemberId++; + oldMembers.add(id); + newMembers.add(id); + } + + // 20% churn: old-only and new-only members + const churnCount = membersPerCluster - sharedCount; + for (let j = 0; j < churnCount; j++) { + oldMembers.add(nextMemberId++); + } + for (let j = 0; j < churnCount; j++) { + newMembers.add(nextMemberId++); + } + + oldClusters.push({ clusterId: i + 1, members: oldMembers }); + newClusters.push({ clusterId: clusterCount + i + 1, members: newMembers }); + } + + return { oldClusters, newClusters, totalMembers: nextMemberId - 1 }; +} + +function median(values: number[]): number { + const sorted = [...values].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 === 0 ? 
(sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid]; +} + +const SCALES = [ + { clusters: 100, membersPerCluster: 8, label: '100 clusters (800 members)' }, + { clusters: 500, membersPerCluster: 8, label: '500 clusters (4,000 members)' }, + { clusters: 1000, membersPerCluster: 10, label: '1,000 clusters (10,000 members)' }, + { clusters: 2000, membersPerCluster: 10, label: '2,000 clusters (20,000 members)' }, +]; + +const WARMUP_RUNS = 2; +const BENCH_RUNS = 5; + +test('lineage performance at multiple scales', () => { + const results: Array<{ label: string; medianMs: number; transitionCount: number }> = []; + + for (const scale of SCALES) { + const { oldClusters, newClusters, totalMembers } = generateScenario( + scale.clusters, + scale.membersPerCluster, + ); + + // Warmup + for (let i = 0; i < WARMUP_RUNS; i++) { + computeClusterTransitions(oldClusters, newClusters); + } + + // Bench + const durations: number[] = []; + let lastResult: ReturnType | null = null; + for (let i = 0; i < BENCH_RUNS; i++) { + const start = performance.now(); + lastResult = computeClusterTransitions(oldClusters, newClusters); + durations.push(performance.now() - start); + } + + const med = median(durations); + results.push({ + label: scale.label, + medianMs: med, + transitionCount: lastResult?.length ?? 0, + }); + } + + // Print results table + console.log('\n=== Lineage Performance Benchmark ===\n'); + console.log('Scale | Median | Transitions'); + console.log('-----------------------------------|------------|------------'); + for (const r of results) { + const label = r.label.padEnd(35); + const ms = `${r.medianMs.toFixed(1)} ms`.padStart(10); + console.log(`${label}| ${ms} | ${r.transitionCount}`); + } + console.log(''); + + // ghcrawl/ghcrawl has ~17k issues. With typical cluster sizes of 8-15, + // that's roughly 1,100-2,100 clusters. Assert sub-second at 2,000 clusters. 
+ const largest = results[results.length - 1]; + assert.ok( + largest.medianMs < 1000, + `Expected <1s at ${largest.label}, got ${largest.medianMs.toFixed(1)}ms`, + ); + + // Assert sub-100ms at 500 clusters (the stated comfortable range) + const mid = results[1]; + assert.ok( + mid.medianMs < 100, + `Expected <100ms at ${mid.label}, got ${mid.medianMs.toFixed(1)}ms`, + ); +}); diff --git a/packages/api-core/src/cluster/lineage.test.ts b/packages/api-core/src/cluster/lineage.test.ts new file mode 100644 index 0000000..7ba2394 --- /dev/null +++ b/packages/api-core/src/cluster/lineage.test.ts @@ -0,0 +1,96 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; + +import { computeClusterTransitions, type ClusterSnapshot } from './lineage.js'; + +function snapshot(clusterId: number, members: number[]): ClusterSnapshot { + return { clusterId, members: new Set(members) }; +} + +test('identical clusters are continuing with jaccard 1.0', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2]), snapshot(2, [3, 4])], [snapshot(10, [1, 2]), snapshot(20, [3, 4])]); + assert.equal(transitions.length, 2); + for (const transition of transitions) { + assert.equal(transition.transition, 'continuing'); + assert.equal(transition.jaccardScore, 1); + } +}); + +test('cluster gains members is growing', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2])], [snapshot(10, [1, 2, 3])]); + assert.equal(transitions.length, 1); + assert.equal(transitions[0]?.transition, 'growing'); + assert.equal(transitions[0]?.membersAdded, 1); + assert.equal(transitions[0]?.membersRemoved, 0); +}); + +test('cluster loses members is shrinking', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2, 3])], [snapshot(10, [1, 2])]); + assert.equal(transitions.length, 1); + assert.equal(transitions[0]?.transition, 'shrinking'); + assert.equal(transitions[0]?.membersAdded, 0); + assert.equal(transitions[0]?.membersRemoved, 1); 
+}); + +test('one cluster splitting into two', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2, 3, 4, 5])], [snapshot(10, [1, 2]), snapshot(20, [3, 4])]); + assert.equal(transitions.length, 3); + const split = transitions.find((transition) => transition.transition === 'splitting'); + assert.ok(split); + assert.equal(split.fromClusterId, 1); + assert.equal(split.toClusterId, null); +}); + +test('two clusters merging into one', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2]), snapshot(2, [3, 4])], [snapshot(10, [1, 2, 3, 4, 5, 6])]); + assert.equal(transitions.length, 3); + const merge = transitions.find((transition) => transition.transition === 'merging'); + assert.ok(merge); + assert.equal(merge.fromClusterId, null); + assert.equal(merge.toClusterId, 10); +}); + +test('brand new cluster is forming', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2])], [snapshot(10, [1, 2]), snapshot(20, [3, 4])]); + const forming = transitions.find((transition) => transition.transition === 'forming'); + assert.ok(forming); + assert.equal(forming.toClusterId, 20); +}); + +test('cluster disappearing is dissolving', () => { + const transitions = computeClusterTransitions([snapshot(1, [1, 2])], []); + assert.equal(transitions.length, 1); + assert.equal(transitions[0]?.transition, 'dissolving'); + assert.equal(transitions[0]?.fromClusterId, 1); +}); + +test('mixed scenario includes all seven transition types', () => { + const oldClusters = [ + snapshot(1, [1, 2]), + snapshot(2, [3, 4]), + snapshot(3, [6, 7, 8]), + snapshot(4, [9, 10, 11, 12, 13]), + snapshot(5, [14, 15]), + snapshot(6, [16, 17]), + snapshot(7, [18, 19]), + ]; + const newClusters = [ + snapshot(101, [1, 2]), + snapshot(102, [3, 4, 5]), + snapshot(103, [6, 7]), + snapshot(104, [9, 10]), + snapshot(105, [11, 12]), + snapshot(106, [16, 17, 18, 19, 20, 21]), + snapshot(107, [22, 23]), + ]; + + const transitions = 
computeClusterTransitions(oldClusters, newClusters); + const types = new Set(transitions.map((transition) => transition.transition)); + assert.deepEqual(types, new Set(['continuing', 'growing', 'shrinking', 'splitting', 'merging', 'forming', 'dissolving'])); +}); + +test('first run with empty old clusters is all forming', () => { + const transitions = computeClusterTransitions([], [snapshot(1, [1, 2]), snapshot(2, [3])]); + assert.equal(transitions.length, 2); + assert.ok(transitions.every((transition) => transition.transition === 'forming')); +}); diff --git a/packages/api-core/src/cluster/lineage.ts b/packages/api-core/src/cluster/lineage.ts new file mode 100644 index 0000000..e9223b5 --- /dev/null +++ b/packages/api-core/src/cluster/lineage.ts @@ -0,0 +1,161 @@ +export type ClusterSnapshot = { + clusterId: number; + members: Set; +}; + +export type TransitionType = + | 'continuing' + | 'growing' + | 'shrinking' + | 'splitting' + | 'merging' + | 'forming' + | 'dissolving'; + +export type ClusterTransition = { + fromClusterId: number | null; + toClusterId: number | null; + transition: TransitionType; + jaccardScore: number | null; + membersAdded: number; + membersRemoved: number; + membersRetained: number; +}; + +type PairScore = { + oldClusterId: number; + newClusterId: number; + intersection: number; + jaccard: number; +}; + +export function computeClusterTransitions( + oldClusters: ClusterSnapshot[], + newClusters: ClusterSnapshot[], +): ClusterTransition[] { + const oldById = new Map(oldClusters.map((cluster) => [cluster.clusterId, cluster])); + const newById = new Map(newClusters.map((cluster) => [cluster.clusterId, cluster])); + + const threadToOld = new Map(); + for (const cluster of oldClusters) { + for (const member of cluster.members) { + threadToOld.set(member, cluster.clusterId); + } + } + + const threadToNew = new Map(); + for (const cluster of newClusters) { + for (const member of cluster.members) { + threadToNew.set(member, cluster.clusterId); + } + 
} + + const intersections = new Map>(); + for (const [threadId, oldClusterId] of threadToOld.entries()) { + const newClusterId = threadToNew.get(threadId); + if (newClusterId === undefined) continue; + const byNew = intersections.get(oldClusterId) ?? new Map(); + byNew.set(newClusterId, (byNew.get(newClusterId) ?? 0) + 1); + intersections.set(oldClusterId, byNew); + } + + const pairs: PairScore[] = []; + for (const [oldClusterId, byNew] of intersections.entries()) { + const oldCluster = oldById.get(oldClusterId); + if (!oldCluster) continue; + for (const [newClusterId, intersection] of byNew.entries()) { + const newCluster = newById.get(newClusterId); + if (!newCluster) continue; + const denominator = oldCluster.members.size + newCluster.members.size - intersection; + const jaccard = denominator === 0 ? 0 : intersection / denominator; + pairs.push({ oldClusterId, newClusterId, intersection, jaccard }); + } + } + + pairs.sort((left, right) => { + if (right.jaccard !== left.jaccard) return right.jaccard - left.jaccard; + if (right.intersection !== left.intersection) return right.intersection - left.intersection; + if (left.oldClusterId !== right.oldClusterId) return left.oldClusterId - right.oldClusterId; + return left.newClusterId - right.newClusterId; + }); + + const matchedOld = new Set(); + const matchedNew = new Set(); + const transitions: ClusterTransition[] = []; + + for (const pair of pairs) { + if (pair.jaccard < 0.5) break; + if (matchedOld.has(pair.oldClusterId) || matchedNew.has(pair.newClusterId)) continue; + + const oldCluster = oldById.get(pair.oldClusterId); + const newCluster = newById.get(pair.newClusterId); + if (!oldCluster || !newCluster) continue; + + const oldSize = oldCluster.members.size; + const newSize = newCluster.members.size; + const membersRetained = pair.intersection; + const membersAdded = newSize - membersRetained; + const membersRemoved = oldSize - membersRetained; + const transition: TransitionType = + newSize === oldSize ? 
'continuing' : newSize > oldSize ? 'growing' : 'shrinking'; + + transitions.push({ + fromClusterId: oldCluster.clusterId, + toClusterId: newCluster.clusterId, + transition, + jaccardScore: pair.jaccard, + membersAdded, + membersRemoved, + membersRetained, + }); + + matchedOld.add(pair.oldClusterId); + matchedNew.add(pair.newClusterId); + } + + for (const oldCluster of oldClusters) { + if (matchedOld.has(oldCluster.clusterId)) continue; + const destinations = new Map(); + let membersRetained = 0; + for (const member of oldCluster.members) { + const destinationClusterId = threadToNew.get(member); + if (destinationClusterId === undefined) continue; + membersRetained += 1; + destinations.set(destinationClusterId, (destinations.get(destinationClusterId) ?? 0) + 1); + } + const splitTargets = Array.from(destinations.values()).filter((count) => count >= 2).length; + transitions.push({ + fromClusterId: oldCluster.clusterId, + toClusterId: null, + transition: splitTargets >= 2 ? 'splitting' : 'dissolving', + jaccardScore: null, + membersAdded: 0, + membersRemoved: oldCluster.members.size - membersRetained, + membersRetained, + }); + } + + for (const newCluster of newClusters) { + if (matchedNew.has(newCluster.clusterId)) continue; + const origins = new Map(); + let membersRetained = 0; + for (const member of newCluster.members) { + const originClusterId = threadToOld.get(member); + if (originClusterId === undefined) continue; + membersRetained += 1; + origins.set(originClusterId, (origins.get(originClusterId) ?? 0) + 1); + } + const mergeSources = Array.from(origins.values()).filter((count) => count >= 2).length; + transitions.push({ + fromClusterId: null, + toClusterId: newCluster.clusterId, + transition: mergeSources >= 2 ? 
'merging' : 'forming', + jaccardScore: null, + membersAdded: newCluster.members.size - membersRetained, + membersRemoved: 0, + membersRetained, + }); + } + + return transitions; +} diff --git a/packages/api-core/src/db/migrate.ts b/packages/api-core/src/db/migrate.ts index 7ec4059..d862eb7 100644 --- a/packages/api-core/src/db/migrate.ts +++ b/packages/api-core/src/db/migrate.ts @@ -214,6 +214,22 @@ const migrationStatements = [ created_at text not null, primary key (cluster_id, thread_id) ) + `, + ` + create table if not exists cluster_transitions ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + from_run_id integer not null, + to_run_id integer not null references cluster_runs(id) on delete cascade, + from_cluster_id integer, + to_cluster_id integer, + transition text not null, + jaccard_score real, + members_added integer not null default 0, + members_removed integer not null default 0, + members_retained integer not null default 0, + created_at text not null + ) ` ]; @@ -254,4 +270,5 @@ export function migrate(db: SqliteDatabase): void { db.exec('create index if not exists idx_cluster_runs_repo_status_id on cluster_runs(repo_id, status, id)'); db.exec('create index if not exists idx_clusters_repo_run_id on clusters(repo_id, cluster_run_id, id)'); db.exec('create index if not exists idx_cluster_members_thread_cluster on cluster_members(thread_id, cluster_id)'); + db.exec('create index if not exists idx_cluster_transitions_repo_run on cluster_transitions(repo_id, to_run_id)'); } diff --git a/packages/api-core/src/index.ts b/packages/api-core/src/index.ts index 15471e0..8a64c1b 100644 --- a/packages/api-core/src/index.ts +++ b/packages/api-core/src/index.ts @@ -3,4 +3,6 @@ export * from './config.js'; export * from './documents/normalize.js'; export * from './search/exact.js'; export * from './cluster/build.js'; +export * from './cluster/lineage.js'; +export type {
ClusterDiffResponse } from '@ghcrawl/api-contract'; export * from './service.js'; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 95f7680..51a6a32 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -10,10 +10,12 @@ import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, + clusterDiffResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, + diffResultSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, @@ -26,11 +28,13 @@ import { type ActionResponse, type AuthorThreadsResponse, type CloseResponse, + type ClusterDiffResponse, type ClusterDetailResponse, type ClusterDto, type ClusterResultDto, type ClusterSummariesResponse, type ClustersResponse, + type DiffResultDto, type EmbedResultDto, type HealthResponse, type NeighborsResponse, @@ -42,11 +46,13 @@ import { type SearchResponse, type SyncResultDto, type ThreadDto, + type TransitionType, type ThreadsResponse, } from '@ghcrawl/api-contract'; import { buildClusters } from './cluster/build.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; +import { computeClusterTransitions, type ClusterSnapshot } from './cluster/lineage.js'; import { ensureRuntimeDirs, isLikelyGitHubToken, @@ -1119,7 +1125,24 @@ export class GHCrawlService { items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges, ); + const previousRun = this.db + .prepare("select id from cluster_runs where repo_id = ? 
and status = 'completed' order by id desc limit 1") + .get(repository.id) as { id: number } | undefined; this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); + + if (previousRun) { + const oldSnapshots = this.loadClusterSnapshotsForRun(previousRun.id); + const newSnapshots = this.loadClusterSnapshotsForRun(runId); + const transitions = computeClusterTransitions(oldSnapshots, newSnapshots); + const diffResult: DiffResultDto = diffResultSchema.parse({ + fromRunId: previousRun.id, + toRunId: runId, + transitionCount: transitions.length, + }); + this.persistClusterTransitions(repository.id, diffResult, transitions); + params.onProgress?.(`[cluster] computed ${transitions.length} transition(s)`); + } + this.pruneOldClusterRuns(repository.id, runId); params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`); @@ -1381,6 +1404,58 @@ export class GHCrawlService { }); } + diffClusters(params: { owner: string; repo: string }): ClusterDiffResponse { + const repository = this.requireRepository(params.owner, params.repo); + const latestRun = this.db + .prepare("select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(repository.id) as { id: number } | undefined; + if (!latestRun) throw new Error('No completed cluster runs found'); + + const rows = this.db + .prepare('select * from cluster_transitions where repo_id = ? and to_run_id = ? order by transition, id') + .all(repository.id, latestRun.id) as Array<{ + from_cluster_id: number | null; + to_cluster_id: number | null; + transition: string; + jaccard_score: number | null; + members_added: number; + members_removed: number; + members_retained: number; + from_run_id: number; + }>; + + const fromRunId = rows.length > 0 ? 
rows[0].from_run_id : latestRun.id; + const transitions = rows.map((row) => ({ + fromClusterId: row.from_cluster_id, + toClusterId: row.to_cluster_id, + transition: row.transition as TransitionType, + jaccardScore: row.jaccard_score, + membersAdded: row.members_added, + membersRemoved: row.members_removed, + membersRetained: row.members_retained, + })); + + const summary = { continuing: 0, growing: 0, shrinking: 0, splitting: 0, merging: 0, forming: 0, dissolving: 0 }; + for (const transition of transitions) { + summary[transition.transition as keyof typeof summary] += 1; + } + + return clusterDiffResponseSchema.parse({ + repository: { + id: repository.id, + owner: repository.owner, + name: repository.name, + fullName: repository.fullName, + githubRepoId: repository.githubRepoId ?? null, + updatedAt: repository.updatedAt, + }, + fromRunId, + toRunId: latestRun.id, + transitions, + summary, + }); + } + async refreshRepository(params: { owner: string; repo: string; @@ -3250,7 +3325,83 @@ export class GHCrawlService { })(); } + private loadClusterSnapshotsForRun(clusterRunId: number): ClusterSnapshot[] { + const rows = this.db + .prepare( + `select c.id as cluster_id, cm.thread_id + from clusters c + join cluster_members cm on cm.cluster_id = c.id + where c.cluster_run_id = ? + order by c.id asc, cm.thread_id asc`, + ) + .all(clusterRunId) as Array<{ cluster_id: number; thread_id: number }>; + + const byClusterId = new Map>(); + for (const row of rows) { + const members = byClusterId.get(row.cluster_id) ?? 
new Set(); + members.add(row.thread_id); + byClusterId.set(row.cluster_id, members); + } + + return Array.from(byClusterId.entries()).map(([clusterId, members]) => ({ clusterId, members })); + } + + private persistClusterTransitions( + repoId: number, + diffResult: DiffResultDto, + transitions: Array<{ + fromClusterId: number | null; + toClusterId: number | null; + transition: string; + jaccardScore: number | null; + membersAdded: number; + membersRemoved: number; + membersRetained: number; + }>, + ): void { + const insertTransition = this.db.prepare( + `insert into cluster_transitions ( + repo_id, + from_run_id, + to_run_id, + from_cluster_id, + to_cluster_id, + transition, + jaccard_score, + members_added, + members_removed, + members_retained, + created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + + this.db.transaction(() => { + this.db.prepare('delete from cluster_transitions where repo_id = ? and to_run_id = ?').run(repoId, diffResult.toRunId); + const createdAt = nowIso(); + for (const transition of transitions) { + insertTransition.run( + repoId, + diffResult.fromRunId, + diffResult.toRunId, + transition.fromClusterId, + transition.toClusterId, + transition.transition, + transition.jaccardScore, + transition.membersAdded, + transition.membersRemoved, + transition.membersRetained, + createdAt, + ); + } + })(); + } + private pruneOldClusterRuns(repoId: number, keepRunId: number): void { + // Keep transitions pointing TO the current run (they record what changed + // between the previous run and keepRunId). Only delete old-to-old transitions. + this.db + .prepare('delete from cluster_transitions where from_run_id in (select id from cluster_runs where repo_id = ? and id <> ?) and to_run_id <> ?') + .run(repoId, keepRunId, keepRunId); this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId); }