
Commit dbcdc75

Add similarity detection
1 parent 05421d4 commit dbcdc75

File tree: 7 files changed, +624 −2 lines changed

packages/toolkit/scripts/issue-triage/src/index.ts

Lines changed: 82 additions & 2 deletions

@@ -8,6 +8,7 @@ import { writeFile } from 'fs/promises'
 import { GitHubClient, checkGhCli } from './github/gh-client.js'
 import { GhCliError, GhApiError, GhParseError } from './utils/errors.js'
 import { categorizeIssues, CATEGORIES } from './categorize/index.js'
+import { findAllDuplicates, createWorkClusters } from './similarity/index.js'
 
 async function main() {
   console.log('GitHub Issues Triage Tool v1.0.0')
@@ -89,7 +90,20 @@ async function main() {
   console.log(` ✅ Easy Fix: ${easyFixCount}`)
   console.log(` 🏷️ Needs Triage: ${needsTriageCount}`)
 
-  // Step 7: Display sample categorized issues
+  // Step 7: Run similarity detection
+  console.log('\n🔍 Running Similarity Detection')
+  console.log('================================')
+
+  const startTime = Date.now()
+  const duplicateGroups = findAllDuplicates(issues)
+  const workClusters = createWorkClusters(issues)
+  const endTime = Date.now()
+
+  console.log(`✓ Similarity detection completed in ${endTime - startTime}ms`)
+  console.log(` Found ${duplicateGroups.length} potential duplicate groups`)
+  console.log(` Created ${workClusters.length} work clusters`)
+
+  // Step 8: Display sample categorized issues
   if (issues.length > 0) {
     console.log('\n📝 Sample Categorized Issues')
     console.log('============================\n')
@@ -127,7 +141,50 @@ async function main() {
     }
   }
 
-  // Step 8: Export categorization results to JSON
+  // Step 9: Display sample similarity results
+  if (duplicateGroups.length > 0) {
+    console.log('\n🔄 Sample Duplicate Groups')
+    console.log('==========================\n')
+
+    const samplesToShow = Math.min(3, duplicateGroups.length)
+    for (let i = 0; i < samplesToShow; i++) {
+      const group = duplicateGroups[i]
+      console.log(
+        `Primary Issue #${group.primary.number}: ${group.primary.title}`,
+      )
+      console.log(` Potential duplicates: ${group.duplicates.length}`)
+      for (const dup of group.duplicates.slice(0, 2)) {
+        console.log(` - #${dup.issue.number}: ${dup.issue.title}`)
+        console.log(` Confidence: ${(dup.confidence * 100).toFixed(0)}%`)
+      }
+      console.log('')
+    }
+  }
+
+  if (workClusters.length > 0) {
+    console.log('\n📦 Sample Work Clusters')
+    console.log('=======================\n')
+
+    const samplesToShow = Math.min(3, workClusters.length)
+    for (let i = 0; i < samplesToShow; i++) {
+      const cluster = workClusters[i]
+      console.log(
+        `Cluster ${cluster.id}: ${cluster.category}${cluster.subcategory ? `/${cluster.subcategory}` : ''}`,
+      )
+      console.log(` Issues: ${cluster.issues.length}`)
+      console.log(` Priority: ${cluster.priority.toFixed(1)}`)
+      console.log(
+        ` Avg Complexity: ${cluster.metrics.avgComplexity.toFixed(0)}`,
+      )
+      console.log(
+        ` Estimated Effort: ${cluster.metrics.estimatedEffort} days`,
+      )
+      console.log(` Reasoning: ${cluster.reasoning}`)
+      console.log('')
+    }
+  }
+
+  // Step 10: Export categorization results to JSON
   console.log('\n💾 Exporting categorization results...')
   const outputPath = 'cache/categorization-results.json'
 
@@ -147,6 +204,29 @@ async function main() {
         needsTriage: needsTriageCount,
       },
     },
+    similarity: {
+      duplicateGroups: duplicateGroups.map((group) => ({
+        primary: {
+          number: group.primary.number,
+          title: group.primary.title,
+        },
+        duplicates: group.duplicates.map((dup) => ({
+          number: dup.issue.number,
+          title: dup.issue.title,
+          confidence: dup.confidence,
+          signals: dup.signals,
+        })),
+      })),
+      workClusters: workClusters.map((cluster) => ({
+        id: cluster.id,
+        category: cluster.category,
+        subcategory: cluster.subcategory,
+        issueNumbers: cluster.issues.map((i) => i.number),
+        metrics: cluster.metrics,
+        reasoning: cluster.reasoning,
+        priority: cluster.priority,
+      })),
+    },
     items: categorizedItems.map((item) => ({
       number: item.number,
       title: item.title,
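
The exporter above reads fields off DuplicateGroup and WorkCluster values, but those types come from the similarity module's ./types.js, which is one of the 7 changed files and is not shown in this excerpt. Below is a minimal sketch of shapes consistent with how the fields are used in this diff; the names and optionality are inferred from usage, not copied from the actual file.

// Hypothetical reconstruction inferred from usage in this commit — not the real ./types.js
import type { CategorizedIssue } from '../categorize/types.js'

export interface DuplicateGroup {
  primary: CategorizedIssue
  duplicates: Array<{
    issue: CategorizedIssue
    confidence: number // numeric similarity score, printed above as a percentage
    signals: unknown // evidence behind the match; exact shape not shown in this excerpt
  }>
}

export interface WorkCluster {
  id: string
  category: string
  subcategory?: string
  issues: CategorizedIssue[]
  metrics: {
    avgComplexity: number
    totalEngagement: number
    avgUrgency: number
    estimatedEffort: number // in days
  }
  reasoning: string
  priority: number
}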
Lines changed: 124 additions & 0 deletions (new file)

import type { CategorizedIssue } from '../categorize/types.js'
import type { WorkCluster } from './types.js'
import { groupBy } from './utils.js'

/**
 * Create work clusters from categorized issues
 * Groups related issues that could be worked on together
 */
export function createWorkClusters(issues: CategorizedIssue[]): WorkCluster[] {
  // Group by subcategory
  const bySubcategory = groupBy(
    issues,
    (issue) =>
      `${issue.categorization.primary}/${issue.categorization.secondary || 'other'}`,
  )

  const clusters: WorkCluster[] = []

  for (const [category, groupIssues] of Object.entries(bySubcategory)) {
    if (groupIssues.length < 2) continue

    // Sort by priority (urgency × engagement)
    const sorted = groupIssues
      .map((issue) => ({
        issue,
        priority: issue.scores.urgency * issue.scores.engagement,
      }))
      .sort((a, b) => b.priority - a.priority)

    // Create clusters of 3-5 issues
    for (let i = 0; i < sorted.length; i += 4) {
      const clusterIssues = sorted.slice(i, i + 5).map((s) => s.issue)
      if (clusterIssues.length < 2) continue

      const metrics = calculateClusterMetrics(clusterIssues)

      // Skip clusters that are too easy or too hard
      if (metrics.avgComplexity < 30 || metrics.avgComplexity > 80) continue

      const [primary, secondary] = category.split('/')

      const cluster: WorkCluster = {
        id: `cluster-${clusters.length + 1}`,
        category: primary,
        subcategory: secondary !== 'other' ? secondary : undefined,
        issues: clusterIssues,
        metrics,
        reasoning: generateClusterReasoning(clusterIssues, category),
        priority: calculateClusterPriority(metrics),
      }

      clusters.push(cluster)
    }
  }

  return clusters.sort((a, b) => b.priority - a.priority).slice(0, 10) // Top 10 clusters
}

/**
 * Calculate aggregate metrics for a cluster of issues
 */
function calculateClusterMetrics(issues: CategorizedIssue[]) {
  const complexities = issues.map((i) => i.scores.complexity)
  const engagements = issues.map((i) => i.scores.engagement)
  const urgencies = issues.map((i) => i.scores.urgency)

  return {
    avgComplexity: average(complexities),
    totalEngagement: sum(engagements),
    avgUrgency: average(urgencies),
    estimatedEffort: estimateEffort(average(complexities), issues.length),
  }
}

/**
 * Generate reasoning text for why issues are clustered together
 */
function generateClusterReasoning(
  issues: CategorizedIssue[],
  category: string,
): string {
  const [primary, secondary] = category.split('/')
  return `All issues in ${primary}${secondary !== 'other' ? `/${secondary}` : ''} category with related functionality`
}

/**
 * Calculate priority score for a cluster
 * Higher scores indicate more important clusters to work on
 */
function calculateClusterPriority(metrics: {
  totalEngagement: number
  avgComplexity: number
  avgUrgency: number
}): number {
  return (
    (metrics.totalEngagement / 100) * 0.4 +
    (100 - Math.abs(metrics.avgComplexity - 55)) * 0.3 +
    metrics.avgUrgency * 0.2 +
    10 * 0.1
  )
}

/**
 * Calculate average of an array of numbers
 */
function average(numbers: number[]): number {
  return numbers.reduce((a, b) => a + b, 0) / numbers.length
}

/**
 * Calculate sum of an array of numbers
 */
function sum(numbers: number[]): number {
  return numbers.reduce((a, b) => a + b, 0)
}

/**
 * Estimate effort in days based on complexity and issue count
 */
function estimateEffort(avgComplexity: number, issueCount: number): number {
  // Rough estimate: 50 complexity = 1 day
  const baseEffort = avgComplexity / 50
  return Math.round(baseEffort * issueCount * 10) / 10
}
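
createWorkClusters relies on a groupBy helper from ./utils.js that is not shown in this excerpt; from the Object.entries() call it evidently returns a plain object keyed by the grouping string. A minimal sketch under that assumption — the real ./utils.js may differ:

// Sketch of the assumed groupBy helper (assumption, not the actual ./utils.js)
export function groupBy<T>(
  items: T[],
  keyFn: (item: T) => string,
): Record<string, T[]> {
  const groups: Record<string, T[]> = {}
  for (const item of items) {
    const key = keyFn(item)
    if (!groups[key]) groups[key] = []
    groups[key].push(item)
  }
  return groups
}

For a feel of the priority formula above: a cluster with totalEngagement 150, avgComplexity 55, and avgUrgency 60 scores (150 / 100) * 0.4 + (100 − 0) * 0.3 + 60 * 0.2 + 10 * 0.1 = 0.6 + 30 + 12 + 1 = 43.6, so the complexity term, which peaks when avgComplexity sits at 55, contributes the most in this example.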
Lines changed: 57 additions & 0 deletions (new file)

import type { CategorizedIssue } from '../categorize/types.js'
import type { DuplicateGroup, SimilarityResult } from './types.js'
import { calculateSimilarity } from './similarity.js'

/**
 * Find potential duplicates for all issues
 * Returns groups where each group has a primary issue and its duplicates
 */
export function findAllDuplicates(
  issues: CategorizedIssue[],
): DuplicateGroup[] {
  const duplicateGroups: DuplicateGroup[] = []
  const processed = new Set<number>()

  for (const issue of issues) {
    if (processed.has(issue.number)) continue

    const duplicates = findPotentialDuplicates(issue, issues).filter(
      (result) => result.confidence !== 'low',
    )

    if (duplicates.length > 0) {
      duplicateGroups.push({
        primary: issue,
        duplicates: duplicates.map((result) => ({
          issue: issues.find((i) => i.number === result.issue2)!,
          confidence: result.score,
          signals: result.signals,
        })),
      })

      // Mark all as processed
      processed.add(issue.number)
      duplicates.forEach((d) => processed.add(d.issue2))
    }
  }

  return duplicateGroups.sort(
    (a, b) => b.duplicates.length - a.duplicates.length,
  )
}

/**
 * Find potential duplicates for a single issue
 * Returns up to 5 most similar issues with medium or high confidence
 */
export function findPotentialDuplicates(
  issue: CategorizedIssue,
  allIssues: CategorizedIssue[],
): SimilarityResult[] {
  return allIssues
    .filter((other) => other.number !== issue.number)
    .map((other) => calculateSimilarity(issue, other))
    .filter((result) => result.score >= 0.6) // Medium confidence threshold
    .sort((a, b) => b.score - a.score)
    .slice(0, 5) // Top 5 potential duplicates
}
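
The SimilarityResult values consumed here are produced by calculateSimilarity in ./similarity.js, which is part of this commit but not shown in this excerpt. A minimal sketch of the result shape implied by the code above — issue2 is read as an issue number, score is compared against 0.6, confidence is compared against 'low', and signals is passed through to the export; field names come from that usage, everything else is assumption:

// Hypothetical shape inferred from how results are consumed above — not the real ./types.js
export interface SimilarityResult {
  issue2: number // number of the compared issue, used to look the issue back up
  score: number // 0–1 similarity score; >= 0.6 is treated as at least medium confidence
  confidence: 'low' | 'medium' | 'high'
  signals: unknown // evidence behind the score; exact shape not shown in this excerpt
}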
Lines changed: 5 additions & 0 deletions (new file)

export * from './types.js'
export * from './similarity.js'
export * from './duplicates.js'
export * from './clusters.js'
export * from './utils.js'
