Skip to content

Commit 05421d4

Browse files
committed
Add separate codegen category
1 parent 1ce2fd1 commit 05421d4

File tree

2 files changed

+409
-25
lines changed

2 files changed

+409
-25
lines changed

packages/toolkit/scripts/issue-triage/src/categorize/categorizer.ts

Lines changed: 136 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,77 @@ function normalizeText(text: string): string {
1313
return text.toLowerCase().trim()
1414
}
1515

16+
/**
 * Remove stack-trace lines from text so that identifiers which only appear in
 * stack frames or file locations do not produce false keyword matches.
 *
 * A line is dropped when, after trimming, it either
 *  - looks like a stack frame: it starts with "at " and the remainder is
 *    `<anonymous>`, ends with a parenthesised location, or ends with a
 *    `:line` / `:line:col` suffix; or
 *  - mentions `node_modules`, or names a JS/TS source file followed by a line
 *    number (e.g. `index.js:10`, `store.ts:4:2`).
 *
 * Ordinary prose that merely begins with "at " (for example
 * "at this point the cache is stale") is kept, as are sentences such as
 * "next.js: build fails" where the colon is punctuation rather than a
 * file location.
 *
 * @param text - Text to clean; it is split on "\n".
 * @returns The surviving lines, unmodified, joined with "\n".
 */
function removeStackTraces(text: string): string {
  // `file.js:12`, `file.tsx:12:3`, ... - the digit after the colon is what
  // distinguishes a source location from punctuation in a sentence.
  const sourceLocation = /node_modules|\.(?:[cm]?js|jsx|[cm]?ts|tsx):\d+/
  // Trailing `:line` or `:line:col`, as in "at /app/src/index.js:10:5".
  const lineColumnSuffix = /:\d+(?::\d+)?$/

  const isStackFrame = (trimmed: string): boolean => {
    if (!trimmed.startsWith('at ')) return false
    const rest = trimmed.slice(3).trim()
    if (rest === '<anonymous>') return true
    // "at fn (location)" - covers V8 frames such as "at foo (file.js:1:2)"
    // and "at Array.map (<anonymous>)".
    if (rest.endsWith(')') && rest.includes('(')) return true
    return lineColumnSuffix.test(rest)
  }

  return text
    .split('\n')
    .filter((line) => {
      const trimmed = line.trim()
      return !isStackFrame(trimmed) && !sourceLocation.test(trimmed)
    })
    .join('\n')
}
32+
33+
/**
 * Detect high-value phrases in the text and score each category by how many
 * of its phrases occur.
 *
 * Matching is case-insensitive (the text is lower-cased) and anchored on word
 * boundaries, so a phrase is not reported when it is only a fragment of an
 * unrelated word: "vite" does not match "invite" or "vitest". A small set of
 * trailing inflections ("s", "es", "ed", "ing") is tolerated so that
 * "bundlers", "breaking changes" or "npm publishing" still count.
 *
 * Each distinct phrase contributes a fixed score of 3.0 at most once,
 * regardless of how many times it appears in the text.
 *
 * @param text - Text to scan (any casing).
 * @returns Map from category name to its summed phrase score; categories with
 *   no matching phrase are omitted.
 */
function detectPhrases(text: string): Map<string, number> {
  // Score added for every distinct phrase found.
  const phraseScore = 3.0

  // High-value phrases that should boost category scores (all lower-case).
  const phrasePatterns: Record<string, readonly string[]> = {
    'rtk-query': [
      'rtk query',
      'rtk-query',
      'createapi',
      'fetch base query',
      'optimistic update',
      'cache invalidation',
    ],
    codegen: [
      'code generation',
      'openapi',
      'swagger',
      'output files',
      'codegen',
      'rtk-query codegen',
      '@rtk-query/codegen-openapi',
      'generate api',
      'generate endpoints',
    ],
    'build-tooling': ['webpack', 'vite', 'rollup', 'bundler'],
    publishing: [
      'npm publish',
      'trusted publishing',
      'provenance',
      'package registry',
    ],
    migration: ['migration guide', 'breaking change', 'upgrade path'],
    architecture: ['api design', 'architecture decision', 'design pattern'],
  }

  const escapeRegExp = (value: string): string =>
    value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  // Leading guard: the phrase must not continue a preceding word or number.
  // Trailing guard: after an optional inflection, no further letter may
  // follow (digits are allowed, e.g. "webpack5").
  const toMatcher = (phrase: string): RegExp =>
    new RegExp(
      `(?<![a-z0-9])${escapeRegExp(phrase)}(?:s|es|ed|ing)?(?![a-z])`,
    )

  const lowerText = text.toLowerCase()
  const phrases = new Map<string, number>()

  for (const [category, categoryPhrases] of Object.entries(phrasePatterns)) {
    let score = 0
    for (const phrase of categoryPhrases) {
      if (toMatcher(phrase).test(lowerText)) {
        score += phraseScore
      }
    }
    if (score > 0) {
      phrases.set(category, score)
    }
  }

  return phrases
}
86+
1687
/**
1788
* Check if issue has any matching labels (Tier 1)
1889
*/
@@ -45,30 +116,69 @@ function checkLabels(issue: Issue): Categorization | null {
45116

46117
/**
47118
* Calculate keyword scores for each category (Tier 2)
119+
* Implements context-aware matching with title prioritization
48120
*/
49121
function calculateKeywordScores(issue: Issue): Map<string, number> {
50122
const scores = new Map<string, number>()
51-
const text = normalizeText(`${issue.title} ${issue.body}`)
123+
const title = normalizeText(issue.title)
124+
const body = normalizeText(issue.body || '')
125+
126+
// Remove stack traces from body for matching
127+
const bodyWithoutStackTraces = removeStackTraces(body)
128+
129+
// Add phrase detection before keyword matching
130+
const phraseScores = detectPhrases(`${title} ${bodyWithoutStackTraces}`)
131+
for (const [category, phraseScore] of phraseScores) {
132+
const currentScore = scores.get(category) || 0
133+
scores.set(category, currentScore + phraseScore)
134+
}
52135

53136
for (const category of CATEGORIES) {
54137
if (category.name === 'uncategorized') continue
55138

56139
let score = 0
57140

58-
// Check keywords
141+
// Check keywords with title prioritization (2x weight for title)
59142
for (const keyword of category.keywords) {
60-
if (text.includes(normalizeText(keyword))) {
61-
score += category.weight
62-
}
143+
const normalizedKeyword = normalizeText(keyword)
144+
145+
// Title matches (2x weight)
146+
const titleMatches = (
147+
title.match(
148+
new RegExp(
149+
normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
150+
'gi',
151+
),
152+
) || []
153+
).length
154+
score += titleMatches * category.weight * 2.0
155+
156+
// Body matches (1x weight, excluding stack traces)
157+
const bodyMatches = (
158+
bodyWithoutStackTraces.match(
159+
new RegExp(
160+
normalizedKeyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
161+
'gi',
162+
),
163+
) || []
164+
).length
165+
score += bodyMatches * category.weight
63166
}
64167

65-
// Check patterns (weighted higher)
168+
// Check patterns (1.5x weight for title, 1x for body)
66169
for (const pattern of category.patterns) {
67-
if (pattern.test(issue.title) || pattern.test(issue.body)) {
170+
if (pattern.test(title)) {
68171
score += category.weight * 1.5
172+
} else if (pattern.test(bodyWithoutStackTraces)) {
173+
score += category.weight
69174
}
70175
}
71176

177+
// Penalty for documentation category on bugs
178+
if (category.name === 'documentation' && detectIssueType(issue) === 'bug') {
179+
score *= 0.5
180+
}
181+
72182
if (score > 0) {
73183
scores.set(category.name, score)
74184
}
@@ -228,15 +338,16 @@ function detectIssueType(
228338

229339
/**
230340
* Main categorization function using multi-tier approach
341+
* Implements confidence scoring based on match location
231342
*/
232343
export function categorizeIssue(issue: Issue): Categorization {
233-
// Tier 1: Label-based (highest confidence)
344+
// Tier 1: Label-based (highest confidence - 95%)
234345
const labelResult = checkLabels(issue)
235346
if (labelResult) {
236347
return labelResult
237348
}
238349

239-
// Tier 2: Keyword matching
350+
// Tier 2: Keyword matching with context-aware scoring
240351
const scores = calculateKeywordScores(issue)
241352
if (scores.size > 0) {
242353
let maxScore = 0
@@ -254,7 +365,19 @@ export function categorizeIssue(issue: Issue): Categorization {
254365
if (categoryConfig) {
255366
const type = detectIssueType(issue)
256367
const subcategory = findSubcategory(issue, categoryConfig)
257-
const confidence = Math.min(0.8, 0.5 + maxScore * 0.08)
368+
369+
// Adjust confidence based on match strength and location
370+
let confidence: number
371+
if (maxScore > 6.0) {
372+
// Strong title match (score > 3 * 2.0)
373+
confidence = 0.9
374+
} else if (maxScore > 4.0) {
375+
// Good body match (score > 2 * 2.0)
376+
confidence = 0.8
377+
} else {
378+
// Pattern detection level
379+
confidence = 0.7
380+
}
258381

259382
return {
260383
primary: topCategory,
@@ -267,17 +390,17 @@ export function categorizeIssue(issue: Issue): Categorization {
267390
}
268391
}
269392

270-
// Tier 3: Pattern detection
393+
// Tier 3: Pattern detection (70% confidence)
271394
const patternResult = detectPatterns(issue)
272395
if (patternResult) {
273396
return patternResult
274397
}
275398

276-
// Fallback: Uncategorized
399+
// Fallback: Uncategorized (60% confidence)
277400
return {
278401
primary: 'uncategorized',
279402
type: detectIssueType(issue),
280-
confidence: 0.3,
403+
confidence: 0.6,
281404
method: 'manual',
282405
}
283406
}

0 commit comments

Comments
 (0)