Skip to content

Commit 7aaf28c

Browse files
committed
fix: ignore articles + prepositions for inconsistent-capitalization tool
1 parent 5846e36 commit 7aaf28c

File tree

1 file changed

+123
-67
lines changed

1 file changed

+123
-67
lines changed

src/commands/inconsistent-capitalization.js

Lines changed: 123 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import * as vscode from 'vscode'
2+
import { repeat } from 'lodash-es'
23

34
/**
45
* @param {vscode.ExtensionContext} context
@@ -17,7 +18,8 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
1718
return vscode.window.showErrorMessage('Unsupported Document Type.')
1819
}
1920

20-
const results = findInconsistentCapitalization(activeDoc.getText())
21+
const ignoreTextRgx = /<name>(?<term>.+)<\/name>/gi
22+
const results = findInconsistentCapitalization(activeDoc.getText().replaceAll(ignoreTextRgx, (_, p1) => `<name>${repeat('_', p1.length)}</name>`))
2123

2224
outputChannel.clear()
2325
outputChannel.appendLine(`List of inconsistent use of capitalization in ${activeDoc.fileName}:\n`)
@@ -30,7 +32,7 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
3032
idx++
3133
const phrase = results[key]
3234
for (const variation of phrase) {
33-
outputChannel.appendLine(`${variation.text} (${variation.count})`)
35+
outputChannel.appendLine(`${variation.text} (${variation.count}) (Ln ${variation.lines.join(', ')})`)
3436
}
3537
}
3638

@@ -55,105 +57,165 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
5557
* -> Generated by Google Gemini 3 Pro
5658
*
5759
* @param {string} text - The input text.
58-
* @return {Object} - Key: lowercase phrase, Value: Array of inconsistent variations.
60+
* @return {Object} - Key: lowercase phrase, Value: Array of objects { text, count, lines }
5961
*/
6062
function findInconsistentCapitalization(text) {
61-
// 1. Pre-process: Tokenize and flag sentence starters
62-
// We match words OR sentence terminators.
63-
const rawTokens = text.match(/[a-zA-Z0-9']+|[.!?>]+/g) || [];
64-
65-
const tokens = [];
66-
let nextIsStart = true; // First word is always start of sentence
67-
68-
for (const t of rawTokens) {
69-
if (/^[.!?>]+$/.test(t)) {
70-
nextIsStart = true
71-
} else {
72-
tokens.push({ word: t, isStart: nextIsStart })
73-
nextIsStart = false
63+
// CONFIG: Words whose capitalization is ignored when comparing phrases.
// Entries must be lowercase here, because lookups are done on lowercased words.
const IGNORED_TERMS = new Set([
  "the", "a", "an", // Articles
  "and", "but", "or", "nor", "for", "yet", "so", // Coordinating conjunctions
  "in", "on", "at", "to", "of", "by", "with", "from", "up", "about", "into", "over", "after" // Prepositions
])
68+
69+
/**
 * HELPER: decide whether two phrases count as the same capitalization variant.
 *
 * Both phrases are split on single spaces and compared word by word.
 * A word found in IGNORED_TERMS (looked up in lowercase) only has to match
 * case-insensitively; every other word must match exactly, including case.
 *
 * @param {string} phrase1 - First phrase, words separated by single spaces.
 * @param {string} phrase2 - Second phrase, words separated by single spaces.
 * @return {boolean} - true when both phrases have the same number of words and every word pair matches under the rules above.
 */
function isEffectivelySame(phrase1, phrase2) {
  const words1 = phrase1.split(" ")
  const words2 = phrase2.split(" ")

  // Phrases of different word counts can never be the same variant
  if (words1.length !== words2.length) {
    return false
  }

  return words1.every((w1, i) => {
    const w2 = words2[i]
    const lower1 = w1.toLowerCase()

    // Ignored terms only need to agree once case is folded;
    // all other words (nouns, verbs, ...) must be identical.
    return IGNORED_TERMS.has(lower1)
      ? lower1 === w2.toLowerCase()
      : w1 === w2
  })
}
7696

77-
// Map: lowercase_phrase -> Map<original_phrase, { hasStart: bool, hasMiddle: bool }>
97+
// 1. Tokenize & Track Lines
98+
const rawLines = text.split(/\r?\n/)
99+
const tokens = []
100+
let nextIsStart = true
101+
102+
rawLines.forEach((lineText, lineIndex) => {
103+
const lineTokens = lineText.match(/[a-zA-Z0-9']+|[.!?>]+/g) || []
104+
105+
for (const t of lineTokens) {
106+
if (/^[.!?>]+$/.test(t)) {
107+
nextIsStart = true
108+
} else {
109+
tokens.push({
110+
word: t,
111+
isStart: nextIsStart,
112+
lineNumber: lineIndex + 1
113+
})
114+
nextIsStart = false
115+
}
116+
}
117+
})
118+
119+
// Map: lowercase_key -> Array of raw variations found
78120
const phraseMap = new Map()
79-
const MIN_N = 2 // Minimum words (terms)
80-
const MAX_N = 5 // Maximum words (terms)
121+
const MIN_N = 2
122+
const MAX_N = 5
81123

82124
// 2. Sliding Window
83125
for (let i = 0; i < tokens.length; i++) {
84126
let currentPhrase = ""
85127
let lowerPhrase = ""
86-
87-
// The n-gram is considered a "start" n-gram if its FIRST word is a start word
88128
const isPhraseStart = tokens[i].isStart
89-
90-
// Track if the sequence currently being built contains a 1-letter word
91-
let hasSingleLetterTerm = false;
129+
const startLine = tokens[i].lineNumber
130+
let hasSingleLetterTerm = false
92131

93132
for (let j = 0; j < MAX_N && (i + j) < tokens.length; j++) {
94133
const tokenObj = tokens[i + j]
95-
const word = tokenObj.word;
134+
const word = tokenObj.word
96135

97-
// CHECK: If any word in the sequence is 1 char (letter or number), flag it.
98-
if (word.length === 1) {
136+
// Flag single letters only if they are NOT in our ignored list
137+
if (word.length === 1 && !IGNORED_TERMS.has(word.toLowerCase())) {
99138
hasSingleLetterTerm = true
100139
}
101140

102-
// Build phrases
103141
if (j > 0) {
104-
currentPhrase += " " + tokenObj.word
105-
lowerPhrase += " " + tokenObj.word.toLowerCase()
142+
currentPhrase += " " + word
143+
lowerPhrase += " " + word.toLowerCase()
106144
} else {
107-
currentPhrase = tokenObj.word
108-
lowerPhrase = tokenObj.word.toLowerCase()
145+
currentPhrase = word
146+
lowerPhrase = word.toLowerCase()
109147
}
110148

111-
// CHECK: Only store if we have reached the minimum word count (2+)
112-
// j is 0-indexed, so j=1 means 2 words.
113-
if (j >= MIN_N - 1&& !hasSingleLetterTerm) {
149+
if (j >= MIN_N - 1 && !hasSingleLetterTerm) {
114150
if (!phraseMap.has(lowerPhrase)) {
115-
phraseMap.set(lowerPhrase, new Map())
116-
}
117-
118-
const variations = phraseMap.get(lowerPhrase)
119-
if (!variations.has(currentPhrase)) {
120-
variations.set(currentPhrase, { hasStart: false, hasMiddle: false, count: 0 })
151+
phraseMap.set(lowerPhrase, [])
121152
}
122153

123-
const stats = variations.get(currentPhrase)
124-
stats.count += 1
125-
if (isPhraseStart) {
126-
stats.hasStart = true
127-
} else {
128-
stats.hasMiddle = true
129-
}
154+
phraseMap.get(lowerPhrase).push({
155+
text: currentPhrase,
156+
isStart: isPhraseStart,
157+
line: startLine
158+
})
130159
}
131160
}
132161
}
133162

134-
// 3. Initial Filter (Grammar & Length)
163+
// 3. Process & Filter Results
135164
const results = {}
136165

137-
for (const [key, variationMap] of phraseMap.entries()) {
166+
for (const [key, rawOccurrences] of phraseMap.entries()) {
138167
if (key.length <= 3) {
139168
continue
140169
}
141-
if (variationMap.size < 2) {
170+
171+
// Group effectively identical variations (ignoring prepositions)
172+
const uniqueVariations = []
173+
174+
rawOccurrences.forEach(occ => {
175+
// Find if this specific text variation (handling ignored terms) already exists
176+
let match = uniqueVariations.find(v => isEffectivelySame(v.text, occ.text))
177+
178+
if (!match) {
179+
match = {
180+
text: occ.text,
181+
count: 0,
182+
lines: [],
183+
hasStart: false,
184+
hasMiddle: false
185+
}
186+
uniqueVariations.push(match)
187+
}
188+
189+
match.count++
190+
match.lines.push(occ.line)
191+
if (occ.isStart) {
192+
match.hasStart = true
193+
} else {
194+
match.hasMiddle = true
195+
}
196+
})
197+
198+
// Skip if consistent (only 1 valid variation group found)
199+
if (uniqueVariations.length < 2) {
142200
continue
143201
}
144202

145203
const validVariations = []
146-
for (const [phrase, stats] of variationMap.entries()) {
147-
148-
const entry = { text: phrase, count: stats.count }
149-
204+
for (const stats of uniqueVariations) {
150205
if (stats.hasMiddle) {
151-
validVariations.push(entry)
206+
validVariations.push({
207+
text: stats.text,
208+
count: stats.count,
209+
lines: stats.lines
210+
})
152211
} else if (stats.hasStart) {
153-
// Check sentence case
154212
const sentenceCase = key.charAt(0).toUpperCase() + key.slice(1)
155-
if (phrase !== sentenceCase) {
156-
validVariations.push(entry)
213+
if (!isEffectivelySame(stats.text, sentenceCase)) {
214+
validVariations.push({
215+
text: stats.text,
216+
count: stats.count,
217+
lines: stats.lines
218+
})
157219
}
158220
}
159221
}
@@ -164,29 +226,23 @@ function findInconsistentCapitalization(text) {
164226
}
165227

166228
// 4. Prune Subsets
167-
// If 'processing li' and 'processing li ul' both exist, remove 'processing li'
168229
const finalKeys = Object.keys(results)
169230
const keysToDelete = new Set()
170231

171232
for (let i = 0; i < finalKeys.length; i++) {
172233
const shortKey = finalKeys[i]
173-
174234
for (let j = 0; j < finalKeys.length; j++) {
175235
if (i === j) {
176-
continue // Don't compare self
236+
continue
177237
}
178-
179238
const longKey = finalKeys[j]
180-
181-
// If the short key is a substring of the long key, mark short key for deletion
182239
if (longKey.includes(shortKey)) {
183240
keysToDelete.add(shortKey)
184-
break // Found a containing parent, no need to check others
241+
break
185242
}
186243
}
187244
}
188245

189-
// Perform the deletion
190246
keysToDelete.forEach(k => delete results[k])
191247

192248
return results

0 commit comments

Comments
 (0)