11import * as vscode from 'vscode'
2+ import { repeat } from 'lodash-es'
23
34/**
45 * @param {vscode.ExtensionContext } context
@@ -17,7 +18,8 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
1718 return vscode . window . showErrorMessage ( 'Unsupported Document Type.' )
1819 }
1920
20- const results = findInconsistentCapitalization ( activeDoc . getText ( ) )
21+ const ignoreTextRgx = / < n a m e > (?< term > .+ ) < \/ n a m e > / gi
22+ const results = findInconsistentCapitalization ( activeDoc . getText ( ) . replaceAll ( ignoreTextRgx , ( _ , p1 ) => `<name>${ repeat ( '_' , p1 . length ) } </name>` ) )
2123
2224 outputChannel . clear ( )
2325 outputChannel . appendLine ( `List of inconsistent use of capitalization in ${ activeDoc . fileName } :\n` )
@@ -30,7 +32,7 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
3032 idx ++
3133 const phrase = results [ key ]
3234 for ( const variation of phrase ) {
33- outputChannel . appendLine ( `${ variation . text } (${ variation . count } )` )
35+ outputChannel . appendLine ( `${ variation . text } (${ variation . count } ) (Ln ${ variation . lines . join ( ', ' ) } ) ` )
3436 }
3537 }
3638
@@ -55,105 +57,165 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
5557 * -> Generated by Google Gemini 3 Pro
5658 *
5759 * @param {string } text - The input text.
58- * @return {Object } - Key: lowercase phrase, Value: Array of inconsistent variations.
60+ * @return {Object } - Key: lowercase phrase, Value: Array of objects { text, count, lines }
5961 */
6062function findInconsistentCapitalization ( text ) {
61- // 1. Pre-process: Tokenize and flag sentence starters
62- // We match words OR sentence terminators.
63- const rawTokens = text . match ( / [ a - z A - Z 0 - 9 ' ] + | [ . ! ? > ] + / g ) || [ ] ;
64-
65- const tokens = [ ] ;
66- let nextIsStart = true ; // First word is always start of sentence
67-
68- for ( const t of rawTokens ) {
69- if ( / ^ [ . ! ? > ] + $ / . test ( t ) ) {
70- nextIsStart = true
71- } else {
72- tokens . push ( { word : t , isStart : nextIsStart } )
73- nextIsStart = false
// CONFIG: words whose capitalization is ignored when comparing phrases.
// Entries must be lowercase because every lookup lowercases the word first.
const IGNORED_TERMS = new Set([
  "the", "a", "an", // Articles
  "and", "but", "or", "nor", "for", "yet", "so", // Coordinating conjunctions
  "in", "on", "at", "to", "of", "by", "with", "from", "up", "about", "into", "over", "after" // Prepositions
])

/**
 * HELPER: decide whether two phrases count as the same capitalization variant.
 *
 * Both phrases are split on single spaces and compared word by word:
 * - a word listed in IGNORED_TERMS only has to match case-insensitively;
 * - any other word must match exactly, including case.
 *
 * @param {string} phrase1 - First phrase, words separated by single spaces.
 * @param {string} phrase2 - Second phrase, words separated by single spaces.
 * @return {boolean} - true when both phrases have the same number of words and
 *                     every word pair matches under the rules above.
 */
function isEffectivelySame (phrase1, phrase2) {
  const words1 = phrase1.split(" ")
  const words2 = phrase2.split(" ")

  // Different word counts can never describe the same phrase.
  if (words1.length !== words2.length) {
    return false
  }

  return words1.every((w1, i) => {
    const w2 = words2[i]

    // Articles / conjunctions / prepositions: case is not significant.
    if (IGNORED_TERMS.has(w1.toLowerCase())) {
      return w1.toLowerCase() === w2.toLowerCase()
    }

    // Distinct words (nouns, verbs, ...): case MUST match exactly.
    return w1 === w2
  })
}
7696
77- // Map: lowercase_phrase -> Map<original_phrase, { hasStart: bool, hasMiddle: bool }>
97+ // 1. Tokenize & Track Lines
98+ const rawLines = text . split ( / \r ? \n / )
99+ const tokens = [ ]
100+ let nextIsStart = true
101+
102+ rawLines . forEach ( ( lineText , lineIndex ) => {
103+ const lineTokens = lineText . match ( / [ a - z A - Z 0 - 9 ' ] + | [ . ! ? > ] + / g) || [ ]
104+
105+ for ( const t of lineTokens ) {
106+ if ( / ^ [ . ! ? > ] + $ / . test ( t ) ) {
107+ nextIsStart = true
108+ } else {
109+ tokens . push ( {
110+ word : t ,
111+ isStart : nextIsStart ,
112+ lineNumber : lineIndex + 1
113+ } )
114+ nextIsStart = false
115+ }
116+ }
117+ } )
118+
119+ // Map: lowercase_key -> Array of raw variations found
78120 const phraseMap = new Map ( )
79- const MIN_N = 2 // Minimum words (terms)
80- const MAX_N = 5 // Maximum words (terms)
121+ const MIN_N = 2
122+ const MAX_N = 5
81123
82124 // 2. Sliding Window
83125 for ( let i = 0 ; i < tokens . length ; i ++ ) {
84126 let currentPhrase = ""
85127 let lowerPhrase = ""
86-
87- // The n-gram is considered a "start" n-gram if its FIRST word is a start word
88128 const isPhraseStart = tokens [ i ] . isStart
89-
90- // Track if the sequence currently being built contains a 1-letter word
91- let hasSingleLetterTerm = false ;
129+ const startLine = tokens [ i ] . lineNumber
130+ let hasSingleLetterTerm = false
92131
93132 for ( let j = 0 ; j < MAX_N && ( i + j ) < tokens . length ; j ++ ) {
94133 const tokenObj = tokens [ i + j ]
95- const word = tokenObj . word ;
134+ const word = tokenObj . word
96135
97- // CHECK: If any word in the sequence is 1 char (letter or number), flag it.
98- if ( word . length === 1 ) {
136+ // Flag single letters only if they are NOT in our ignored list
137+ if ( word . length === 1 && ! IGNORED_TERMS . has ( word . toLowerCase ( ) ) ) {
99138 hasSingleLetterTerm = true
100139 }
101140
102- // Build phrases
103141 if ( j > 0 ) {
104- currentPhrase += " " + tokenObj . word
105- lowerPhrase += " " + tokenObj . word . toLowerCase ( )
142+ currentPhrase += " " + word
143+ lowerPhrase += " " + word . toLowerCase ( )
106144 } else {
107- currentPhrase = tokenObj . word
108- lowerPhrase = tokenObj . word . toLowerCase ( )
145+ currentPhrase = word
146+ lowerPhrase = word . toLowerCase ( )
109147 }
110148
111- // CHECK: Only store if we have reached the minimum word count (2+)
112- // j is 0-indexed, so j=1 means 2 words.
113- if ( j >= MIN_N - 1 && ! hasSingleLetterTerm ) {
149+ if ( j >= MIN_N - 1 && ! hasSingleLetterTerm ) {
114150 if ( ! phraseMap . has ( lowerPhrase ) ) {
115- phraseMap . set ( lowerPhrase , new Map ( ) )
116- }
117-
118- const variations = phraseMap . get ( lowerPhrase )
119- if ( ! variations . has ( currentPhrase ) ) {
120- variations . set ( currentPhrase , { hasStart : false , hasMiddle : false , count : 0 } )
151+ phraseMap . set ( lowerPhrase , [ ] )
121152 }
122153
123- const stats = variations . get ( currentPhrase )
124- stats . count += 1
125- if ( isPhraseStart ) {
126- stats . hasStart = true
127- } else {
128- stats . hasMiddle = true
129- }
154+ phraseMap . get ( lowerPhrase ) . push ( {
155+ text : currentPhrase ,
156+ isStart : isPhraseStart ,
157+ line : startLine
158+ } )
130159 }
131160 }
132161 }
133162
134- // 3. Initial Filter (Grammar & Length)
163+ // 3. Process & Filter Results
135164 const results = { }
136165
137- for ( const [ key , variationMap ] of phraseMap . entries ( ) ) {
166+ for ( const [ key , rawOccurrences ] of phraseMap . entries ( ) ) {
138167 if ( key . length <= 3 ) {
139168 continue
140169 }
141- if ( variationMap . size < 2 ) {
170+
171+ // Group effectively identical variations (ignoring prepositions)
172+ const uniqueVariations = [ ]
173+
174+ rawOccurrences . forEach ( occ => {
175+ // Find if this specific text variation (handling ignored terms) already exists
176+ let match = uniqueVariations . find ( v => isEffectivelySame ( v . text , occ . text ) )
177+
178+ if ( ! match ) {
179+ match = {
180+ text : occ . text ,
181+ count : 0 ,
182+ lines : [ ] ,
183+ hasStart : false ,
184+ hasMiddle : false
185+ }
186+ uniqueVariations . push ( match )
187+ }
188+
189+ match . count ++
190+ match . lines . push ( occ . line )
191+ if ( occ . isStart ) {
192+ match . hasStart = true
193+ } else {
194+ match . hasMiddle = true
195+ }
196+ } )
197+
198+ // Skip if consistent (only 1 valid variation group found)
199+ if ( uniqueVariations . length < 2 ) {
142200 continue
143201 }
144202
145203 const validVariations = [ ]
146- for ( const [ phrase , stats ] of variationMap . entries ( ) ) {
147-
148- const entry = { text : phrase , count : stats . count }
149-
204+ for ( const stats of uniqueVariations ) {
150205 if ( stats . hasMiddle ) {
151- validVariations . push ( entry )
206+ validVariations . push ( {
207+ text : stats . text ,
208+ count : stats . count ,
209+ lines : stats . lines
210+ } )
152211 } else if ( stats . hasStart ) {
153- // Check sentence case
154212 const sentenceCase = key . charAt ( 0 ) . toUpperCase ( ) + key . slice ( 1 )
155- if ( phrase !== sentenceCase ) {
156- validVariations . push ( entry )
213+ if ( ! isEffectivelySame ( stats . text , sentenceCase ) ) {
214+ validVariations . push ( {
215+ text : stats . text ,
216+ count : stats . count ,
217+ lines : stats . lines
218+ } )
157219 }
158220 }
159221 }
@@ -164,29 +226,23 @@ function findInconsistentCapitalization(text) {
164226 }
165227
166228 // 4. Prune Subsets
167- // If 'processing li' and 'processing li ul' both exist, remove 'processing li'
168229 const finalKeys = Object . keys ( results )
169230 const keysToDelete = new Set ( )
170231
171232 for ( let i = 0 ; i < finalKeys . length ; i ++ ) {
172233 const shortKey = finalKeys [ i ]
173-
174234 for ( let j = 0 ; j < finalKeys . length ; j ++ ) {
175235 if ( i === j ) {
176- continue // Don't compare self
236+ continue
177237 }
178-
179238 const longKey = finalKeys [ j ]
180-
181- // If the short key is a substring of the long key, mark short key for deletion
182239 if ( longKey . includes ( shortKey ) ) {
183240 keysToDelete . add ( shortKey )
184- break // Found a containing parent, no need to check others
241+ break
185242 }
186243 }
187244 }
188245
189- // Perform the deletion
190246 keysToDelete . forEach ( k => delete results [ k ] )
191247
192248 return results
0 commit comments