fix: ignore articles + prepositions for inconsistent-capitalization tool

NGPixel · NGPixel · commit 7aaf28caa3ce · 2025-12-30T21:19:33.000-05:00
diff --git a/src/commands/inconsistent-capitalization.js b/src/commands/inconsistent-capitalization.js
@@ -1,4 +1,5 @@
 import * as vscode from 'vscode'
+import { repeat } from 'lodash-es'
 
 /**
  * @param {vscode.ExtensionContext} context
@@ -17,7 +18,8 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
         return vscode.window.showErrorMessage('Unsupported Document Type.')
       }
 
-      const results = findInconsistentCapitalization(activeDoc.getText())
+      const ignoreTextRgx = /<name>(?<term>.+)<\/name>/gi
+      const results = findInconsistentCapitalization(activeDoc.getText().replaceAll(ignoreTextRgx, (_, p1) => `<name>${repeat('_', p1.length)}</name>`))
 
       outputChannel.clear()
       outputChannel.appendLine(`List of inconsistent use of capitalization in ${activeDoc.fileName}:\n`)
@@ -30,7 +32,7 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
         idx++
         const phrase = results[key]
         for (const variation of phrase) {
-          outputChannel.appendLine(`${variation.text} (${variation.count})`)
+          outputChannel.appendLine(`${variation.text} (${variation.count}) (Ln ${variation.lines.join(', ')})`)
         }
       }
 
@@ -55,105 +57,165 @@ export function registerListInconsistentCapitalizationCommand (context, outputCh
  * -> Generated by Google Gemini 3 Pro
  *
  * @param {string} text - The input text.
- * @return {Object} - Key: lowercase phrase, Value: Array of inconsistent variations.
+ * @return {Object} - Key: lowercase phrase, Value: Array of objects { text, count, lines }
  */
 function findInconsistentCapitalization(text) {
-  // 1. Pre-process: Tokenize and flag sentence starters
-  // We match words OR sentence terminators.
-  const rawTokens = text.match(/[a-zA-Z0-9']+|[.!?>]+/g) || [];
-
-  const tokens = [];
-  let nextIsStart = true; // First word is always start of sentence
-
-  for (const t of rawTokens) {
-    if (/^[.!?>]+$/.test(t)) {
-      nextIsStart = true
-    } else {
-      tokens.push({ word: t, isStart: nextIsStart })
-      nextIsStart = false
+  // CONFIG: Words whose capitalization is ignored. Entries must be lowercase because lookups lowercase the word before calling has().
+  const IGNORED_TERMS = new Set([
+    "the", "a", "an", "and", "but", "or", "nor", "for", "yet", "so", // Articles + coordinating conjunctions
+    "in", "on", "at", "to", "of", "by", "with", "from", "up", "about", "into", "over", "after" // Prepositions
+  ])
+
+  // HELPER: true when two space-separated phrases have the same word count and match word-for-word; case is ignored only for words listed in IGNORED_TERMS
+  function isEffectivelySame(phrase1, phrase2) {
+    const words1 = phrase1.split(" ")
+    const words2 = phrase2.split(" ")
+
+    if (words1.length !== words2.length) {
+      return false
     }
+
+    for (let i = 0; i < words1.length; i++) {
+      const w1 = words1[i]
+      const w2 = words2[i]
+
+      // When phrase1's word is in IGNORED_TERMS, the pair only has to match case-insensitively
+      if (IGNORED_TERMS.has(w1.toLowerCase())) {
+        if (w1.toLowerCase() !== w2.toLowerCase()) {
+          return false
+        }
+      } else {
+        // Every other word must be strictly equal, capitalization included
+        if (w1 !== w2) {
+          return false
+        }
+      }
+    }
+    return true
   }
 
-  // Map: lowercase_phrase -> Map<original_phrase, { hasStart: bool, hasMiddle: bool }>
+  // 1. Tokenize & Track Lines
+  const rawLines = text.split(/\r?\n/)
+  const tokens = []
+  let nextIsStart = true
+
+  rawLines.forEach((lineText, lineIndex) => {
+    const lineTokens = lineText.match(/[a-zA-Z0-9']+|[.!?>]+/g) || []
+
+    for (const t of lineTokens) {
+      if (/^[.!?>]+$/.test(t)) {
+        nextIsStart = true
+      } else {
+        tokens.push({
+          word: t,
+          isStart: nextIsStart,
+          lineNumber: lineIndex + 1
+        })
+        nextIsStart = false
+      }
+    }
+  })
+
+  // Map: lowercase_key -> Array of raw variations found
   const phraseMap = new Map()
-  const MIN_N = 2 // Minimum words (terms)
-  const MAX_N = 5 // Maximum words (terms)
+  const MIN_N = 2
+  const MAX_N = 5
 
   // 2. Sliding Window
   for (let i = 0; i < tokens.length; i++) {
     let currentPhrase = ""
     let lowerPhrase = ""
-
-    // The n-gram is considered a "start" n-gram if its FIRST word is a start word
     const isPhraseStart = tokens[i].isStart
-
-    // Track if the sequence currently being built contains a 1-letter word
-    let hasSingleLetterTerm = false;
+    const startLine = tokens[i].lineNumber
+    let hasSingleLetterTerm = false
 
     for (let j = 0; j < MAX_N && (i + j) < tokens.length; j++) {
       const tokenObj = tokens[i + j]
-      const word = tokenObj.word;
+      const word = tokenObj.word
 
-      // CHECK: If any word in the sequence is 1 char (letter or number), flag it.
-      if (word.length === 1) {
+      // Flag single letters only if they are NOT in our ignored list
+      if (word.length === 1 && !IGNORED_TERMS.has(word.toLowerCase())) {
         hasSingleLetterTerm = true
       }
 
-      // Build phrases
       if (j > 0) {
-        currentPhrase += " " + tokenObj.word
-        lowerPhrase += " " + tokenObj.word.toLowerCase()
+        currentPhrase += " " + word
+        lowerPhrase += " " + word.toLowerCase()
       } else {
-        currentPhrase = tokenObj.word
-        lowerPhrase = tokenObj.word.toLowerCase()
+        currentPhrase = word
+        lowerPhrase = word.toLowerCase()
       }
 
-      // CHECK: Only store if we have reached the minimum word count (2+)
-      // j is 0-indexed, so j=1 means 2 words.
-      if (j >= MIN_N - 1&& !hasSingleLetterTerm) {
+      if (j >= MIN_N - 1 && !hasSingleLetterTerm) {
         if (!phraseMap.has(lowerPhrase)) {
-          phraseMap.set(lowerPhrase, new Map())
-        }
-
-        const variations = phraseMap.get(lowerPhrase)
-        if (!variations.has(currentPhrase)) {
-          variations.set(currentPhrase, { hasStart: false, hasMiddle: false, count: 0 })
+          phraseMap.set(lowerPhrase, [])
         }
 
-        const stats = variations.get(currentPhrase)
-        stats.count += 1
-        if (isPhraseStart) {
-          stats.hasStart = true
-        } else {
-          stats.hasMiddle = true
-        }
+        phraseMap.get(lowerPhrase).push({
+          text: currentPhrase,
+          isStart: isPhraseStart,
+          line: startLine
+        })
       }
     }
   }
 
-  // 3. Initial Filter (Grammar & Length)
+  // 3. Process & Filter Results
   const results = {}
 
-  for (const [key, variationMap] of phraseMap.entries()) {
+  for (const [key, rawOccurrences] of phraseMap.entries()) {
     if (key.length <= 3) {
       continue
     }
-    if (variationMap.size < 2) {
+
+    // Group effectively identical variations (ignoring prepositions)
+    const uniqueVariations = []
+
+    rawOccurrences.forEach(occ => {
+      // Find if this specific text variation (handling ignored terms) already exists
+      let match = uniqueVariations.find(v => isEffectivelySame(v.text, occ.text))
+
+      if (!match) {
+        match = {
+          text: occ.text,
+          count: 0,
+          lines: [],
+          hasStart: false,
+          hasMiddle: false
+        }
+        uniqueVariations.push(match)
+      }
+
+      match.count++
+      match.lines.push(occ.line)
+      if (occ.isStart) {
+        match.hasStart = true
+      } else {
+        match.hasMiddle = true
+      }
+    })
+
+    // Skip if consistent (only 1 valid variation group found)
+    if (uniqueVariations.length < 2) {
       continue
     }
 
     const validVariations = []
-    for (const [phrase, stats] of variationMap.entries()) {
-
-      const entry = { text: phrase, count: stats.count }
-
+    for (const stats of uniqueVariations) {
       if (stats.hasMiddle) {
-        validVariations.push(entry)
+        validVariations.push({
+          text: stats.text,
+          count: stats.count,
+          lines: stats.lines
+        })
       } else if (stats.hasStart) {
-        // Check sentence case
         const sentenceCase = key.charAt(0).toUpperCase() + key.slice(1)
-        if (phrase !== sentenceCase) {
-          validVariations.push(entry)
+        if (!isEffectivelySame(stats.text, sentenceCase)) {
+          validVariations.push({
+            text: stats.text,
+            count: stats.count,
+            lines: stats.lines
+          })
         }
       }
     }
@@ -164,29 +226,23 @@ function findInconsistentCapitalization(text) {
   }
 
   // 4. Prune Subsets
-  // If 'processing li' and 'processing li ul' both exist, remove 'processing li'
   const finalKeys = Object.keys(results)
   const keysToDelete = new Set()
 
   for (let i = 0; i < finalKeys.length; i++) {
     const shortKey = finalKeys[i]
-
     for (let j = 0; j < finalKeys.length; j++) {
       if (i === j) {
-        continue // Don't compare self
+        continue
       }
-
       const longKey = finalKeys[j]
-
-      // If the short key is a substring of the long key, mark short key for deletion
       if (longKey.includes(shortKey)) {
         keysToDelete.add(shortKey)
-        break // Found a containing parent, no need to check others
+        break
       }
     }
   }
 
-  // Perform the deletion
   keysToDelete.forEach(k => delete results[k])
 
   return results