fix: prevent false positives when profanity is a substring of a regular word

deemonic · claude · deemonic · commit db8457c1dbed · 2026-02-11T15:38:27.000Z
Add a pure-alpha-substring check that automatically skips profanity matches embedded inside larger alphabetic words (e.g. "spac" in "space", "ass" in "class") without needing to enumerate every false positive. The check still catches obfuscated profanity (sp@c, f-u-c-k), conjugated forms (fucks, fucker), compound profanity (cuntfuck), and repeated-letter obfuscation (ccuunntt). As a consequence, a profanity embedded in a purely alphabetic word without any obfuscation (e.g. "afuckb") is no longer flagged.

Closes #32

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/BlaspService.php b/src/BlaspService.php
@@ -294,6 +294,11 @@ private function handle(): self
         $workingCleanString = $this->cleanString;
         $normalizedString = $this->stringNormalizer->normalize($workingCleanString);
 
+        // Preserve the original normalized string for full-word context lookups.
+        // Masking replaces characters with *, which breaks word boundaries and can
+        // cause the pure-alpha-substring check to miss compound profanity.
+        $originalNormalized = preg_replace('/\s+/', ' ', $normalizedString);
+
         // Loop through until no more profanities are detected
         while ($continue) {
             $continue = false;
@@ -323,6 +328,16 @@ private function handle(): self
                         // Use boundaries to extract the full word around the match
                         $fullWord = $this->getFullWordContext($normalizedString, $start, $length);
 
+                        // If the match is purely alphabetic and is a substring of a larger
+                        // alphabetic word, it's a legitimate word — not obfuscated profanity
+                        // e.g. "spac" inside "space", "ass" inside "class"
+                        // Use the original unmasked string for context so that masking
+                        // doesn't break compound profanity detection.
+                        $originalFullWord = $this->getFullWordContext($originalNormalized, $start, $length);
+                        if ($this->isPureAlphaSubstring($matchedText, $originalFullWord, $profanity)) {
+                            continue;
+                        }
+
                         // Check if the full word (in lowercase) is in the false positives list
                         if ($this->profanityDetector->isFalsePositive($fullWord)) {
                             continue;  // Skip checking this word if it's a false positive
@@ -501,6 +516,76 @@ private function isSpanningWordBoundary(string $matchedText, string $fullString,
         return false;
     }
 
+    /**
+     * Decide whether a match is plain text embedded in a larger ordinary word
+     * (e.g. "spac" in "space", "ass" in "class") and should therefore be skipped.
+     *
+     * Returns false, so the match is still treated as profanity, when:
+     *  - the match or the surrounding word contains a non-letter, which is how
+     *    obfuscation such as "sp@c" or "s-p-a-c" shows up;
+     *  - the word is no longer than the match (standalone word);
+     *  - the match is longer than the profanity key (repeated letters, e.g.
+     *    "ccuunntt" for "cunt");
+     *  - the word is the match plus a common suffix (e.g. "fucks", "fucker");
+     *  - the text before or after the match contains another known profanity
+     *    of three or more characters (compound profanity, e.g. "cuntfuck").
+     *
+     * @param string $matchedText The text that matched the profanity pattern
+     * @param string $fullWord The full word context surrounding the match
+     * @param string $profanityKey The base profanity word from the list
+     * @return bool True if the match should be skipped as a likely false positive
+     */
+    private function isPureAlphaSubstring(string $matchedText, string $fullWord, string $profanityKey): bool
+    {
+        // Only applies when both the match and the surrounding word are entirely
+        // alphabetic; any other character indicates obfuscation (e.g. "sp@c")
+        if (!preg_match('/^[a-zA-Z]+$/', $matchedText) || !preg_match('/^[a-zA-Z]+$/', $fullWord)) {
+            return false;
+        }
+
+        // Not embedded if the word is no longer than the match (standalone word)
+        if (strlen($fullWord) <= strlen($matchedText)) {
+            return false;
+        }
+
+        // If the match is longer than the profanity key, it contains repeated
+        // characters — this is obfuscation, not a regular word (e.g. "ccuunntt" for "cunt")
+        if (strlen($matchedText) > strlen($profanityKey)) {
+            return false;
+        }
+
+        $matchLower = strtolower($matchedText);
+        $wordLower = strtolower($fullWord);
+
+        // The match plus a common suffix (e.g. "fucks", "fucker") is conjugated profanity
+        $suffixes = ['s', 'es', 'ed', 'er', 'ers', 'est', 'ing', 'ings', 'ly', 'y'];
+        foreach ($suffixes as $suffix) {
+            if ($wordLower === $matchLower . $suffix) {
+                return false;
+            }
+        }
+
+        // Compound profanity (e.g. "cuntfuck"): search the text before and after the match
+        // separately, since joining the two would create sequences that are not in the word
+        $pos = strpos($wordLower, $matchLower);
+        if ($pos !== false) {
+            $before = substr($wordLower, 0, $pos);
+            $after = substr($wordLower, $pos + strlen($matchLower));
+            foreach ($this->profanityDetector->getProfanityExpressions() as $profanity => $_) {
+                $known = (string) $profanity; // PHP turns numeric-looking array keys into int
+                if (strlen($known) < 3) {
+                    continue;
+                }
+                if (stripos($before, $known) !== false || stripos($after, $known) !== false) {
+                    return false;
+                }
+            }
+        }
+
+        // The match is embedded in a larger regular word (e.g., "spac" in "space")
+        return true;
+    }
+
     /**
      * Get the full word context surrounding the matched profanity.
      *
diff --git a/tests/BlaspCheckTest.php b/tests/BlaspCheckTest.php
@@ -239,11 +239,17 @@ public function test_paragraph()
 
     public function test_word_boudary()
     {
+        // A profanity embedded in a purely alphabetic word ('afuckb') is treated as a
+        // regular word, the same rule that keeps "spac" in "space" from being flagged
         $result =  $this->blaspService->check('afuckb');
+        $this->assertFalse($result->hasProfanity);
+
+        // Obfuscated variants (spaced-out letters, symbol substitution) are still caught
+        $result =  $this->blaspService->check('a f u c k b');
+        $this->assertTrue($result->hasProfanity);
+
+        $result =  $this->blaspService->check('af@ckb');
         $this->assertTrue($result->hasProfanity);
-        $this->assertSame(1, $result->profanitiesCount);
-        $this->assertCount(1, $result->uniqueProfanitiesFound);
-        $this->assertSame('a****b', $result->cleanString);
     }
 
     public function test_pural_profanity()
@@ -371,4 +377,32 @@ public function test_detects_at_ss_obfuscation()
         $result = $this->blaspService->check('This has @ss in it');
         $this->assertTrue($result->hasProfanity);
     }
+
+    /**
+     * Words that merely contain the letters "spac" must not be flagged, while the
+     * standalone term and its obfuscated form still are.
+     */
+    public function test_no_false_positive_space_words()
+    {
+        $cleanSentences = [
+            'This product provides ample space for storage.',
+            'The spacious design offers great workspace.',
+            'Perfect for aerospace applications.',
+            'Use the backspace key to delete.',
+            'The spacecraft landed safely.',
+        ];
+
+        foreach ($cleanSentences as $text) {
+            $outcome = $this->blaspService->check($text);
+            $found = implode(', ', $outcome->uniqueProfanitiesFound);
+
+            $this->assertFalse($outcome->hasProfanity, "\"$text\" should not be flagged but got: " . $found);
+        }
+
+        // Both the bare word and the symbol-substituted spelling are still detected
+        foreach (['you spac', 'you sp@c'] as $text) {
+            $outcome = $this->blaspService->check($text);
+            $this->assertTrue($outcome->hasProfanity);
+        }
+    }
 }