fix: detect partial spacing profanity obfuscation (#44)

deemonic · claude · web-flow · commit 7190213211f6 · 2026-01-28T21:17:13.000Z
* fix: detect partial spacing profanity obfuscation

Profanity obfuscation using partial spacing was not being detected:
- "s hit" not detected as "shit"
- "f uck" not detected as "fuck"
- "t wat" not detected as "twat"

The isSpanningWordBoundary() method had overly strict logic that
rejected legitimate partial spacing patterns.

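This fix modifies the method to check the context surrounding the match
in addition to the existing single-character heuristics:
- If every part is a single letter (e.g. "f u c k i n g") → intentional obfuscation → allow
- If a word character sits immediately before AND after the match → embedded in text → reject
- If a word character sits only before the match → reject only when the first part is a single letter
  (e.g. "s hit" from "musicals hit")
- If a word character sits only after the match → reject only when the last part is a single letter
- Otherwise → standalone text, likely intentional obfuscation → allow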

Added 6 new test cases: five for partial spacing detection and one
false-positive regression test ("This musicals hit").

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: convert byte offset to character offset for multibyte support

preg_match_all returns byte offsets, but mb_substr/mb_strlen expect
character offsets. This fix converts the byte offset to a character
offset before performing boundary checks, ensuring correct behavior
with multibyte characters (accented letters, etc.).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/src/BlaspService.php b/src/BlaspService.php
@@ -311,7 +311,7 @@ private function handle(): self
                         $matchedText = $match[0];
 
                         // Check if the match inappropriately spans across word boundaries
-                        if ($this->isSpanningWordBoundary($matchedText)) {
+                        if ($this->isSpanningWordBoundary($matchedText, $normalizedString, $start)) {
                             continue;  // Skip this match as it spans word boundaries
                         }
 
@@ -406,42 +406,85 @@ private function isInsideHexToken(string $string, int $start, int $length): bool
     /**
      * Determine whether a matched substring inappropriately spans word boundaries.
      */
-    private function isSpanningWordBoundary(string $matchedText): bool
+    private function isSpanningWordBoundary(string $matchedText, string $fullString, int $matchStart): bool
     {
-        // If the match contains spaces, it might be spanning word boundaries
-        if (preg_match('/\s+/', $matchedText)) {
-            $parts = preg_split('/\s+/', $matchedText);
-
-            if (count($parts) > 1) {
-                // Count how many parts are single characters
-                $singleCharCount = 0;
-                foreach ($parts as $part) {
-                    if (strlen($part) === 1 && preg_match('/[a-z]/i', $part)) {
-                        $singleCharCount++;
-                    }
-                }
+        // No spaces = not spanning
+        if (!preg_match('/\s+/', $matchedText)) {
+            return false;
+        }
 
-                // If ALL parts are single characters, this is intentional obfuscation
-                // (e.g., "f u c k i n g") - allow it
-                if ($singleCharCount === count($parts)) {
-                    return false;
-                }
+        $parts = preg_split('/\s+/', $matchedText);
 
-                // If SOME parts are single characters at edges, this is likely
-                // a cross-word match (e.g., "t êt" from "pourrait être") - reject it
-                $firstPart = $parts[0];
-                $lastPart = end($parts);
+        if (count($parts) <= 1) {
+            return false;
+        }
 
-                if (strlen($lastPart) === 1 && preg_match('/[a-z]/i', $lastPart)) {
-                    return true;
-                }
+        // Count single-character parts
+        $singleCharCount = 0;
+        foreach ($parts as $part) {
+            if (mb_strlen($part, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $part)) {
+                $singleCharCount++;
+            }
+        }
 
-                if (strlen($firstPart) === 1 && preg_match('/[a-z]/i', $firstPart)) {
-                    return true;
-                }
+        // ALL parts are single characters = definitely intentional (e.g., "f u c k i n g")
+        if ($singleCharCount === count($parts)) {
+            return false;
+        }
+
+        // Check if match is embedded in a larger word
+        // Note: preg_match_all returns byte offsets, convert to character offset for mb_* ops
+        $matchStartChar = mb_strlen(substr($fullString, 0, $matchStart), 'UTF-8');
+        $matchEndChar = $matchStartChar + mb_strlen($matchedText, 'UTF-8');
+
+        $embeddedAtStart = false;
+        $embeddedAtEnd = false;
+
+        // Character before match?
+        if ($matchStartChar > 0) {
+            $charBefore = mb_substr($fullString, $matchStartChar - 1, 1, 'UTF-8');
+            if (preg_match('/\w/u', $charBefore)) {
+                $embeddedAtStart = true;
+            }
+        }
+
+        // Character after match?
+        if ($matchEndChar < mb_strlen($fullString, 'UTF-8')) {
+            $charAfter = mb_substr($fullString, $matchEndChar, 1, 'UTF-8');
+            if (preg_match('/\w/u', $charAfter)) {
+                $embeddedAtEnd = true;
+            }
+        }
+
+        // If embedded on BOTH sides, it's completely within text - reject
+        if ($embeddedAtStart && $embeddedAtEnd) {
+            return true;
+        }
+
+        // If embedded only at START: check if first part is single-char (likely accidental)
+        // If first part is multi-char, the regex was just greedy - allow it
+        if ($embeddedAtStart && !$embeddedAtEnd) {
+            $firstPart = $parts[0];
+            // If first part is a single letter, this is likely accidental word spanning
+            // (e.g., "s hit" from "musicals hit" where 's' is from "musicals")
+            if (mb_strlen($firstPart, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $firstPart)) {
+                return true;
+            }
+            // If first part is multi-char, the regex was greedy but there's still
+            // a valid profanity in the non-embedded portion (e.g., "as @ss" from "has @ss")
+            return false;
+        }
+
+        // If embedded only at END: check if last part is single-char (likely accidental)
+        if (!$embeddedAtStart && $embeddedAtEnd) {
+            $lastPart = end($parts);
+            if (mb_strlen($lastPart, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $lastPart)) {
+                return true;
             }
+            return false;
         }
 
+        // Standalone partial spacing = intentional obfuscation
         return false;
     }
 
diff --git a/tests/BlaspCheckTest.php b/tests/BlaspCheckTest.php
@@ -306,4 +306,46 @@ public function test_spaced_profanity_without_substitution()
 
         $this->assertTrue($result->hasProfanity);
     }
+
+    public function test_partial_spacing_s_hit()
+    {
+        $result = $this->blaspService->check('s hit');
+        $this->assertTrue($result->hasProfanity);
+        $this->assertContains('shit', $result->uniqueProfanitiesFound);
+    }
+
+    public function test_partial_spacing_f_uck()
+    {
+        $result = $this->blaspService->check('f uck');
+        $this->assertTrue($result->hasProfanity);
+        $this->assertContains('fuck', $result->uniqueProfanitiesFound);
+    }
+
+    public function test_partial_spacing_t_wat()
+    {
+        $result = $this->blaspService->check('t wat');
+        $this->assertTrue($result->hasProfanity);
+        $this->assertContains('twat', $result->uniqueProfanitiesFound);
+    }
+
+    public function test_partial_spacing_fu_c_k()
+    {
+        $result = $this->blaspService->check('fu c k');
+        $this->assertTrue($result->hasProfanity);
+        $this->assertContains('fuck', $result->uniqueProfanitiesFound);
+    }
+
+    public function test_partial_spacing_tw_a_t()
+    {
+        $result = $this->blaspService->check('tw a t');
+        $this->assertTrue($result->hasProfanity);
+        $this->assertContains('twat', $result->uniqueProfanitiesFound);
+    }
+
+    public function test_no_false_positive_musicals_hit_embedded()
+    {
+        $result = $this->blaspService->check('This musicals hit');
+        $this->assertFalse($result->hasProfanity);
+        $this->assertSame('This musicals hit', $result->cleanString);
+    }
 }