Skip to content

Commit 7190213

Browse files
deemonic and claude authored
fix: detect partial spacing profanity obfuscation (#44)
* fix: detect partial spacing profanity obfuscation

  Profanity obfuscation using partial spacing was not being detected:
  - "s hit" not detected as "shit"
  - "f uck" not detected as "fuck"
  - "t wat" not detected as "twat"

  The isSpanningWordBoundary() method had overly strict logic that rejected legitimate partial spacing patterns. This fix modifies the method to check surrounding context instead of relying on heuristics about single-character parts:
  - If alphanumeric char immediately before match → embedded in word → reject
  - If alphanumeric char immediately after match → embedded in word → reject
  - Otherwise → standalone text, likely intentional obfuscation → allow

  Added 6 new test cases for partial spacing detection.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: convert byte offset to character offset for multibyte support

  preg_match_all returns byte offsets, but mb_substr/mb_strlen expect character offsets. This fix converts the byte offset to a character offset before performing boundary checks, ensuring correct behavior with multibyte characters (accented letters, etc.).

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 5e1e0fc commit 7190213

File tree

2 files changed

+114
-29
lines changed

2 files changed

+114
-29
lines changed

src/BlaspService.php

Lines changed: 72 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,7 @@ private function handle(): self
311311
$matchedText = $match[0];
312312

313313
// Check if the match inappropriately spans across word boundaries
314-
if ($this->isSpanningWordBoundary($matchedText)) {
314+
if ($this->isSpanningWordBoundary($matchedText, $normalizedString, $start)) {
315315
continue; // Skip this match as it spans word boundaries
316316
}
317317

@@ -406,42 +406,85 @@ private function isInsideHexToken(string $string, int $start, int $length): bool
406406
/**
407407
* Determine whether a matched substring inappropriately spans word boundaries.
408408
*/
409-
private function isSpanningWordBoundary(string $matchedText): bool
409+
private function isSpanningWordBoundary(string $matchedText, string $fullString, int $matchStart): bool
410410
{
411-
// If the match contains spaces, it might be spanning word boundaries
412-
if (preg_match('/\s+/', $matchedText)) {
413-
$parts = preg_split('/\s+/', $matchedText);
414-
415-
if (count($parts) > 1) {
416-
// Count how many parts are single characters
417-
$singleCharCount = 0;
418-
foreach ($parts as $part) {
419-
if (strlen($part) === 1 && preg_match('/[a-z]/i', $part)) {
420-
$singleCharCount++;
421-
}
422-
}
411+
// No spaces = not spanning
412+
if (!preg_match('/\s+/', $matchedText)) {
413+
return false;
414+
}
423415

424-
// If ALL parts are single characters, this is intentional obfuscation
425-
// (e.g., "f u c k i n g") - allow it
426-
if ($singleCharCount === count($parts)) {
427-
return false;
428-
}
416+
$parts = preg_split('/\s+/', $matchedText);
429417

430-
// If SOME parts are single characters at edges, this is likely
431-
// a cross-word match (e.g., "t êt" from "pourrait être") - reject it
432-
$firstPart = $parts[0];
433-
$lastPart = end($parts);
418+
if (count($parts) <= 1) {
419+
return false;
420+
}
434421

435-
if (strlen($lastPart) === 1 && preg_match('/[a-z]/i', $lastPart)) {
436-
return true;
437-
}
422+
// Count single-character parts
423+
$singleCharCount = 0;
424+
foreach ($parts as $part) {
425+
if (mb_strlen($part, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $part)) {
426+
$singleCharCount++;
427+
}
428+
}
438429

439-
if (strlen($firstPart) === 1 && preg_match('/[a-z]/i', $firstPart)) {
440-
return true;
441-
}
430+
// ALL parts are single characters = definitely intentional (e.g., "f u c k i n g")
431+
if ($singleCharCount === count($parts)) {
432+
return false;
433+
}
434+
435+
// Check if match is embedded in a larger word
436+
// Note: preg_match_all returns byte offsets, convert to character offset for mb_* ops
437+
$matchStartChar = mb_strlen(substr($fullString, 0, $matchStart), 'UTF-8');
438+
$matchEndChar = $matchStartChar + mb_strlen($matchedText, 'UTF-8');
439+
440+
$embeddedAtStart = false;
441+
$embeddedAtEnd = false;
442+
443+
// Character before match?
444+
if ($matchStartChar > 0) {
445+
$charBefore = mb_substr($fullString, $matchStartChar - 1, 1, 'UTF-8');
446+
if (preg_match('/\w/u', $charBefore)) {
447+
$embeddedAtStart = true;
448+
}
449+
}
450+
451+
// Character after match?
452+
if ($matchEndChar < mb_strlen($fullString, 'UTF-8')) {
453+
$charAfter = mb_substr($fullString, $matchEndChar, 1, 'UTF-8');
454+
if (preg_match('/\w/u', $charAfter)) {
455+
$embeddedAtEnd = true;
456+
}
457+
}
458+
459+
// If embedded on BOTH sides, it's completely within text - reject
460+
if ($embeddedAtStart && $embeddedAtEnd) {
461+
return true;
462+
}
463+
464+
// If embedded only at START: check if first part is single-char (likely accidental)
465+
// If first part is multi-char, the regex was just greedy - allow it
466+
if ($embeddedAtStart && !$embeddedAtEnd) {
467+
$firstPart = $parts[0];
468+
// If first part is a single letter, this is likely accidental word spanning
469+
// (e.g., "s hit" from "musicals hit" where 's' is from "musicals")
470+
if (mb_strlen($firstPart, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $firstPart)) {
471+
return true;
472+
}
473+
// If first part is multi-char, the regex was greedy but there's still
474+
// a valid profanity in the non-embedded portion (e.g., "as @ss" from "has @ss")
475+
return false;
476+
}
477+
478+
// If embedded only at END: check if last part is single-char (likely accidental)
479+
if (!$embeddedAtStart && $embeddedAtEnd) {
480+
$lastPart = end($parts);
481+
if (mb_strlen($lastPart, 'UTF-8') === 1 && preg_match('/[a-z]/iu', $lastPart)) {
482+
return true;
442483
}
484+
return false;
443485
}
444486

487+
// Standalone partial spacing = intentional obfuscation
445488
return false;
446489
}
447490

tests/BlaspCheckTest.php

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -306,4 +306,46 @@ public function test_spaced_profanity_without_substitution()
306306

307307
$this->assertTrue($result->hasProfanity);
308308
}
309+
310+
public function test_partial_spacing_s_hit()
311+
{
312+
$result = $this->blaspService->check('s hit');
313+
$this->assertTrue($result->hasProfanity);
314+
$this->assertContains('shit', $result->uniqueProfanitiesFound);
315+
}
316+
317+
public function test_partial_spacing_f_uck()
318+
{
319+
$result = $this->blaspService->check('f uck');
320+
$this->assertTrue($result->hasProfanity);
321+
$this->assertContains('fuck', $result->uniqueProfanitiesFound);
322+
}
323+
324+
public function test_partial_spacing_t_wat()
325+
{
326+
$result = $this->blaspService->check('t wat');
327+
$this->assertTrue($result->hasProfanity);
328+
$this->assertContains('twat', $result->uniqueProfanitiesFound);
329+
}
330+
331+
public function test_partial_spacing_fu_c_k()
332+
{
333+
$result = $this->blaspService->check('fu c k');
334+
$this->assertTrue($result->hasProfanity);
335+
$this->assertContains('fuck', $result->uniqueProfanitiesFound);
336+
}
337+
338+
public function test_partial_spacing_tw_a_t()
339+
{
340+
$result = $this->blaspService->check('tw a t');
341+
$this->assertTrue($result->hasProfanity);
342+
$this->assertContains('twat', $result->uniqueProfanitiesFound);
343+
}
344+
345+
public function test_no_false_positive_musicals_hit_embedded()
346+
{
347+
$result = $this->blaspService->check('This musicals hit');
348+
$this->assertFalse($result->hasProfanity);
349+
$this->assertSame('This musicals hit', $result->cleanString);
350+
}
309351
}

0 commit comments

Comments
 (0)