Skip to content

Commit db8457c

Browse files
deemonic and claude committed
fix: prevent false positives when profanity is a substring of a regular word
Add a pure-alpha-substring check that automatically skips profanity matches embedded inside larger alphabetic words (e.g. "spac" in "space", "ass" in "class") without needing to enumerate every false positive. The check still catches obfuscated profanity (sp@c, f-u-c-k), conjugated forms (fucks, fucker), compound profanity (cuntfuck), and repeated-letter obfuscation (ccuunntt).
Closes #32
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7188a28 commit db8457c

File tree

2 files changed

+122
-3
lines changed

2 files changed

+122
-3
lines changed

src/BlaspService.php

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,11 @@ private function handle(): self
294294
$workingCleanString = $this->cleanString;
295295
$normalizedString = $this->stringNormalizer->normalize($workingCleanString);
296296

297+
// Preserve the original normalized string for full-word context lookups.
298+
// Masking replaces characters with *, which breaks word boundaries and can
299+
// cause the pure-alpha-substring check to miss compound profanity.
300+
$originalNormalized = preg_replace('/\s+/', ' ', $normalizedString);
301+
297302
// Loop through until no more profanities are detected
298303
while ($continue) {
299304
$continue = false;
@@ -323,6 +328,16 @@ private function handle(): self
323328
// Use boundaries to extract the full word around the match
324329
$fullWord = $this->getFullWordContext($normalizedString, $start, $length);
325330

331+
// If the match is purely alphabetic and is a substring of a larger
332+
// alphabetic word, it's a legitimate word — not obfuscated profanity
333+
// e.g. "spac" inside "space", "ass" inside "class"
334+
// Use the original unmasked string for context so that masking
335+
// doesn't break compound profanity detection.
336+
$originalFullWord = $this->getFullWordContext($originalNormalized, $start, $length);
337+
if ($this->isPureAlphaSubstring($matchedText, $originalFullWord, $profanity)) {
338+
continue;
339+
}
340+
326341
// Check if the full word (in lowercase) is in the false positives list
327342
if ($this->profanityDetector->isFalsePositive($fullWord)) {
328343
continue; // Skip checking this word if it's a false positive
@@ -501,6 +516,76 @@ private function isSpanningWordBoundary(string $matchedText, string $fullString,
501516
return false;
502517
}
503518

519+
/**
 * Decide whether a profanity match is merely a run of plain letters inside a
 * longer, purely alphabetic word, which marks it as a likely false positive.
 *
 * Typical hits are "spac" inside "space" or "ass" inside "class"; they can be
 * skipped without enumerating every innocent word.
 *
 * The check deliberately answers "no" (false) for:
 *  - obfuscated matches containing any non-letter (e.g. "sp@c", "s-p-a-c");
 *  - standalone words (the surrounding word is not longer than the match);
 *  - matches longer than the base profanity, i.e. repeated-letter obfuscation
 *    such as "ccuunntt" for "cunt";
 *  - conjugated profanity, where the word is exactly the match plus a common
 *    suffix (e.g. "fuckings" = "fucking" + "s");
 *  - compound profanity, where the text before or after the match contains
 *    another listed profanity of at least three characters (e.g. "cuntfuck");
 *  - a match that cannot be located inside the supplied word at all.
 *
 * @param string $matchedText  The text that matched the profanity pattern
 * @param string $fullWord     The full word context surrounding the match
 * @param string $profanityKey The base profanity word from the list
 * @return bool True when the match looks like part of a regular word
 */
private function isPureAlphaSubstring(string $matchedText, string $fullWord, string $profanityKey): bool
{
    // Only applies if the matched text is entirely alphabetic (no obfuscation)
    if (!preg_match('/^[a-zA-Z]+$/', $matchedText)) {
        return false;
    }

    // Only applies if the surrounding word is also entirely alphabetic
    if (!preg_match('/^[a-zA-Z]+$/', $fullWord)) {
        return false;
    }

    $matchLength = strlen($matchedText);

    // Not embedded if the word is no longer than the match (standalone word)
    if (strlen($fullWord) <= $matchLength) {
        return false;
    }

    // A match longer than the profanity key contains repeated characters,
    // which is obfuscation rather than a regular word
    if ($matchLength > strlen($profanityKey)) {
        return false;
    }

    $matchLower = strtolower($matchedText);
    $wordLower = strtolower($fullWord);

    // If the match cannot be found in the word it is, by definition, not a
    // substring of it. Answering "true" here would make the caller skip a
    // match based on unrelated context, so treat it as "not a false positive".
    $pos = strpos($wordLower, $matchLower);
    if ($pos === false) {
        return false;
    }

    // Conjugated profanity: the word is exactly the match plus a common suffix
    $suffixes = ['s', 'es', 'ed', 'er', 'ers', 'est', 'ing', 'ings', 'ly', 'y'];
    if ($pos === 0 && in_array(substr($wordLower, $matchLength), $suffixes, true)) {
        return false;
    }

    // Compound profanity: look for another listed profanity in the text before
    // or after the match. The two sides are inspected separately; gluing them
    // together could invent a profanity that never occurs in the word (e.g.
    // "ass" in "cassock" would leave "c" + "ock" = "cock", if that is listed).
    $surroundings = array_filter(
        [substr($wordLower, 0, $pos), substr($wordLower, $pos + $matchLength)],
        static function (string $part): bool {
            return $part !== '';
        }
    );

    foreach ($this->profanityDetector->getProfanityExpressions() as $profanity => $_) {
        // Numeric-looking array keys are converted to int by PHP
        $profanity = (string) $profanity;
        if (strlen($profanity) < 3) {
            continue;
        }

        foreach ($surroundings as $part) {
            if (stripos($part, $profanity) !== false) {
                return false;
            }
        }
    }

    // The match is embedded in a larger regular word (e.g., "spac" in "space")
    return true;
}
588+
504589
/**
505590
* Get the full word context surrounding the matched profanity.
506591
*

tests/BlaspCheckTest.php

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -239,11 +239,17 @@ public function test_paragraph()
239239

240240
public function test_word_boudary()
241241
{
242+
// Pure alphabetic embedding without obfuscation is treated as a regular word
243+
// to prevent false positives (e.g. "spac" in "space")
242244
$result = $this->blaspService->check('afuckb');
245+
$this->assertFalse($result->hasProfanity);
246+
247+
// Obfuscated variants are still caught
248+
$result = $this->blaspService->check('a f u c k b');
249+
$this->assertTrue($result->hasProfanity);
250+
251+
$result = $this->blaspService->check('af@ckb');
243252
$this->assertTrue($result->hasProfanity);
244-
$this->assertSame(1, $result->profanitiesCount);
245-
$this->assertCount(1, $result->uniqueProfanitiesFound);
246-
$this->assertSame('a****b', $result->cleanString);
247253
}
248254

249255
public function test_pural_profanity()
@@ -371,4 +377,32 @@ public function test_detects_at_ss_obfuscation()
371377
$result = $this->blaspService->check('This has @ss in it');
372378
$this->assertTrue($result->hasProfanity);
373379
}
380+
381+
public function test_no_false_positive_space_words()
{
    // Everyday sentences whose words merely contain the letters "spac";
    // none of them may be reported as profane.
    $cleanSentences = [
        'This product provides ample space for storage.',
        'The spacious design offers great workspace.',
        'Perfect for aerospace applications.',
        'Use the backspace key to delete.',
        'The spacecraft landed safely.',
    ];

    foreach ($cleanSentences as $sentence) {
        $outcome = $this->blaspService->check($sentence);
        $reported = implode(', ', $outcome->uniqueProfanitiesFound);

        $this->assertFalse($outcome->hasProfanity, "\"$sentence\" should not be flagged but got: " . $reported);
    }

    // The bare word itself must still be detected ...
    $standalone = $this->blaspService->check('you spac');
    $this->assertTrue($standalone->hasProfanity);

    // ... and so must an obfuscated spelling of it.
    $obfuscated = $this->blaspService->check('you sp@c');
    $this->assertTrue($obfuscated->hasProfanity);
}
374408
}

0 commit comments

Comments
 (0)