fix: load and merge language-specific substitutions (#35) (#39)

deemonic · claude · web-flow · commit cf7e986088f4 · 2026-01-27T10:19:33.000Z
* fix: load and merge language-specific substitutions (#35)

  Language-specific substitutions in config files (e.g., french.php) were being ignored. The ConfigurationLoader now merges language substitutions with the main config.

  To avoid regex conflicts, only basic a-z letter patterns from language files are merged. Accented character patterns (like /ù/, /é/) are excluded because they can match inside already-substituted character classes, creating malformed regex.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* docs: add language-specific substitutions section to README

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: merge language substitutions instead of replacing

  Addresses CodeRabbit feedback: preserve base substitution variants when merging language-specific substitutions using array_merge/array_unique.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: use main config substitutions for all languages

  Language-specific substitutions in config files contain circular references (e.g., c→k and k→c in French) that cause regex conflicts when merged. The main config already includes comprehensive substitutions for all supported languages including accented characters.

  Updated README to clarify that substitutions should be customized in the main config/blasp.php file.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: single-pass substitution to handle circular and multi-char patterns (#35)

  Replace sequential preg_replace with a single-pass character walker that processes each position once, preventing circular substitutions (e.g., c→k and k→c) from corrupting regex output. Use alternation instead of character classes when substitution values contain multi-char strings. Re-enable language-specific substitution merging for single-language mode and accent-only merging for multi-language mode.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -149,6 +149,22 @@ This will publish:
 - `config/blasp.php` - Main configuration with default language settings
 - `config/languages/` - Language-specific profanity lists (English, Spanish, German, French)
 
+### Character Substitutions
+
+Character substitutions (like `@` for `a`, `0` for `o`) are defined in the main `config/blasp.php` file and apply to all languages. The main config includes comprehensive substitutions for accented characters across all supported languages:
+
+```php
+// config/blasp.php
+'substitutions' => [
+    '/a/' => ['a', '4', '@', 'á', 'à', 'â', 'ä', ...],
+    '/c/' => ['c', 'Ç', 'ç', '¢', ...],
+    '/e/' => ['e', '3', '€', 'é', 'è', 'ê', ...],
+    // ... all 26 letters with their variants
+],
+```
+
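+To customize substitutions, modify the main `config/blasp.php` file after publishing. A language file in `config/languages/` may also define its own `substitutions` array; its variants are merged into the main list when that language is loaded (in multi-language mode only accent/diacritic keys such as `/ç/` are merged).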
+
 ### Custom Configuration
 
 You can specify custom profanity and false positive lists using the `configure()` method:
diff --git a/src/Config/ConfigurationLoader.php b/src/Config/ConfigurationLoader.php
@@ -79,7 +79,23 @@ public function load(?array $customProfanities = null, ?array $customFalsePositi
         }
 
         $separators = config('blasp.separators');
+
         $substitutions = config('blasp.substitutions');
+        try {
+            $languageData = $this->loadLanguage($targetLanguage);
+            if (isset($languageData['substitutions']) && is_array($languageData['substitutions'])) {
+                foreach ($languageData['substitutions'] as $pattern => $values) {
+                    if (is_array($values)) {
+                        $substitutions[$pattern] = array_values(array_unique(array_merge(
+                            $substitutions[$pattern] ?? [],
+                            $values
+                        )));
+                    }
+                }
+            }
+        } catch (\Exception $e) {
+            // Keep main config substitutions
+        }
 
         $config = new DetectionConfig(
             $profanities,
@@ -107,7 +123,28 @@ public function loadMultiLanguage(array $languageData = [], string $defaultLangu
         }
 
         $separators = config('blasp.separators');
+
         $substitutions = config('blasp.substitutions');
+        foreach ($languageData as $langConfig) {
+            if (isset($langConfig['substitutions']) && is_array($langConfig['substitutions'])) {
+                foreach ($langConfig['substitutions'] as $pattern => $values) {
+                    if (is_array($values)) {
+                        // Only merge single-character keys that are not ASCII letters, i.e. in practice
+                        // accent/diacritic keys (e.g., /ç/, /ß/, /ñ/). Skip base ASCII letter keys (e.g.,
+                        // /z/, /c/, /j/) and multi-char keys (e.g., /ck/, /sch/): these language-specific
+                        // phonetic patterns cause false positives when applied across all languages.
+                        $plainKey = trim($pattern, '/');
+                        if (mb_strlen($plainKey, 'UTF-8') > 1 || preg_match('/^[a-zA-Z]$/', $plainKey)) {
+                            continue;
+                        }
+                        $substitutions[$pattern] = array_values(array_unique(array_merge(
+                            $substitutions[$pattern] ?? [],
+                            $values
+                        )));
+                    }
+                }
+            }
+        }
 
         $config = new MultiLanguageDetectionConfig(
             $languageData,
diff --git a/src/Generators/ProfanityExpressionGenerator.php b/src/Generators/ProfanityExpressionGenerator.php
@@ -81,7 +81,28 @@ public function generateSubstitutionExpressions(array $substitutions): array
         $characterExpressions = [];
 
         foreach ($substitutions as $character => $substitutionOptions) {
-            $characterExpressions[$character] = $this->generateEscapedExpression($substitutionOptions, [], '+') . self::SEPARATOR_PLACEHOLDER;
+            $hasMultiChar = false;
+            foreach ($substitutionOptions as $option) {
+                // Check if option is a genuine multi-char string (not a pre-escaped single char like \$)
+                if (mb_strlen($option, 'UTF-8') > 1 && !preg_match('/^\\\\.$/u', $option)) {
+                    $hasMultiChar = true;
+                    break;
+                }
+            }
+
+            if ($hasMultiChar) {
+                // Use alternation for multi-char options: (?:sch|sh|ch|s)+
+                $escaped = array_map(function ($opt) {
+                    // Options that are already regex-escaped (like \$) should be kept as-is
+                    if (preg_match('/^\\\\.$/u', $opt)) {
+                        return $opt;
+                    }
+                    return preg_quote($opt, '/');
+                }, $substitutionOptions);
+                $characterExpressions[$character] = '(?:' . implode('|', $escaped) . ')+' . self::SEPARATOR_PLACEHOLDER;
+            } else {
+                $characterExpressions[$character] = $this->generateEscapedExpression($substitutionOptions, [], '+') . self::SEPARATOR_PLACEHOLDER;
+            }
         }
 
         return $characterExpressions;
@@ -97,13 +118,43 @@ public function generateSubstitutionExpressions(array $substitutions): array
      */
     public function generateProfanityExpression(string $profanity, array $substitutionExpressions, string $separatorExpression): string
     {
-        $expression = preg_replace(array_keys($substitutionExpressions), array_values($substitutionExpressions), $profanity);
+        // Build plain-key lookup: strip regex delimiters from keys
+        $plainSubstitutions = [];
+        foreach ($substitutionExpressions as $pattern => $replacement) {
+            $plainKey = trim($pattern, '/');
+            $plainSubstitutions[$plainKey] = $replacement;
+        }
 
-        $expression = str_replace(self::SEPARATOR_PLACEHOLDER, $separatorExpression, $expression);
+        // Sort by key length descending so multi-char keys (ph, qu) match first
+        uksort($plainSubstitutions, function ($a, $b) {
+            return mb_strlen($b, 'UTF-8') - mb_strlen($a, 'UTF-8');
+        });
+
+        // Single-pass: walk through profanity, match longest key at each position
+        $expression = '';
+        $i = 0;
+        $len = mb_strlen($profanity, 'UTF-8');
+
+        while ($i < $len) {
+            $matched = false;
+            foreach ($plainSubstitutions as $key => $replacement) {
+                $keyLen = mb_strlen($key, 'UTF-8');
+                if ($i + $keyLen <= $len && mb_substr($profanity, $i, $keyLen, 'UTF-8') === $key) {
+                    $expression .= $replacement;
+                    $i += $keyLen;
+                    $matched = true;
+                    break;
+                }
+            }
+            if (!$matched) {
+                $expression .= preg_quote(mb_substr($profanity, $i, 1, 'UTF-8'), '/');
+                $i++;
+            }
+        }
 
-        // Allow for non-word characters or spaces around the profanity
+        $expression = str_replace(self::SEPARATOR_PLACEHOLDER, $separatorExpression, $expression);
         $expression = '/' . $expression . '/i';
-        
+
         return $expression;
     }
 
diff --git a/tests/ConfigurationLoaderLanguageTest.php b/tests/ConfigurationLoaderLanguageTest.php
@@ -227,4 +227,27 @@ public function test_string_normalizer_for_languages()
         $normalizer = $config->getStringNormalizer();
         $this->assertInstanceOf(\Blaspsoft\Blasp\Normalizers\FrenchStringNormalizer::class, $normalizer);
     }
+
+    /**
+     * Test that language-specific substitutions are merged with main config.
+     */
+    public function test_language_substitutions_are_merged()
+    {
+        $config = $this->loader->load(null, null, 'french');
+        $substitutions = $config->getSubstitutions();
+
+        // Main config base patterns should be present
+        $this->assertArrayHasKey('/a/', $substitutions);
+        $this->assertArrayHasKey('/z/', $substitutions);
+
+        // French-specific patterns should be merged
+        $this->assertArrayHasKey('/c/', $substitutions);
+        $this->assertContains('k', $substitutions['/c/']);  // French adds k→c mapping
+        $this->assertContains('ç', $substitutions['/c/']);  // Both main + French have ç
+
+        // Verify substitution-dependent detection works
+        $service = new \Blaspsoft\Blasp\BlaspService();
+        $result = $service->language('french')->check('connard');
+        $this->assertTrue($result->hasProfanity);
+    }
 }
diff --git a/tests/ProfanityExpressionGeneratorTest.php b/tests/ProfanityExpressionGeneratorTest.php
@@ -276,6 +276,43 @@ public function test_period_handling_in_separators()
         $this->assertEquals(1, preg_match($testPattern, 't-.e.s-t'));
     }
 
+    public function test_circular_substitutions_produce_valid_regex()
+    {
+        $substitutions = [
+            '/c/' => ['c', 'k', 'ç'],
+            '/k/' => ['k', 'c', 'q'],
+        ];
+        $subExpressions = $this->generator->generateSubstitutionExpressions($substitutions);
+        $separatorExpr = $this->generator->generateSeparatorExpression([]);
+        $regex = $this->generator->generateProfanityExpression('cock', $subExpressions, $separatorExpr);
+
+        // Regex should be valid (no nested brackets)
+        $this->assertNotFalse(@preg_match($regex, ''));
+
+        // Should match the original word
+        $this->assertMatchesRegularExpression($regex, 'cock');
+
+        // Should match with substitutions
+        $this->assertMatchesRegularExpression($regex, 'kokk');
+        $this->assertMatchesRegularExpression($regex, 'çoçk');
+    }
+
+    public function test_multi_char_substitutions()
+    {
+        $substitutions = [
+            '/p/' => ['p'],
+            '/h/' => ['h'],
+            '/ph/' => ['ph', 'f'],
+        ];
+        $subExpressions = $this->generator->generateSubstitutionExpressions($substitutions);
+        $separatorExpr = $this->generator->generateSeparatorExpression([]);
+        $regex = $this->generator->generateProfanityExpression('phone', $subExpressions, $separatorExpr);
+
+        // 'ph' should be consumed as one unit, matching 'f'
+        $this->assertMatchesRegularExpression($regex, 'phone');
+        $this->assertMatchesRegularExpression($regex, 'fone');
+    }
+
     public function test_basic_profanity_matching()
     {
         $profanities = ['damn', 'hell'];