Skip to content

Commit cf7e986

Browse files
deemonic and claude authored
fix: load and merge language-specific substitutions (#35) (#39)
* fix: load and merge language-specific substitutions (#35)

  Language-specific substitutions in config files (e.g., french.php) were being ignored. The ConfigurationLoader now merges language substitutions with the main config. To avoid regex conflicts, only basic a-z letter patterns from language files are merged. Accented character patterns (like /ù/, /é/) are excluded because they can match inside already-substituted character classes, creating malformed regex.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* docs: add language-specific substitutions section to README

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: merge language substitutions instead of replacing

  Addresses CodeRabbit feedback: preserve base substitution variants when merging language-specific substitutions using array_merge/array_unique.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: use main config substitutions for all languages

  Language-specific substitutions in config files contain circular references (e.g., c→k and k→c in French) that cause regex conflicts when merged. The main config already includes comprehensive substitutions for all supported languages including accented characters. Updated README to clarify that substitutions should be customized in the main config/blasp.php file.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: single-pass substitution to handle circular and multi-char patterns (#35)

  Replace sequential preg_replace with a single-pass character walker that processes each position once, preventing circular substitutions (e.g., c→k and k→c) from corrupting regex output. Use alternation instead of character classes when substitution values contain multi-char strings. Re-enable language-specific substitution merging for single-language mode and accent-only merging for multi-language mode.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent f056f5f commit cf7e986

File tree

5 files changed

+169
-5
lines changed

5 files changed

+169
-5
lines changed

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,22 @@ This will publish:
149149
- `config/blasp.php` - Main configuration with default language settings
150150
- `config/languages/` - Language-specific profanity lists (English, Spanish, German, French)
151151

152+
### Character Substitutions
153+
154+
Character substitutions (like `@` for `a`, `0` for `o`) are defined in the main `config/blasp.php` file and apply to all languages. The main config includes comprehensive substitutions for accented characters across all supported languages:
155+
156+
```php
157+
// config/blasp.php
158+
'substitutions' => [
159+
'/a/' => ['a', '4', '@', 'á', 'à', 'â', 'ä', ...],
160+
'/c/' => ['c', 'Ç', 'ç', '¢', ...],
161+
'/e/' => ['e', '3', '€', 'é', 'è', 'ê', ...],
162+
// ... all 26 letters with their variants
163+
],
164+
```
165+
166+
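To customize substitutions, modify the main `config/blasp.php` file after publishing. A language file may also define its own `substitutions` array: when a single language is loaded, its variants are merged into the matching main-config entries; in multi-language mode only single-character accent/diacritic keys (e.g., `/ç/`, `/ß/`, `/ñ/`) are merged, and plain ASCII-letter or multi-character keys are skipped.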
167+
152168
### Custom Configuration
153169

154170
You can specify custom profanity and false positive lists using the `configure()` method:

src/Config/ConfigurationLoader.php

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,23 @@ public function load(?array $customProfanities = null, ?array $customFalsePositi
7979
}
8080

8181
$separators = config('blasp.separators');
82+
8283
$substitutions = config('blasp.substitutions');
84+
try {
85+
$languageData = $this->loadLanguage($targetLanguage);
86+
if (isset($languageData['substitutions']) && is_array($languageData['substitutions'])) {
87+
foreach ($languageData['substitutions'] as $pattern => $values) {
88+
if (is_array($values)) {
89+
$substitutions[$pattern] = array_values(array_unique(array_merge(
90+
$substitutions[$pattern] ?? [],
91+
$values
92+
)));
93+
}
94+
}
95+
}
96+
} catch (\Exception $e) {
97+
// Keep main config substitutions
98+
}
8399

84100
$config = new DetectionConfig(
85101
$profanities,
@@ -107,7 +123,28 @@ public function loadMultiLanguage(array $languageData = [], string $defaultLangu
107123
}
108124

109125
$separators = config('blasp.separators');
126+
110127
$substitutions = config('blasp.substitutions');
128+
foreach ($languageData as $langConfig) {
129+
if (isset($langConfig['substitutions']) && is_array($langConfig['substitutions'])) {
130+
foreach ($langConfig['substitutions'] as $pattern => $values) {
131+
if (is_array($values)) {
132+
// Only merge accent/diacritic substitution keys (e.g., /ç/, /ß/, /ñ/).
133+
// Skip base ASCII letter keys (e.g., /z/, /c/, /j/) and multi-char
134+
// keys (e.g., /ck/, /sch/) as these are language-specific phonetic
135+
// patterns that cause false positives when applied across all languages.
136+
$plainKey = trim($pattern, '/');
137+
if (mb_strlen($plainKey, 'UTF-8') > 1 || preg_match('/^[a-zA-Z]$/', $plainKey)) {
138+
continue;
139+
}
140+
$substitutions[$pattern] = array_values(array_unique(array_merge(
141+
$substitutions[$pattern] ?? [],
142+
$values
143+
)));
144+
}
145+
}
146+
}
147+
}
111148

112149
$config = new MultiLanguageDetectionConfig(
113150
$languageData,

src/Generators/ProfanityExpressionGenerator.php

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,28 @@ public function generateSubstitutionExpressions(array $substitutions): array
8181
$characterExpressions = [];
8282

8383
foreach ($substitutions as $character => $substitutionOptions) {
84-
$characterExpressions[$character] = $this->generateEscapedExpression($substitutionOptions, [], '+') . self::SEPARATOR_PLACEHOLDER;
84+
$hasMultiChar = false;
85+
foreach ($substitutionOptions as $option) {
86+
// Check if option is a genuine multi-char string (not a pre-escaped single char like \$)
87+
if (mb_strlen($option, 'UTF-8') > 1 && !preg_match('/^\\\\.$/u', $option)) {
88+
$hasMultiChar = true;
89+
break;
90+
}
91+
}
92+
93+
if ($hasMultiChar) {
94+
// Use alternation for multi-char options: (?:sch|sh|ch|s)+
95+
$escaped = array_map(function ($opt) {
96+
// Options that are already regex-escaped (like \$) should be kept as-is
97+
if (preg_match('/^\\\\.$/u', $opt)) {
98+
return $opt;
99+
}
100+
return preg_quote($opt, '/');
101+
}, $substitutionOptions);
102+
$characterExpressions[$character] = '(?:' . implode('|', $escaped) . ')+' . self::SEPARATOR_PLACEHOLDER;
103+
} else {
104+
$characterExpressions[$character] = $this->generateEscapedExpression($substitutionOptions, [], '+') . self::SEPARATOR_PLACEHOLDER;
105+
}
85106
}
86107

87108
return $characterExpressions;
@@ -97,13 +118,43 @@ public function generateSubstitutionExpressions(array $substitutions): array
97118
*/
98119
public function generateProfanityExpression(string $profanity, array $substitutionExpressions, string $separatorExpression): string
99120
{
100-
$expression = preg_replace(array_keys($substitutionExpressions), array_values($substitutionExpressions), $profanity);
121+
// Build plain-key lookup: strip regex delimiters from keys
122+
$plainSubstitutions = [];
123+
foreach ($substitutionExpressions as $pattern => $replacement) {
124+
$plainKey = trim($pattern, '/');
125+
$plainSubstitutions[$plainKey] = $replacement;
126+
}
101127

102-
$expression = str_replace(self::SEPARATOR_PLACEHOLDER, $separatorExpression, $expression);
128+
// Sort by key length descending so multi-char keys (ph, qu) match first
129+
uksort($plainSubstitutions, function ($a, $b) {
130+
return mb_strlen($b, 'UTF-8') - mb_strlen($a, 'UTF-8');
131+
});
132+
133+
// Single-pass: walk through profanity, match longest key at each position
134+
$expression = '';
135+
$i = 0;
136+
$len = mb_strlen($profanity, 'UTF-8');
137+
138+
while ($i < $len) {
139+
$matched = false;
140+
foreach ($plainSubstitutions as $key => $replacement) {
141+
$keyLen = mb_strlen($key, 'UTF-8');
142+
if ($i + $keyLen <= $len && mb_substr($profanity, $i, $keyLen, 'UTF-8') === $key) {
143+
$expression .= $replacement;
144+
$i += $keyLen;
145+
$matched = true;
146+
break;
147+
}
148+
}
149+
if (!$matched) {
150+
$expression .= preg_quote(mb_substr($profanity, $i, 1, 'UTF-8'), '/');
151+
$i++;
152+
}
153+
}
103154

104-
// Allow for non-word characters or spaces around the profanity
155+
$expression = str_replace(self::SEPARATOR_PLACEHOLDER, $separatorExpression, $expression);
105156
$expression = '/' . $expression . '/i';
106-
157+
107158
return $expression;
108159
}
109160

tests/ConfigurationLoaderLanguageTest.php

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,4 +227,27 @@ public function test_string_normalizer_for_languages()
227227
$normalizer = $config->getStringNormalizer();
228228
$this->assertInstanceOf(\Blaspsoft\Blasp\Normalizers\FrenchStringNormalizer::class, $normalizer);
229229
}
230+
231+
/**
232+
* Test that language-specific substitutions are merged with main config.
233+
*/
234+
public function test_language_substitutions_are_merged()
235+
{
236+
$config = $this->loader->load(null, null, 'french');
237+
$substitutions = $config->getSubstitutions();
238+
239+
// Main config base patterns should be present
240+
$this->assertArrayHasKey('/a/', $substitutions);
241+
$this->assertArrayHasKey('/z/', $substitutions);
242+
243+
// French-specific patterns should be merged
244+
$this->assertArrayHasKey('/c/', $substitutions);
245+
$this->assertContains('k', $substitutions['/c/']); // French adds k→c mapping
246+
$this->assertContains('ç', $substitutions['/c/']); // Both main + French have ç
247+
248+
// Verify substitution-dependent detection works
249+
$service = new \Blaspsoft\Blasp\BlaspService();
250+
$result = $service->language('french')->check('connard');
251+
$this->assertTrue($result->hasProfanity);
252+
}
230253
}

tests/ProfanityExpressionGeneratorTest.php

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,43 @@ public function test_period_handling_in_separators()
276276
$this->assertEquals(1, preg_match($testPattern, 't-.e.s-t'));
277277
}
278278

279+
public function test_circular_substitutions_produce_valid_regex()
280+
{
281+
$substitutions = [
282+
'/c/' => ['c', 'k', 'ç'],
283+
'/k/' => ['k', 'c', 'q'],
284+
];
285+
$subExpressions = $this->generator->generateSubstitutionExpressions($substitutions);
286+
$separatorExpr = $this->generator->generateSeparatorExpression([]);
287+
$regex = $this->generator->generateProfanityExpression('cock', $subExpressions, $separatorExpr);
288+
289+
// Regex should be valid (no nested brackets)
290+
$this->assertNotFalse(@preg_match($regex, ''));
291+
292+
// Should match the original word
293+
$this->assertMatchesRegularExpression($regex, 'cock');
294+
295+
// Should match with substitutions
296+
$this->assertMatchesRegularExpression($regex, 'kokk');
297+
$this->assertMatchesRegularExpression($regex, 'çoçk');
298+
}
299+
300+
public function test_multi_char_substitutions()
301+
{
302+
$substitutions = [
303+
'/p/' => ['p'],
304+
'/h/' => ['h'],
305+
'/ph/' => ['ph', 'f'],
306+
];
307+
$subExpressions = $this->generator->generateSubstitutionExpressions($substitutions);
308+
$separatorExpr = $this->generator->generateSeparatorExpression([]);
309+
$regex = $this->generator->generateProfanityExpression('phone', $subExpressions, $separatorExpr);
310+
311+
// 'ph' should be consumed as one unit, matching 'f'
312+
$this->assertMatchesRegularExpression($regex, 'phone');
313+
$this->assertMatchesRegularExpression($regex, 'fone');
314+
}
315+
279316
public function test_basic_profanity_matching()
280317
{
281318
$profanities = ['damn', 'hell'];

0 commit comments

Comments
 (0)