Skip to content

Commit 6464ef6

Browse files
Copilot and MrHinsh committed
Fix StringManipulatorTool to support Unicode characters
Co-authored-by: MrHinsh <[email protected]>
1 parent d8dadad commit 6464ef6

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed

src/MigrationTools.Tests/ProcessorEnrichers/StringManipulatorEnricherTests.cs

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,58 @@ public void StringManipulatorTool_MultipleManipulators(string? value, string? ex
169169
Assert.AreEqual(expected, newValue);
170170
}
171171

172+
/// <summary>
/// Data-driven check of <c>ProcessString</c> when the options carry no explicitly
/// configured manipulators. Each row pairs an input with the value the test expects back:
/// ASCII, accented Latin, Cyrillic and Chinese text, newlines and tabs are expected
/// unchanged, while the \u0001 and \u0002 characters are expected to be removed.
/// </summary>
/// <param name="value">Input string handed to <c>ProcessString</c>.</param>
/// <param name="expected">Value the processed string is asserted to equal.</param>
[DataTestMethod(), TestCategory("L1")]
173+
[DataRow("Hello", "Hello")]
174+
[DataRow("Héllo", "Héllo")] // New behavior: accented chars preserved
175+
[DataRow("Привет", "Привет")] // New behavior: Cyrillic chars preserved
176+
[DataRow("你好", "你好")] // New behavior: Chinese chars preserved
177+
[DataRow("Café résumé", "Café résumé")] // New behavior: accented chars preserved
178+
[DataRow("Test\u0001\u0002", "Test")] // Control chars should be removed
179+
[DataRow("Line1\nLine2", "Line1\nLine2")] // Newlines should be preserved
180+
[DataRow("Tab\tSeparated", "Tab\tSeparated")] // Tabs should be preserved
181+
public void StringManipulatorTool_DefaultManipulator_UnicodeSupport(string value, string expected)
182+
{
183+
var options = new StringManipulatorToolOptions();
184+
options.Enabled = true;
185+
// Limit is far above every test input length, so it should not affect these rows.
options.MaxStringLength = 1000;
186+
// No manipulators set - should use default
// NOTE(review): presumably the tool falls back to the manipulator registered by
// AddDefaultManipulator in this case - confirm against StringManipulatorTool.
187+
var x = GetStringManipulatorTool(options);
188+
189+
string? newValue = x.ProcessString(value);
190+
Assert.AreEqual(expected, newValue);
191+
}
192+
193+
/// <summary>
/// Data-driven check of <c>ProcessString</c> with a single, explicitly configured
/// <c>RegexStringManipulator</c> whose pattern removes a fixed set of control-character
/// ranges. Each row pairs an input with the value the test expects back: letters from
/// several scripts, newlines and tabs are expected unchanged, while the \u0001 and
/// \u0002 characters are expected to be removed.
/// </summary>
/// <param name="value">Input string handed to <c>ProcessString</c>.</param>
/// <param name="expected">Value the processed string is asserted to equal.</param>
[DataTestMethod(), TestCategory("L1")]
194+
[DataRow("Hello", "Hello")]
195+
[DataRow("Héllo", "Héllo")] // Expected behavior: accented chars preserved
196+
[DataRow("Привет", "Привет")] // Expected behavior: Cyrillic chars preserved
197+
[DataRow("你好", "你好")] // Expected behavior: Chinese chars preserved
198+
[DataRow("Café résumé", "Café résumé")] // Expected behavior: accented chars preserved
199+
[DataRow("Test\u0001\u0002", "Test")] // Control chars should still be removed
200+
[DataRow("Line1\nLine2", "Line1\nLine2")] // Newlines should be preserved
201+
[DataRow("Tab\tSeparated", "Tab\tSeparated")] // Tabs should be preserved
202+
public void StringManipulatorTool_DefaultManipulator_ExpectedBehavior(string value, string expected)
203+
{
204+
var options = new StringManipulatorToolOptions();
205+
options.Enabled = true;
206+
// Limit is far above every test input length, so it should not affect these rows.
options.MaxStringLength = 1000;
207+
// Use improved Unicode-supporting pattern
208+
options.Manipulators = new List<RegexStringManipulator>
209+
{
210+
new RegexStringManipulator
211+
{
212+
Enabled = true,
213+
Description = "Default: Removes control characters but preserves Unicode letters and symbols",
214+
// The character class covers U+0000-U+0008, U+000B, U+000C, U+000E-U+001F and
// U+007F-U+009F; tab (U+0009), LF (U+000A) and CR (U+000D) fall outside every range.
Pattern = @"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]+",
215+
// Matches are replaced with the empty string, i.e. deleted.
Replacement = ""
216+
}
217+
};
218+
var x = GetStringManipulatorTool(options);
219+
220+
string? newValue = x.ProcessString(value);
221+
Assert.AreEqual(expected, newValue);
222+
}
223+
172224
private static StringManipulatorTool GetStringManipulatorTool(StringManipulatorToolOptions options)
173225
{
174226
var services = new ServiceCollection();

src/MigrationTools/Tools/StringManipulatorTool.cs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,8 +73,8 @@ private void AddDefaultManipulator()
7373
Options.Manipulators.Add(new RegexStringManipulator()
7474
{
7575
Enabled = true,
76-
Description = "Default: Removes invalid chars!",
77-
Pattern = "[^( -~)\n\r\t]+",
76+
Description = "Default: Removes control characters but preserves Unicode letters and symbols",
77+
Pattern = @"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]+",
7878
Replacement = ""
7979
});
8080
}

0 commit comments

Comments
 (0)