Skip to content

Commit c878093

Browse files
CopilotMrHinsh
andcommitted
Add emoji stripping to StringManipulatorTool for SOAP compatibility
Co-authored-by: MrHinsh <[email protected]>
1 parent d933c2d commit c878093

File tree

3 files changed

+48
-5
lines changed

3 files changed

+48
-5
lines changed

docs/content/docs/reference/tools/stringmanipulatortool/index.md

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,14 +99,14 @@ Each manipulator supports these properties:
9999

100100
### Removing Invalid Characters
101101

102-
Remove control characters that may cause issues while preserving Unicode content:
102+
Remove control characters and emojis while preserving Unicode content:
103103

104104
```json
105105
{
106106
"$type": "RegexStringManipulator",
107-
"Description": "Remove control characters but preserve Unicode letters and symbols",
107+
"Description": "Remove control characters and emojis but preserve Unicode letters and symbols",
108108
"Enabled": true,
109-
"Pattern": "[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F-\\x9F]+",
109+
"Pattern": "[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F-\\x9F]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|\\uFE0F",
110110
"Replacement": ""
111111
}
112112
```
@@ -151,6 +151,25 @@ Remove or clean HTML tags from text fields:
151151
}
152152
```
153153

154+
### Removing Emojis for SOAP Compatibility
155+
156+
Remove emojis that can cause issues with SOAP interfaces while preserving Unicode letters and symbols in the Basic Multilingual Plane (characters that fit in a single UTF-16 code unit):
157+
158+
```json
159+
{
160+
"$type": "RegexStringManipulator",
161+
"Description": "Remove emojis but preserve Unicode letters and symbols",
162+
"Enabled": true,
163+
"Pattern": "[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|\\uFE0F",
164+
"Replacement": ""
165+
}
166+
```
167+
168+
This pattern removes:
169+
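- Any character encoded as a UTF-16 surrogate pair, which covers most emojis (😀🔥💻🇺🇸) but also every other character outside the Basic Multilingual Plane (for example, rare CJK ideographs and mathematical alphanumeric symbols)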
170+
- The emoji variation selector (U+FE0F) that requests emoji presentation of the preceding character
171+
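- Not removed: characters in the Basic Multilingual Plane, such as mathematical symbols (∑), arrows (→), checkmarks (✓), stars (★), and accented letters (café); emoji-style symbols that live in the BMP (for example ❤) are also left in place, with only their trailing variation selector stripped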
172+
154173
### Fixing Encoding Issues
155174

156175
Replace common encoding artifacts:

src/MigrationTools.Tests/ProcessorEnrichers/StringManipulatorEnricherTests.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,30 @@ public void StringManipulatorTool_DefaultManipulator_ExpectedBehavior(string val
221221
Assert.AreEqual(expected, newValue);
222222
}
223223

224+
[DataTestMethod(), TestCategory("L1")]
225+
[DataRow("Hello 😀 World", "Hello World")] // Basic emoticons should be stripped (surrogate pairs)
226+
[DataRow("Test 🔥 Fire", "Test Fire")] // Fire emoji should be stripped (surrogate pairs)
227+
[DataRow("Code 💻 Work", "Code Work")] // Laptop emoji should be stripped (surrogate pairs)
228+
[DataRow("Heart ❤️ Love", "Heart ❤ Love")] // Variation selector stripped, heart symbol preserved
229+
[DataRow("Flag 🇺🇸 Country", "Flag Country")] // Regional indicators stripped (surrogate pairs)
230+
[DataRow("Math ∑ Symbol", "Math ∑ Symbol")] // Mathematical symbols preserved (not surrogate pairs)
231+
[DataRow("Arrow → Direction", "Arrow → Direction")] // Arrows preserved (not surrogate pairs)
232+
[DataRow("Check ✓ Mark", "Check ✓ Mark")] // Useful dingbats preserved (not surrogate pairs)
233+
[DataRow("Star ★ Rating", "Star ★ Rating")] // Miscellaneous symbols preserved (not surrogate pairs)
234+
[DataRow("Café résumé", "Café résumé")] // Regular Unicode letters preserved
235+
[DataRow("Test\u0001\u0002", "Test")] // Control chars should be removed
236+
public void StringManipulatorTool_DefaultManipulator_EmojiStripping(string value, string expected)
237+
{
238+
var options = new StringManipulatorToolOptions();
239+
options.Enabled = true;
240+
options.MaxStringLength = 1000;
241+
// No manipulators set - should use default (which should strip emojis)
242+
var x = GetStringManipulatorTool(options);
243+
244+
string? newValue = x.ProcessString(value);
245+
Assert.AreEqual(expected, newValue);
246+
}
247+
224248
private static StringManipulatorTool GetStringManipulatorTool(StringManipulatorToolOptions options)
225249
{
226250
var services = new ServiceCollection();

src/MigrationTools/Tools/StringManipulatorTool.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ private void AddDefaultManipulator()
7373
Options.Manipulators.Add(new RegexStringManipulator()
7474
{
7575
Enabled = true,
76-
Description = "Default: Removes control characters but preserves Unicode letters and symbols",
77-
Pattern = @"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]+",
76+
Description = "Default: Removes control characters and emojis but preserves Unicode letters and symbols",
77+
Pattern = @"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]|[\uD800-\uDBFF][\uDC00-\uDFFF]|\uFE0F",
7878
Replacement = ""
7979
});
8080
}

0 commit comments

Comments
 (0)