Skip to content

Commit 7b63718

Browse files
Strip invalid char data from strings on save (#329)
* Write a test for XML serialization
* Use a regex to strip bad chars from strings before they get into XML
* Add a link which documents the valid XML chars
* Include characters in the range U+10000–U+10FFFF, and add a test ensuring they aren't stripped
---------
Co-authored-by: Jason Naylor <[email protected]>
1 parent 3a1ce4f commit 7b63718

File tree

4 files changed

+64
-2
lines changed

4 files changed

+64
-2
lines changed

src/SIL.LCModel.Core/Text/TsStringSerializer.cs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
using System;
1212
using System.Linq;
1313
using System.Text;
14+
using System.Text.RegularExpressions;
1415
using System.Xml;
1516
using System.Xml.Linq;
1617
using System.Xml.Schema;
@@ -173,7 +174,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
173174
if (runText != string.Empty && runText.All(char.IsWhiteSpace))
174175
writer.WriteAttributeString("xml", "space", "", "preserve");
175176
// TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
176-
writer.WriteString(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC));
177+
writer.WriteString(StripInvalidXmlChars(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC)));
177178
}
178179

179180
writer.WriteEndElement();
@@ -187,6 +188,14 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
187188
return xml.ToString();
188189
}
189190

191+
/// <summary>
/// Matches UTF-16 code units that are not permitted in XML 1.0 character data:
/// the C0 control characters other than TAB, LF and CR, the non-characters
/// U+FFFE and U+FFFF, and surrogate code units that are not part of a
/// well-formed high/low pair. Well-formed pairs (U+10000–U+10FFFF) do not match.
/// </summary>
private static readonly Regex InvalidXmlRegex = new Regex(
    @"[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]" +   // forbidden controls and non-characters
    @"|[\uD800-\uDBFF](?![\uDC00-\uDFFF])" +        // high surrogate with no low surrogate after it
    @"|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]",        // low surrogate with no high surrogate before it
    RegexOptions.Compiled);

/// <summary>
/// Removes every character that is not allowed in XML 1.0 content so the result
/// can be handed to an <see cref="XmlWriter"/> without it rejecting the text.
/// </summary>
/// <param name="text">The text to clean; may be null or empty.</param>
/// <returns>
/// <paramref name="text"/> with the invalid characters removed. A null or empty
/// argument is returned unchanged.
/// </returns>
/// <remarks>
/// The valid character set is the Char production of the XML 1.0 specification
/// (https://www.w3.org/TR/xml/#charsets); see also
/// https://en.wikipedia.org/wiki/Valid_characters_in_XML
/// </remarks>
public static string StripInvalidXmlChars(string text)
{
    // Nothing to strip; this also avoids the ArgumentNullException Regex.Replace raises for null.
    if (string.IsNullOrEmpty(text))
        return text;

    return InvalidXmlRegex.Replace(text, string.Empty);
}
198+
190199
#endregion
191200

192201
#region Serialization Helper Methods

src/SIL.LCModel/DomainImpl/Strings.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -900,7 +900,7 @@ protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int
900900

901901
writer.WriteStartElement("AUni");
902902
writer.WriteAttributeString("ws", m_object.Services.WritingSystemManager.Get(ws).Id);
903-
text = Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC);
903+
text = TsStringSerializer.StripInvalidXmlChars(Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC));
904904
writer.WriteString(text);
905905
writer.WriteEndElement();
906906
}

tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,29 @@ public void SerializeTsStringToXml_Simple()
4242
Assert.That(StripNewLines(xml), Is.EqualTo("<Str><Run ws=\"en\">This is a test!</Run></Str>"));
4343
}
4444

45+
/// <summary>
/// A control character (U+0002) embedded in the run text must be absent from
/// the serialized XML while the surrounding text is kept.
/// </summary>
[Test]
public void SerializeTsStringToXml_StripsInvalidControlCharacter()
{
    const string expectedXml = "<Str><Run ws=\"en\">This is a test!</Run></Str>";
    ITsString input = TsStringUtils.MakeString("This is a te\u0002st!", EnWS);

    string actualXml = StripNewLines(TsStringSerializer.SerializeTsStringToXml(input, WritingSystemManager));

    Assert.That(actualXml, Is.EqualTo(expectedXml));
}
52+
53+
/// <summary>
/// Text made only of characters that are legal in XML, including characters
/// encoded as UTF-16 surrogate pairs, must come through serialization unchanged.
/// </summary>
[Test]
[TestCase("This is a test!")]
[TestCase(" 𐰉 (dǒng)")] // labelled "Nushu script" by the author; NOTE(review): the code point looks like Old Turkic - confirm
[TestCase("𠔤野 (Nishino)")] // Japanese Kanji
[TestCase("𠮷野家 (Yóu yě jiā)")] // Historic Chinese
[TestCase("🦊")] // emoji
[TestCase("\u200B\u200D\u200E\uDA00\uDC01")] // three BMP escapes followed by a well-formed surrogate pair
public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word)
{
    string expectedXml = $"<Str><Run ws=\"en\">{word}</Run></Str>";
    ITsString input = TsStringUtils.MakeString(word, EnWS);

    string actualXml = StripNewLines(TsStringSerializer.SerializeTsStringToXml(input, WritingSystemManager));

    Assert.That(actualXml, Is.EqualTo(expectedXml));
}
67+
4568
///--------------------------------------------------------------------------------------
4669
/// <summary>
4770
/// Tests the method SerializeTsStringToXml with a MultiString. This should

tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
// (http://www.gnu.org/licenses/lgpl-2.1.html)
44

55
using System;
6+
using System.IO;
67
using System.Linq;
8+
using System.Text;
79
using NUnit.Framework;
810
using SIL.LCModel.Core.KernelInterfaces;
911
using SIL.LCModel.Core.Text;
@@ -283,6 +285,34 @@ public void AppendAlternativesTest()
283285
Assert.AreEqual("Saltillo Mexico", Cache.LangProject.FieldWorkLocation.get_String(english.Handle).Text);
284286
Assert.AreEqual("Saltillo Mejico", Cache.LangProject.FieldWorkLocation.get_String(spanish.Handle).Text);
285287
}
288+
289+
/// <summary>
/// A plain value stored in the MainCountry multi-string serializes to a single
/// AUni element containing that value unchanged.
/// </summary>
[Test]
public void ToXml_WorksAsExpected()
{
    var analysisWs = Cache.LangProject.CurrentAnalysisWritingSystems.First();
    var country = TsStringUtils.MakeString("Mexico", analysisWs.Handle);
    Cache.LangProject.MainCountry.set_String(analysisWs.Handle, country);

    var serialized = ToXml(Cache.LangProject.MainCountry);

    Assert.AreEqual("<AUni ws=\"en\">Mexico</AUni>", serialized, "XML does not contain expected string element.");
}
297+
298+
/// <summary>
/// A control character (U+0002) stored in the MainCountry multi-string must be
/// dropped from the serialized AUni element.
/// </summary>
[Test]
public void ToXml_WithControlCharacterWorks()
{
    var analysisWs = Cache.LangProject.CurrentAnalysisWritingSystems.First();
    var withControlChar = TsStringUtils.MakeString("te\u0002st", analysisWs.Handle);
    Cache.LangProject.MainCountry.set_String(analysisWs.Handle, withControlChar);

    var serialized = ToXml(Cache.LangProject.MainCountry);

    Assert.AreEqual("<AUni ws=\"en\">test</AUni>", serialized, "XML does not contain expected string element.");
}
307+
308+
/// <summary>
/// Writes the given multi-string through an XML writer into an in-memory
/// stream and returns the bytes written, decoded as UTF-8.
/// </summary>
/// <param name="multiString">The multi-string to serialize; it is cast to MultiAccessor.</param>
/// <returns>The serialized XML text.</returns>
private string ToXml(ITsMultiString multiString)
{
    using (var buffer = new MemoryStream())
    using (var writer = XmlServices.CreateWriter(buffer))
    {
        var accessor = (MultiAccessor)multiString;
        accessor.ToXMLString(writer);

        // Flush so everything the writer has buffered reaches the stream before it is read.
        writer.Flush();

        byte[] utf8Bytes = buffer.ToArray();
        return Encoding.UTF8.GetString(utf8Bytes);
    }
}
286316
}
287317

288318
/// ----------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)