From 60c6dc90c8d9754ffd909ed52a247e095fe9b565 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Tue, 17 Jun 2025 11:27:32 +0700 Subject: [PATCH 1/8] write a test for xml serialization --- .../DomainImpl/StringsTests.cs | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs index 55a16db6..47314f03 100644 --- a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs +++ b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs @@ -3,7 +3,9 @@ // (http://www.gnu.org/licenses/lgpl-2.1.html) using System; +using System.IO; using System.Linq; +using System.Text; using NUnit.Framework; using SIL.LCModel.Core.KernelInterfaces; using SIL.LCModel.Core.Text; @@ -283,6 +285,33 @@ public void AppendAlternativesTest() Assert.AreEqual("Saltillo Mexico", Cache.LangProject.FieldWorkLocation.get_String(english.Handle).Text); Assert.AreEqual("Saltillo Mejico", Cache.LangProject.FieldWorkLocation.get_String(spanish.Handle).Text); } + + [Test] + public void ToXml_WorksAsExpected() + { + var english = Cache.LangProject.CurrentAnalysisWritingSystems.First(); + Cache.LangProject.MainCountry.set_String(english.Handle, TsStringUtils.MakeString("Mexico", english.Handle)); + var xml = ToXml(Cache.LangProject.MainCountry); + Assert.AreEqual("Mexico", xml, "XML does not contain expected string element."); + } + + [Test] + public void ToXml_WithControlCharacterWorks() + { + var english = Cache.LangProject.CurrentAnalysisWritingSystems.First(); + var tsString = TsStringUtils.MakeString("te\u0002st", english.Handle); + Cache.LangProject.MainCountry.set_String(english.Handle, tsString); + var xml = ToXml(Cache.LangProject.MainCountry); + Assert.AreEqual("test", xml, "XML does not contain expected string element."); + } + + private string ToXml(ITsMultiString multiString) + { + using var ms = new MemoryStream(); + using var xmlWriter = XmlServices.CreateWriter(ms); + 
((MultiAccessor)multiString).ToXMLString(xmlWriter); + return Encoding.UTF8.GetString(ms.ToArray()); + } } /// ---------------------------------------------------------------------------------------- From 54e3b8087dd6e5d8c640a1dabe413039fa5a13ca Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Tue, 17 Jun 2025 13:04:20 +0700 Subject: [PATCH 2/8] use regex to strip bad chars from strings before they get into xml --- src/SIL.LCModel.Core/Text/TsStringSerializer.cs | 11 ++++++++++- .../Text/TsStringSerializerTests.cs | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs index 56f0eac7..9171166f 100644 --- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs +++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs @@ -11,6 +11,7 @@ using System; using System.Linq; using System.Text; +using System.Text.RegularExpressions; using System.Xml; using System.Xml.Linq; using System.Xml.Schema; @@ -173,7 +174,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto if (runText != string.Empty && runText.All(char.IsWhiteSpace)) writer.WriteAttributeString("xml", "space", "", "preserve"); // TODO: should we escape quotation marks? 
this is not necessary but different than the behavior of the C++ implementation - writer.WriteString(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC)); + writer.WriteString(StripInvalidXmlChars(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC))); } writer.WriteEndElement(); @@ -187,6 +188,14 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto return xml.ToString(); } + private static readonly Regex InvalidXmlRegex = new Regex(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", RegexOptions.Compiled); + public static string StripInvalidXmlChars(string text) + { + // Remove characters not allowed in XML: + // Valid are: #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD. + return InvalidXmlRegex.Replace(text, string.Empty); + } + #endregion #region Serialization Helper Methods diff --git a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs index fadb7179..0a5597cb 100644 --- a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs +++ b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs @@ -42,6 +42,14 @@ public void SerializeTsStringToXml_Simple() Assert.That(StripNewLines(xml), Is.EqualTo("This is a test!")); } + [Test] + public void SerializeTsStringToXml_StripsInvalidControlCharacter() + { + ITsString tss = TsStringUtils.MakeString("This is a te\u0002st!", EnWS); + string xml = TsStringSerializer.SerializeTsStringToXml(tss, WritingSystemManager); + Assert.That(StripNewLines(xml), Is.EqualTo("This is a test!")); + } + ///-------------------------------------------------------------------------------------- /// /// Tests the method SerializeTsStringToXml with a MultiString. 
This should From 099990e8b136d8e2f0a7c42016738f0497695972 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Tue, 17 Jun 2025 13:11:22 +0700 Subject: [PATCH 3/8] flush xml writer to ensure the memory stream isn't empty --- tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs index 47314f03..ee6360b8 100644 --- a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs +++ b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs @@ -310,6 +310,7 @@ private string ToXml(ITsMultiString multiString) using var ms = new MemoryStream(); using var xmlWriter = XmlServices.CreateWriter(ms); ((MultiAccessor)multiString).ToXMLString(xmlWriter); + xmlWriter.Flush(); return Encoding.UTF8.GetString(ms.ToArray()); } } From 3f891a154f1e4b2c2a81a3ed046976fe42d7d691 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Tue, 17 Jun 2025 13:12:48 +0700 Subject: [PATCH 4/8] use strip invalid xml chars for Unicode strings --- src/SIL.LCModel/DomainImpl/Strings.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SIL.LCModel/DomainImpl/Strings.cs b/src/SIL.LCModel/DomainImpl/Strings.cs index 2220b9c0..c554f5e9 100644 --- a/src/SIL.LCModel/DomainImpl/Strings.cs +++ b/src/SIL.LCModel/DomainImpl/Strings.cs @@ -900,7 +900,7 @@ protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int writer.WriteStartElement("AUni"); writer.WriteAttributeString("ws", m_object.Services.WritingSystemManager.Get(ws).Id); - text = Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC); + text = TsStringSerializer.StripInvalidXmlChars(Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC)); writer.WriteString(text); writer.WriteEndElement(); } From 9334b537968f5ae0cadb6caba31878a3eb6b5117 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Mon, 23 Jun 2025 13:08:09 +0700 Subject: [PATCH 5/8] add a link which documents 
the valid xml chars --- src/SIL.LCModel.Core/Text/TsStringSerializer.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs index 9171166f..f5d4a73f 100644 --- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs +++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs @@ -193,6 +193,7 @@ public static string StripInvalidXmlChars(string text) { // Remove characters not allowed in XML: // Valid are: #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD. + // Documented here: https://en.wikipedia.org/wiki/Valid_characters_in_XML return InvalidXmlRegex.Replace(text, string.Empty); } From 07aa6cec86aac89c05c1d5ee7d5ca21dac1987a1 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Wed, 6 Aug 2025 16:32:06 +0700 Subject: [PATCH 6/8] =?UTF-8?q?include=20characters=20in=20the=20range=20U?= =?UTF-8?q?+10000=E2=80=93U+10FFFF,=20and=20add=20a=20test=20ensuring=20th?= =?UTF-8?q?ey=20aren't=20stripped?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/SIL.LCModel.Core/Text/TsStringSerializer.cs | 4 ++-- .../Text/TsStringSerializerTests.cs | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs index f5d4a73f..23e33cf5 100644 --- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs +++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs @@ -188,11 +188,11 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto return xml.ToString(); } - private static readonly Regex InvalidXmlRegex = new Regex(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD]", RegexOptions.Compiled); + private static readonly Regex InvalidXmlRegex = new Regex(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]", RegexOptions.Compiled); public static string StripInvalidXmlChars(string text) { // Remove characters not allowed in XML: - // Valid are: 
#x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD. + // Valid are: #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD, #x10000-#x10FFFF. // Documented here: https://en.wikipedia.org/wiki/Valid_characters_in_XML return InvalidXmlRegex.Replace(text, string.Empty); } diff --git a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs index 0a5597cb..8fd7a75d 100644 --- a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs +++ b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs @@ -50,6 +50,19 @@ public void SerializeTsStringToXml_StripsInvalidControlCharacter() Assert.That(StripNewLines(xml), Is.EqualTo("This is a test!")); } + [Test] + [TestCase("This is a test!")] + [TestCase(" 𐰉 (dǒng)")]//Nushu script + [TestCase("𠔤野 (Nishino)")]//Japanese Kanji + [TestCase("𠮷野家 (Yóu yě jiā)")]//Historic Chinese + public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word) + { + ITsString tss = TsStringUtils.MakeString(word, EnWS); + string xml = TsStringSerializer.SerializeTsStringToXml(tss, WritingSystemManager); + Assert.That(StripNewLines(xml), + Is.EqualTo($"{word}")); + } + ///-------------------------------------------------------------------------------------- /// /// Tests the method SerializeTsStringToXml with a MultiString. 
This should From f44b8d487d56a1c8619e9292c3663be515bb7624 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Wed, 6 Aug 2025 16:49:56 +0700 Subject: [PATCH 7/8] use the correct matcher for the surrogate pair code range --- src/SIL.LCModel.Core/Text/TsStringSerializer.cs | 2 +- tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs index 23e33cf5..083c8347 100644 --- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs +++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs @@ -188,7 +188,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto return xml.ToString(); } - private static readonly Regex InvalidXmlRegex = new Regex(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]", RegexOptions.Compiled); + private static readonly Regex InvalidXmlRegex = new Regex(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\p{Cs}]", RegexOptions.Compiled); public static string StripInvalidXmlChars(string text) { // Remove characters not allowed in XML: diff --git a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs index 8fd7a75d..adf2acbb 100644 --- a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs +++ b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs @@ -55,6 +55,7 @@ public void SerializeTsStringToXml_StripsInvalidControlCharacter() [TestCase(" 𐰉 (dǒng)")]//Nushu script [TestCase("𠔤野 (Nishino)")]//Japanese Kanji [TestCase("𠮷野家 (Yóu yě jiā)")]//Historic Chinese + [TestCase("🦊")]//emoji public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word) { ITsString tss = TsStringUtils.MakeString(word, EnWS); From cbd23aab243fbe708de046f89746892cca842fb8 Mon Sep 17 00:00:00 2001 From: Kevin Hahn Date: Thu, 7 Aug 2025 13:52:49 +0700 Subject: [PATCH 8/8] switched regex to match on invalid 
chars --- src/SIL.LCModel.Core/Text/TsStringSerializer.cs | 5 ++--- tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs index 083c8347..173b6a7a 100644 --- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs +++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs @@ -188,11 +188,10 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto return xml.ToString(); } - private static readonly Regex InvalidXmlRegex = new Regex(@"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\p{Cs}]", RegexOptions.Compiled); + private static readonly Regex InvalidXmlRegex = new Regex(@"[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]", RegexOptions.Compiled); public static string StripInvalidXmlChars(string text) { - // Remove characters not allowed in XML: - // Valid are: #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD, #x10000-#x10FFFF. + // Remove characters not allowed in XML // Documented here: https://en.wikipedia.org/wiki/Valid_characters_in_XML return InvalidXmlRegex.Replace(text, string.Empty); } diff --git a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs index adf2acbb..374a43ab 100644 --- a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs +++ b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs @@ -56,6 +56,7 @@ public void SerializeTsStringToXml_StripsInvalidControlCharacter() [TestCase("𠔤野 (Nishino)")]//Japanese Kanji [TestCase("𠮷野家 (Yóu yě jiā)")]//Historic Chinese [TestCase("🦊")]//emoji + [TestCase("\u200B\u200D\u200E\uDA00\uDC01")] public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word) { ITsString tss = TsStringUtils.MakeString(word, EnWS);