Skip to content
11 changes: 10 additions & 1 deletion src/SIL.LCModel.Core/Text/TsStringSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
using System;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
using System.Xml.Schema;
Expand Down Expand Up @@ -173,7 +174,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
if (runText != string.Empty && runText.All(char.IsWhiteSpace))
writer.WriteAttributeString("xml", "space", "", "preserve");
// TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
writer.WriteString(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC));
writer.WriteString(StripInvalidXmlChars(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC)));
}

writer.WriteEndElement();
Expand All @@ -187,6 +188,14 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
return xml.ToString();
}

/// <summary>
/// Matches every UTF-16 code unit that is not part of a legal XML 1.0 character:
/// the C0 control characters other than tab, line feed and carriage return, the
/// non-characters U+FFFE and U+FFFF, and any surrogate that is not half of a
/// well-formed high/low pair.
/// </summary>
private static readonly Regex InvalidXmlRegex = new Regex(
	// Forbidden control characters and non-characters.
	@"[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]" +
	// High surrogate that is not immediately followed by a low surrogate.
	@"|[\uD800-\uDBFF](?![\uDC00-\uDFFF])" +
	// Low surrogate that is not immediately preceded by a high surrogate.
	@"|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]",
	RegexOptions.Compiled);

/// <summary>
/// Removes the characters that are not allowed in XML 1.0 content so that the
/// result can be handed to an XML writer.
/// Documented here: https://en.wikipedia.org/wiki/Valid_characters_in_XML
/// </summary>
/// <param name="text">The text to clean; may be null or empty.</param>
/// <returns>
/// <paramref name="text"/> without the invalid characters. Well-formed surrogate
/// pairs (characters outside the Basic Multilingual Plane) are kept intact. Null
/// or empty input is returned unchanged.
/// </returns>
public static string StripInvalidXmlChars(string text)
{
	// Nothing to strip; also avoids the ArgumentNullException Regex.Replace throws for null.
	if (string.IsNullOrEmpty(text))
		return text;

	return InvalidXmlRegex.Replace(text, string.Empty);
}

#endregion

#region Serialization Helper Methods
Expand Down
2 changes: 1 addition & 1 deletion src/SIL.LCModel/DomainImpl/Strings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -900,7 +900,7 @@ protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int

writer.WriteStartElement("AUni");
writer.WriteAttributeString("ws", m_object.Services.WritingSystemManager.Get(ws).Id);
text = Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC);
text = TsStringSerializer.StripInvalidXmlChars(Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC));
writer.WriteString(text);
writer.WriteEndElement();
}
Expand Down
23 changes: 23 additions & 0 deletions tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,29 @@ public void SerializeTsStringToXml_Simple()
Assert.That(StripNewLines(xml), Is.EqualTo("<Str><Run ws=\"en\">This is a test!</Run></Str>"));
}

/// <summary>
/// Tests that SerializeTsStringToXml drops a U+0002 control character from the
/// run text instead of writing it into the XML.
/// </summary>
[Test]
public void SerializeTsStringToXml_StripsInvalidControlCharacter()
{
	const string expectedXml = "<Str><Run ws=\"en\">This is a test!</Run></Str>";
	ITsString input = TsStringUtils.MakeString("This is a te\u0002st!", EnWS);

	string serialized = TsStringSerializer.SerializeTsStringToXml(input, WritingSystemManager);

	Assert.That(StripNewLines(serialized), Is.EqualTo(expectedXml));
}

/// <summary>
/// Tests that SerializeTsStringToXml writes each of the given words into the run
/// unchanged, including characters that are encoded as surrogate pairs.
/// </summary>
[Test]
[TestCase("This is a test!")]
[TestCase(" 𐰉 (dǒng)")]//Nushu script
[TestCase("𠔤野 (Nishino)")]//Japanese Kanji
[TestCase("𠮷野家 (Yóu yě jiā)")]//Historic Chinese
[TestCase("🦊")]//emoji
[TestCase("\u200B\u200D\u200E\uDA00\uDC01")]
public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word)
{
	// The word is expected to appear verbatim as the content of the single run.
	string expectedXml = "<Str><Run ws=\"en\">" + word + "</Run></Str>";
	ITsString input = TsStringUtils.MakeString(word, EnWS);

	string serialized = TsStringSerializer.SerializeTsStringToXml(input, WritingSystemManager);

	Assert.That(StripNewLines(serialized), Is.EqualTo(expectedXml));
}

///--------------------------------------------------------------------------------------
/// <summary>
/// Tests the method SerializeTsStringToXml with a MultiString. This should
Expand Down
30 changes: 30 additions & 0 deletions tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
// (http://www.gnu.org/licenses/lgpl-2.1.html)

using System;
using System.IO;
using System.Linq;
using System.Text;
using NUnit.Framework;
using SIL.LCModel.Core.KernelInterfaces;
using SIL.LCModel.Core.Text;
Expand Down Expand Up @@ -283,6 +285,34 @@ public void AppendAlternativesTest()
Assert.AreEqual("Saltillo Mexico", Cache.LangProject.FieldWorkLocation.get_String(english.Handle).Text);
Assert.AreEqual("Saltillo Mejico", Cache.LangProject.FieldWorkLocation.get_String(spanish.Handle).Text);
}

/// <summary>
/// Tests that a plain string alternative is serialized as a single AUni element
/// carrying the writing system id and the text.
/// </summary>
[Test]
public void ToXml_WorksAsExpected()
{
	var analysisWs = Cache.LangProject.CurrentAnalysisWritingSystems.First();
	var countryName = TsStringUtils.MakeString("Mexico", analysisWs.Handle);
	Cache.LangProject.MainCountry.set_String(analysisWs.Handle, countryName);

	var serialized = ToXml(Cache.LangProject.MainCountry);

	Assert.AreEqual("<AUni ws=\"en\">Mexico</AUni>", serialized, "XML does not contain expected string element.");
}

/// <summary>
/// Tests that a U+0002 control character stored in a string alternative is left
/// out of the serialized AUni element.
/// </summary>
[Test]
public void ToXml_WithControlCharacterWorks()
{
	var analysisWs = Cache.LangProject.CurrentAnalysisWritingSystems.First();
	Cache.LangProject.MainCountry.set_String(
		analysisWs.Handle,
		TsStringUtils.MakeString("te\u0002st", analysisWs.Handle));

	var serialized = ToXml(Cache.LangProject.MainCountry);

	Assert.AreEqual("<AUni ws=\"en\">test</AUni>", serialized, "XML does not contain expected string element.");
}

/// <summary>
/// Serializes the given multi-string through MultiAccessor.ToXMLString into an
/// in-memory stream and returns the written bytes decoded as UTF-8.
/// </summary>
private string ToXml(ITsMultiString multiString)
{
	using (var buffer = new MemoryStream())
	using (var writer = XmlServices.CreateWriter(buffer))
	{
		var accessor = (MultiAccessor)multiString;
		accessor.ToXMLString(writer);
		// Flush so everything written so far reaches the stream before it is read back.
		writer.Flush();

		byte[] written = buffer.ToArray();
		return Encoding.UTF8.GetString(written);
	}
}
}

/// ----------------------------------------------------------------------------------------
Expand Down
Loading