Skip to content

Commit 03621d1

Browse files
authored
Fix crash in surrogate pair handling of multibyte Unicode characters (#361)
1 parent 7651a39 commit 03621d1

File tree

2 files changed, with 34 additions and 6 deletions.

src/SIL.Machine/PunctuationAnalysis/TextSegment.cs

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -187,13 +187,13 @@ public class SurrogatePairString
187187
public SurrogatePairString(string stringValue)
188188
{
189189
_stringValue = stringValue;
190-
IEnumerable<(int SurrogatePairIndex, int StringIndex)> indexPairs = _stringValue
191-
.Select((c, i) => (c, i))
190+
IEnumerable<(int StringIndex, int SurrogatePairIndex)> indexPairs = _stringValue
191+
.Select((c, stringIndex) => (c, stringIndex))
192192
.Where(tup => !char.IsLowSurrogate(tup.c))
193-
.Select((tup, i) => (tup.i, i));
193+
.Select((tup, surrogatePairIndex) => (tup.stringIndex, surrogatePairIndex));
194194
_surrogatePairIndexByStringIndex = new Dictionary<int, int>();
195195
_stringIndexBySurrogatePairIndex = new Dictionary<int, int>();
196-
foreach ((int surrogatePairIndex, int stringIndex) in indexPairs)
196+
foreach ((int stringIndex, int surrogatePairIndex) in indexPairs)
197197
{
198198
_surrogatePairIndexByStringIndex[stringIndex] = surrogatePairIndex;
199199
_stringIndexBySurrogatePairIndex[surrogatePairIndex] = stringIndex;
@@ -251,11 +251,11 @@ public string Substring(int startSurrogatePairIndex, int length)
251251

252252
public int GetStringIndexForSurrogatePairIndex(int surrogatePairIndex)
253253
{
254-
if (surrogatePairIndex == _surrogatePairIndexByStringIndex.Count)
254+
if (surrogatePairIndex == _stringIndexBySurrogatePairIndex.Count)
255255
{
256256
return _stringValue.Length;
257257
}
258-
return _surrogatePairIndexByStringIndex[surrogatePairIndex];
258+
return _stringIndexBySurrogatePairIndex[surrogatePairIndex];
259259
}
260260
}
261261
}

tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -441,4 +441,32 @@ public void ThatItUsesTheQuoteConventionSet()
441441
)
442442
);
443443
}
444+
445+
[Test]
446+
public void SupportsMultibyteUnicodeCharacters()
447+
{
448+
var quotationMarkFinder = new QuotationMarkFinder(QuoteConventions.Standard);
449+
450+
// [grinning face], [left double quotation mark][grinning face with big eyes][right double quotation mark]
451+
Assert.That(
452+
quotationMarkFinder
453+
.FindAllPotentialQuotationMarksInTextSegment(
454+
new TextSegment.Builder().SetText("\U0001f600, \u201c\U0001f603\u201d").Build()
455+
)
456+
.SequenceEqual(
457+
[
458+
new QuotationMarkStringMatch(
459+
new TextSegment.Builder().SetText("\U0001f600, \u201c\U0001f603\u201d").Build(),
460+
3,
461+
4
462+
),
463+
new QuotationMarkStringMatch(
464+
new TextSegment.Builder().SetText("\U0001f600, \u201c\U0001f603\u201d").Build(),
465+
5,
466+
6
467+
),
468+
]
469+
)
470+
);
471+
}
444472
}

0 commit comments

Comments
 (0)