Skip to content

Commit 8b77aca

Browse files
committed
CharPointer_UTF16: Make behaviour consistent when iterating through unpaired surrogates
There's a few things going on in this commit: - The implementation of CharPointer_UTF16 now uses helpers from CharacterFunctions to avoid a few instances of magic numbers. - Where it makes sense, member functions of that class have been DRYed by e.g. implementing getAndAdvance in terms of operator*() and operator++(). - Added more tests for incrementing/decrementing/dereferencing CharPointer_UTF16. After this change, a CharPointer_UTF16 that points to an unpaired surrogate will always dereference to a 32-bit character with that surrogate's value. Note that dereferencing a CharPointer_UTF16 that points to a high surrogate at the final code unit in a memory region is inherently unsafe, because CharPointer_UTF16 can't track its own size, and the dereference operation will check the following code unit to see whether it is a low surrogate.
1 parent c514c95 commit 8b77aca

File tree

2 files changed

+114
-21
lines changed

2 files changed

+114
-21
lines changed

modules/juce_core/text/juce_CharPointer_UTF16.h

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -90,46 +90,42 @@ class CharPointer_UTF16 final
9090
/** Returns the unicode character that this pointer is pointing to. */
9191
juce_wchar operator*() const noexcept
9292
{
93-
auto n = (uint32) (uint16) *data;
93+
const auto first = (uint32) (uint16) data[0];
9494

95-
if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) data[1]) >= 0xdc00)
96-
n = 0x10000 + (((n - 0xd800) << 10) | (((uint32) (uint16) data[1]) - 0xdc00));
95+
if ((++CharPointer_UTF16 (*this)).data - data == 1)
96+
return (juce_wchar) first;
9797

98-
return (juce_wchar) n;
98+
const auto second = (uint32) (uint16) data[1];
99+
return (juce_wchar) (0x10000 + (((first - 0xd800) << 10) | (second - 0xdc00)));
99100
}
100101

101102
/** Moves this pointer along to the next character in the string. */
102103
CharPointer_UTF16& operator++() noexcept
103104
{
104-
auto n = (uint32) (uint16) *data++;
105-
106-
if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00)
107-
++data;
108-
105+
data += (CharacterFunctions::isHighSurrogate ((uint16) data[0])
106+
&& CharacterFunctions::isLowSurrogate ((uint16) data[1]))
107+
? 2
108+
: 1;
109109
return *this;
110110
}
111111

112112
/** Moves this pointer back to the previous character in the string. */
113113
CharPointer_UTF16& operator--() noexcept
114114
{
115-
auto n = (uint32) (uint16) (*--data);
116-
117-
if (n >= 0xdc00 && n <= 0xdfff)
118-
--data;
119-
115+
data -= (CharacterFunctions::isLowSurrogate ((uint16) data[-1])
116+
&& CharacterFunctions::isHighSurrogate ((uint16) data[-2]))
117+
? 2
118+
: 1;
120119
return *this;
121120
}
122121

123122
/** Returns the character that this pointer is currently pointing to, and then
124123
advances the pointer to point to the next character. */
125124
juce_wchar getAndAdvance() noexcept
126125
{
127-
auto n = (uint32) (uint16) *data++;
128-
129-
if (n >= 0xd800 && n <= 0xdfff && ((uint32) (uint16) *data) >= 0xdc00)
130-
n = 0x10000 + ((((n - 0xd800) << 10) | (((uint32) (uint16) *data++) - 0xdc00)));
131-
132-
return (juce_wchar) n;
126+
const auto result = **this;
127+
++(*this);
128+
return result;
133129
}
134130

135131
/** Moves this pointer along to the next character in the string. */
@@ -214,7 +210,7 @@ class CharPointer_UTF16 final
214210
{
215211
auto n = (uint32) (uint16) *d++;
216212

217-
if (n >= 0xd800 && n <= 0xdfff)
213+
if (CharacterFunctions::isHighSurrogate ((juce_wchar) n))
218214
{
219215
if (*d++ == 0)
220216
break;

modules/juce_core/text/juce_CharPointer_UTF16_test.cpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,103 @@ class CharPointer_UTF16Test final : public UnitTest
110110
expect (CharPointer_UTF16::isValidString (string.data(), 4) == CharPointer_UTF32::canRepresent ((juce_wchar) c));
111111
}
112112
}
113+
114+
beginTest ("Iterating string starting with unpaired high surrogate produces a wide character with the surrogate value");
115+
{
116+
const std::vector<char16_t> stringA { 0xd800, 0xa, 0xb };
117+
expect (rangesEqual (Span (stringA), Span (stringA)));
118+
119+
const std::vector<char16_t> stringB { 0xd800, 0xe000, 0xb };
120+
expect (rangesEqual (Span (stringB), Span (stringB)));
121+
}
122+
123+
beginTest ("Iterating string ending with unpaired high surrogate produces a wide character with the surrogate value");
124+
{
125+
const std::vector<char16_t> string { 0xa, 0xb, 0xd800, 0x0 };
126+
expect (rangesEqual (Span (string), Span (string)));
127+
}
128+
129+
beginTest ("Iterating string starting with unpaired low surrogate produces a wide character with the surrogate value");
130+
{
131+
const std::vector<char16_t> stringA { 0xdc00, 0xa, 0xb };
132+
expect (rangesEqual (Span (stringA), Span (stringA)));
133+
134+
const std::vector<char16_t> stringB { 0xdc00, 0xe000, 0xb };
135+
expect (rangesEqual (Span (stringB), Span (stringB)));
136+
}
137+
138+
beginTest ("Iterating string ending with unpaired low surrogate produces a wide character with the surrogate value");
139+
{
140+
const std::vector<char16_t> string { 0xa, 0xb, 0xdc00 };
141+
expect (rangesEqual (Span (string), Span (string)));
142+
}
143+
144+
beginTest ("Iterating string with multiple unpaired surrogates produces produces wide characters with those surrogate values");
145+
{
146+
const std::vector<char16_t> string { 0xd800, 0xd800, 0xdc00, 0xdc00, 0xa, 0xb };
147+
const std::vector<juce_wchar> expected { 0xd800, 0x10000, 0xdc00, 0xa, 0xb };
148+
expect (rangesEqual (Span (expected), Span (string)));
149+
}
150+
151+
beginTest ("Can decrement to unpaired low surrogate");
152+
{
153+
const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xdc00, 0xb};
154+
CharPointer_UTF16 ptr { chars + 2 };
155+
156+
expect (*ptr == 0xb);
157+
--ptr;
158+
expect (ptr == CharPointer_UTF16 { chars + 1 });
159+
expect (*ptr == 0xdc00);
160+
}
161+
162+
beginTest ("Can decrement to unpaired high surrogate");
163+
{
164+
const CharPointer_UTF16::CharType chars[] { 0xa, (CharPointer_UTF16::CharType) 0xd800, 0xb };
165+
CharPointer_UTF16 ptr { chars + 2 };
166+
167+
expect (*ptr == 0xb);
168+
--ptr;
169+
expect (ptr == CharPointer_UTF16 { chars + 1 });
170+
expect (*ptr == 0xd800);
171+
}
172+
173+
beginTest ("Can decrement through surrogate pair");
174+
{
175+
const CharPointer_UTF16::CharType chars[] { 0xa,
176+
(CharPointer_UTF16::CharType) 0xd800,
177+
(CharPointer_UTF16::CharType) 0xdc00,
178+
0xb };
179+
CharPointer_UTF16 ptr { chars + 3 };
180+
181+
expect (*ptr == 0xb);
182+
183+
--ptr;
184+
expect (ptr == CharPointer_UTF16 { chars + 1 });
185+
expect (*ptr == 0x10000);
186+
187+
--ptr;
188+
expect (ptr == CharPointer_UTF16 { chars });
189+
expect (*ptr == (CharPointer_UTF16::CharType) 0xa);
190+
}
191+
}
192+
193+
private:
194+
template <typename Expected>
195+
static bool rangesEqual (Span<const Expected> expected, Span<const char16_t> units)
196+
{
197+
const auto dataPtr = reinterpret_cast<const CharPointer_UTF16::CharType*> (units.data());
198+
std::vector<juce_wchar> converted;
199+
200+
for (const auto it : makeRange (CharPointer_UTF16 { dataPtr },
201+
CharPointer_UTF16 { dataPtr + units.size() }))
202+
{
203+
converted.push_back (it);
204+
}
205+
206+
// Some stdlibs require the arguments to std::equal to have the full complement of iterator
207+
// member type aliases, so compare using std::vector iterators instead of CharPointer_UTF16
208+
// directly.
209+
return std::equal (expected.begin(), expected.end(), converted.begin(), converted.end());
113210
}
114211
};
115212

0 commit comments

Comments
 (0)