Skip to content

Commit b00f6e5

Browse files
authored
Optimized the string escape helper for larger strings. (#8946)
1 parent 6beb9f8 commit b00f6e5

File tree

2 files changed

+671
-76
lines changed

2 files changed

+671
-76
lines changed

src/HotChocolate/Language/src/Language.Utf8/Utf8Helper.cs

Lines changed: 225 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
using System.Runtime.CompilerServices;
2+
#if NET8_0_OR_GREATER
3+
using System.Numerics;
4+
using System.Runtime.InteropServices;
5+
using System.Runtime.Intrinsics;
6+
#endif
27
using static HotChocolate.Language.Properties.LangUtf8Resources;
38

49
namespace HotChocolate.Language;
@@ -10,100 +15,244 @@ public static void Unescape(
1015
ref Span<byte> unescapedString,
1116
bool isBlockString)
1217
{
13-
var readPosition = -1;
14-
var writePosition = 0;
15-
var eofPosition = escapedString.Length - 1;
16-
int? highSurrogate = null;
18+
if (escapedString.Length == 0)
19+
{
20+
if (unescapedString.Length > 0)
21+
{
22+
unescapedString = unescapedString.Slice(0, 0);
23+
}
24+
return;
25+
}
1726

18-
if (escapedString.Length > 0)
27+
// Fast path: no escapes, so just copy the input.
28+
var firstBackslash = escapedString.IndexOf(GraphQLConstants.Backslash);
29+
if (firstBackslash == -1)
1930
{
20-
do
31+
escapedString.CopyTo(unescapedString);
32+
unescapedString = unescapedString.Slice(0, escapedString.Length);
33+
return;
34+
}
35+
36+
// Copy everything before first backslash
37+
if (firstBackslash > 0)
38+
{
39+
escapedString.Slice(0, firstBackslash).CopyTo(unescapedString);
40+
}
41+
42+
var readPos = firstBackslash;
43+
var writePos = firstBackslash;
44+
45+
// -1 means no surrogate pending
46+
var highSurrogate = -1;
47+
48+
// Process the first escape we already found
49+
ProcessEscapeSequence(
50+
escapedString, unescapedString,
51+
ref readPos, ref writePos,
52+
ref highSurrogate, isBlockString);
53+
54+
#if NET8_0_OR_GREATER
55+
var remaining = escapedString.Length - readPos;
56+
57+
// Vector256 path (32 bytes at a time), if we have enough bytes remaining
58+
if (Vector256.IsHardwareAccelerated && remaining >= Vector256<byte>.Count)
59+
{
60+
ref var srcStart = ref MemoryMarshal.GetReference(escapedString);
61+
ref var dstStart = ref MemoryMarshal.GetReference(unescapedString);
62+
var backslashVec = Vector256.Create(GraphQLConstants.Backslash);
63+
64+
while (readPos <= escapedString.Length - Vector256<byte>.Count)
2165
{
22-
var code = escapedString[++readPosition];
66+
var chunk = Vector256.LoadUnsafe(ref srcStart, (nuint)readPos);
67+
var matches = Vector256.Equals(chunk, backslashVec);
68+
var mask = matches.ExtractMostSignificantBits();
2369

24-
if (code == GraphQLConstants.Backslash)
70+
if (mask == 0)
71+
{
72+
// No escapes in 32 bytes so we simply copy
73+
chunk.StoreUnsafe(ref dstStart, (nuint)writePos);
74+
readPos += Vector256<byte>.Count;
75+
writePos += Vector256<byte>.Count;
76+
}
77+
else
2578
{
26-
code = escapedString[++readPosition];
79+
// Found backslash, copy up to it, then handle escape
80+
var firstEscape = BitOperations.TrailingZeroCount(mask);
81+
if (firstEscape > 0)
82+
{
83+
escapedString.Slice(readPos, firstEscape)
84+
.CopyTo(unescapedString.Slice(writePos));
85+
writePos += firstEscape;
86+
}
87+
readPos += firstEscape;
2788

28-
if (isBlockString && code == GraphQLConstants.Quote)
89+
ProcessEscapeSequence(
90+
escapedString, unescapedString,
91+
ref readPos, ref writePos,
92+
ref highSurrogate, isBlockString);
93+
}
94+
}
95+
}
96+
// Vector128 fallback (16 bytes at a time), if we have enough bytes remaining
97+
else if (Vector128.IsHardwareAccelerated && remaining >= Vector128<byte>.Count)
98+
{
99+
ref var srcStart = ref MemoryMarshal.GetReference(escapedString);
100+
ref var dstStart = ref MemoryMarshal.GetReference(unescapedString);
101+
var backslashVec = Vector128.Create(GraphQLConstants.Backslash);
102+
103+
while (readPos <= escapedString.Length - Vector128<byte>.Count)
104+
{
105+
var chunk = Vector128.LoadUnsafe(ref srcStart, (nuint)readPos);
106+
var matches = Vector128.Equals(chunk, backslashVec);
107+
var mask = matches.ExtractMostSignificantBits();
108+
109+
if (mask == 0)
110+
{
111+
// No escapes in 16 bytes so we simply copy
112+
chunk.StoreUnsafe(ref dstStart, (nuint)writePos);
113+
readPos += Vector128<byte>.Count;
114+
writePos += Vector128<byte>.Count;
115+
}
116+
else
117+
{
118+
// Found backslash, copy up to it, then handle escape
119+
var firstEscape = BitOperations.TrailingZeroCount(mask);
120+
if (firstEscape > 0)
29121
{
30-
if (escapedString[readPosition + 1] == GraphQLConstants.Quote
31-
&& escapedString[readPosition + 2] == GraphQLConstants.Quote)
32-
{
33-
readPosition += 2;
34-
unescapedString[writePosition++] = GraphQLConstants.Quote;
35-
unescapedString[writePosition++] = GraphQLConstants.Quote;
36-
unescapedString[writePosition++] = GraphQLConstants.Quote;
37-
}
38-
else
39-
{
40-
throw new Utf8EncodingException(Utf8Helper_InvalidQuoteEscapeCount);
41-
}
122+
escapedString.Slice(readPos, firstEscape)
123+
.CopyTo(unescapedString.Slice(writePos));
124+
writePos += firstEscape;
42125
}
43-
else if (code.IsValidEscapeCharacter())
126+
readPos += firstEscape;
127+
128+
ProcessEscapeSequence(
129+
escapedString, unescapedString,
130+
ref readPos, ref writePos,
131+
ref highSurrogate, isBlockString);
132+
}
133+
}
134+
}
135+
#endif
136+
137+
// Scalar tail for remaining bytes
138+
while (readPos < escapedString.Length)
139+
{
140+
var code = escapedString[readPos];
141+
142+
if (code == GraphQLConstants.Backslash)
143+
{
144+
ProcessEscapeSequence(
145+
escapedString, unescapedString,
146+
ref readPos, ref writePos,
147+
ref highSurrogate, isBlockString);
148+
}
149+
else
150+
{
151+
unescapedString[writePos++] = code;
152+
readPos++;
153+
}
154+
}
155+
156+
if (unescapedString.Length > writePos)
157+
{
158+
unescapedString = unescapedString.Slice(0, writePos);
159+
}
160+
}
161+
162+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
163+
private static void ProcessEscapeSequence(
164+
in ReadOnlySpan<byte> escaped,
165+
Span<byte> unescaped,
166+
ref int readPos,
167+
ref int writePos,
168+
ref int highSurrogate,
169+
bool isBlockString)
170+
{
171+
if (readPos + 1 >= escaped.Length)
172+
{
173+
throw new Utf8EncodingException(
174+
string.Format(Utf8Helper_InvalidEscapeChar, '\\'));
175+
}
176+
177+
// skip backslash
178+
readPos++;
179+
var code = escaped[readPos++];
180+
181+
if (isBlockString && code == GraphQLConstants.Quote)
182+
{
183+
if (readPos + 1 < escaped.Length
184+
&& escaped[readPos] == GraphQLConstants.Quote
185+
&& escaped[readPos + 1] == GraphQLConstants.Quote)
186+
{
187+
readPos += 2;
188+
unescaped[writePos++] = GraphQLConstants.Quote;
189+
unescaped[writePos++] = GraphQLConstants.Quote;
190+
unescaped[writePos++] = GraphQLConstants.Quote;
191+
}
192+
else
193+
{
194+
throw new Utf8EncodingException(Utf8Helper_InvalidQuoteEscapeCount);
195+
}
196+
}
197+
else if (code.IsValidEscapeCharacter())
198+
{
199+
if (code == GraphQLConstants.U)
200+
{
201+
if (readPos + 3 >= escaped.Length)
202+
{
203+
throw new Utf8EncodingException(
204+
string.Format(Utf8Helper_InvalidEscapeChar, 'u'));
205+
}
206+
207+
var unicodeDecimal = UnescapeUtf8Hex(
208+
escaped[readPos],
209+
escaped[readPos + 1],
210+
escaped[readPos + 2],
211+
escaped[readPos + 3]);
212+
readPos += 4;
213+
214+
if (unicodeDecimal >= 0xD800 && unicodeDecimal <= 0xDBFF)
215+
{
216+
// High surrogate
217+
if (highSurrogate >= 0)
44218
{
45-
if (code == GraphQLConstants.U)
46-
{
47-
var unicodeDecimal = UnescapeUtf8Hex(
48-
escapedString[++readPosition],
49-
escapedString[++readPosition],
50-
escapedString[++readPosition],
51-
escapedString[++readPosition]);
52-
53-
if (unicodeDecimal >= 0xD800 && unicodeDecimal <= 0xDBFF)
54-
{
55-
// High surrogate
56-
if (highSurrogate != null)
57-
{
58-
throw new Utf8EncodingException("Unexpected high surrogate.");
59-
}
60-
highSurrogate = unicodeDecimal;
61-
}
62-
else if (unicodeDecimal >= 0xDC00 && unicodeDecimal <= 0xDFFF)
63-
{
64-
// Low surrogate
65-
if (highSurrogate == null)
66-
{
67-
throw new Utf8EncodingException("Unexpected low surrogate.");
68-
}
69-
var fullUnicode = ((highSurrogate.Value - 0xD800) << 10)
70-
+ (unicodeDecimal - 0xDC00)
71-
+ 0x10000;
72-
UnescapeUtf8Hex(fullUnicode, ref writePosition, unescapedString);
73-
highSurrogate = null;
74-
}
75-
else
76-
{
77-
if (highSurrogate != null)
78-
{
79-
throw new Utf8EncodingException("High surrogate not followed by low surrogate.");
80-
}
81-
UnescapeUtf8Hex(unicodeDecimal, ref writePosition, unescapedString);
82-
}
83-
}
84-
else
85-
{
86-
unescapedString[writePosition++] = code.EscapeCharacter();
87-
}
219+
throw new Utf8EncodingException("Unexpected high surrogate.");
88220
}
89-
else
221+
highSurrogate = unicodeDecimal;
222+
}
223+
else if (unicodeDecimal >= 0xDC00 && unicodeDecimal <= 0xDFFF)
224+
{
225+
// Low surrogate
226+
if (highSurrogate < 0)
90227
{
91-
throw new Utf8EncodingException(
92-
string.Format(
93-
Utf8Helper_InvalidEscapeChar,
94-
(char)code));
228+
throw new Utf8EncodingException("Unexpected low surrogate.");
95229
}
230+
var fullUnicode = ((highSurrogate - 0xD800) << 10)
231+
+ (unicodeDecimal - 0xDC00)
232+
+ 0x10000;
233+
UnescapeUtf8Hex(fullUnicode, ref writePos, unescaped);
234+
highSurrogate = -1;
96235
}
97236
else
98237
{
99-
unescapedString[writePosition++] = code;
238+
if (highSurrogate >= 0)
239+
{
240+
throw new Utf8EncodingException("High surrogate not followed by low surrogate.");
241+
}
242+
UnescapeUtf8Hex(unicodeDecimal, ref writePos, unescaped);
100243
}
101-
} while (readPosition < eofPosition);
244+
}
245+
else
246+
{
247+
unescaped[writePos++] = code.EscapeCharacter();
248+
}
102249
}
103-
104-
if (unescapedString.Length - writePosition > 0)
250+
else
105251
{
106-
unescapedString = unescapedString.Slice(0, writePosition);
252+
throw new Utf8EncodingException(
253+
string.Format(
254+
Utf8Helper_InvalidEscapeChar,
255+
(char)code));
107256
}
108257
}
109258

0 commit comments

Comments
 (0)