@@ -68,34 +68,7 @@ namespace glz
6868 ++it; // skip opening quote
6969 auto start = it;
7070
71- // Pass 1: Find closing quote using SWAR
72- // Process 8 bytes at a time looking for quote or backslash
73- const auto remaining = static_cast <size_t >(end - it);
74- if (remaining >= 8 ) {
75- const auto * end8 = &*(end - 7 ); // Safe to read 8 bytes up to here
76- while (it < end8) {
77- uint64_t chunk;
78- std::memcpy (&chunk, &*it, 8 );
79- if constexpr (std::endian::native == std::endian::big) {
80- chunk = std::byteswap (chunk);
81- }
82-
83- // Check for quote or backslash
84- const uint64_t has_quote_mask = glz::has_quote (chunk);
85- const uint64_t has_backslash_mask = glz::has_escape (chunk);
86- const uint64_t special = has_quote_mask | has_backslash_mask;
87-
88- if (special) {
89- // Found a special character - process byte by byte from here
90- const auto offset = countr_zero (special) >> 3 ;
91- it += offset;
92- break ;
93- }
94- it += 8 ;
95- }
96- }
97-
98- // Finish finding the closing quote byte-by-byte
71+ // Pass 1: Find closing quote byte-by-byte (need to handle newlines and escapes)
9972 while (it != end && *it != ' "' ) {
10073 if (*it == ' \\ ' ) {
10174 ++it;
@@ -119,42 +92,56 @@ namespace glz
11992 auto * dst = value.data ();
12093 auto * const dst_start = dst;
12194
122- // Pass 2: Copy and process escapes using SWAR
95+ // Pass 2: Copy and process escapes and line folding
12396 auto src = start;
12497 const auto * const src_end = &*it;
12598
12699 while (src < src_end) {
127- const auto src_remaining = static_cast <size_t >(src_end - src);
128-
129- // Try to copy 8 bytes at a time when no escapes
130- if (src_remaining >= 8 ) {
131- uint64_t chunk;
132- std::memcpy (&chunk, src, 8 );
133- if constexpr (std::endian::native == std::endian::big) {
134- chunk = std::byteswap (chunk);
135- }
136-
137- // Check for backslash using SWAR
138- const uint64_t has_backslash_mask = glz::has_escape (chunk);
139- if (!has_backslash_mask) {
140- // No backslash in this chunk - copy all 8 bytes
141- std::memcpy (dst, src, 8 );
142- src += 8 ;
143- dst += 8 ;
144- continue ;
100+ // Check for newline - needs line folding
101+ if (*src == ' \n ' || *src == ' \r ' ) {
102+ // Trim trailing whitespace from output before processing newline
103+ while (dst > dst_start && (*(dst - 1 ) == ' ' || *(dst - 1 ) == ' \t ' )) {
104+ --dst;
145105 }
146106
147- // Found a backslash - copy bytes up to it
148- const auto offset = countr_zero (has_backslash_mask) >> 3 ;
149- if (offset > 0 ) {
150- std::memcpy (dst, src, offset);
151- dst += offset;
152- src += offset ;
107+ // Skip the newline
108+ if (*src == ' \r ' && (src + 1 ) < src_end && *(src + 1 ) == ' \n ' ) {
109+ src += 2 ; // CRLF
110+ }
111+ else {
112+ ++src ;
153113 }
154- }
155114
156- // Process one character (possibly an escape)
157- if (src >= src_end) break ;
115+ // Skip leading whitespace on next line
116+ while (src < src_end && (*src == ' ' || *src == ' \t ' )) {
117+ ++src;
118+ }
119+
120+ // Check if this is a blank line (another newline follows)
121+ if (src < src_end && (*src == ' \n ' || *src == ' \r ' )) {
122+ // Blank line(s) - output newlines for each blank line
123+ while (src < src_end && (*src == ' \n ' || *src == ' \r ' )) {
124+ *dst++ = ' \n ' ;
125+ // Skip the newline
126+ if (*src == ' \r ' && (src + 1 ) < src_end && *(src + 1 ) == ' \n ' ) {
127+ src += 2 ; // CRLF
128+ }
129+ else {
130+ ++src;
131+ }
132+ // Skip leading whitespace on next line
133+ while (src < src_end && (*src == ' ' || *src == ' \t ' )) {
134+ ++src;
135+ }
136+ }
137+ // Don't add space - we're now at content after blank line(s)
138+ }
139+ else {
140+ // Single newline - fold to space
141+ *dst++ = ' ' ;
142+ }
143+ continue ;
144+ }
158145
159146 if (*src == ' \\ ' ) {
160147 ++src;
@@ -166,6 +153,24 @@ namespace glz
166153
167154 const unsigned char esc = static_cast <unsigned char >(*src);
168155
156+ // Check for escaped newline (line continuation - no space)
157+ if (esc == ' \n ' || esc == ' \r ' ) {
158+ // Skip the newline
159+ if (esc == ' \r ' && (src + 1 ) < src_end && *(src + 1 ) == ' \n ' ) {
160+ src += 2 ; // CRLF
161+ }
162+ else {
163+ ++src;
164+ }
165+
166+ // Skip leading whitespace on next line
167+ while (src < src_end && (*src == ' ' || *src == ' \t ' )) {
168+ ++src;
169+ }
170+ // No output - this is line continuation without space
171+ continue ;
172+ }
173+
169174 // Check simple escape table first
170175 if (yaml_escape_is_simple[esc]) {
171176 *dst++ = yaml_unescape_table[esc];
@@ -266,8 +271,9 @@ namespace glz
266271 ++it; // skip closing quote
267272 }
268273
269- // SWAR-optimized single-quoted string parsing
274+ // Single-quoted string parsing with line folding
270275 // Only escape is '' -> ' (doubled single quote)
276+ // Line breaks are folded: single newline -> space, blank line -> newline
271277 template <class Ctx , class It , class End >
272278 GLZ_ALWAYS_INLINE void parse_single_quoted_string (std::string& value, Ctx& ctx, It& it, End end)
273279 {
@@ -281,65 +287,11 @@ namespace glz
281287 ++it; // skip opening quote
282288 auto start = it;
283289
284- // Pass 1: Find closing quote using SWAR
285- const auto remaining = static_cast <size_t >(end - it);
286- if (remaining >= 8 ) {
287- const auto * end8 = &*(end - 7 );
288- while (it < end8) {
289- uint64_t chunk;
290- std::memcpy (&chunk, &*it, 8 );
291- if constexpr (std::endian::native == std::endian::big) {
292- chunk = std::byteswap (chunk);
293- }
294-
295- const uint64_t has_sq = glz::has_char<' \' ' >(chunk);
296- if (has_sq) {
297- const auto offset = countr_zero (has_sq) >> 3 ;
298- it += offset;
299- break ;
300- }
301- it += 8 ;
302- }
303- }
304-
305- // Finish finding quote byte-by-byte
306- while (it != end && *it != ' \' ' ) {
307- ++it;
308- }
309-
310- if (it == end) [[unlikely]] {
311- ctx.error = error_code::unexpected_end;
312- return ;
313- }
314-
315- // Now scan for '' escapes and find actual end
316- auto scan = start;
317- auto actual_end = it;
318- bool has_escapes = false ;
319-
320- while (scan < actual_end) {
321- if (*scan == ' \' ' ) {
322- if (scan + 1 < end && *(scan + 1 ) == ' \' ' ) {
323- has_escapes = true ;
324- scan += 2 ;
325- }
326- else {
327- actual_end = scan;
328- break ;
329- }
330- }
331- else {
332- ++scan;
333- }
334- }
335-
336- // Continue scanning past '' escapes to find true end
337- it = scan;
290+ // Pass 1: Find closing quote (handling '' escapes)
338291 while (it != end) {
339292 if (*it == ' \' ' ) {
340293 if ((it + 1 ) != end && *(it + 1 ) == ' \' ' ) {
341- has_escapes = true ;
342- it += 2 ;
294+ it += 2 ; // Skip escaped quote
343295 }
344296 else {
345297 break ; // Found closing quote
@@ -355,54 +307,63 @@ namespace glz
355307 return ;
356308 }
357309
358- actual_end = it;
359- const auto input_len = static_cast <size_t >(actual_end - start);
360-
361- if (!has_escapes) {
362- // Fast path: no escapes, direct assign
363- value.assign (&*start, input_len);
364- ++it; // skip closing quote
365- return ;
366- }
367-
368- // Has escapes: need to process
310+ const auto input_len = static_cast <size_t >(it - start);
369311 value.resize (input_len + string_padding_bytes);
370312 auto * dst = value.data ();
371313 auto * const dst_start = dst;
372314 auto src = start;
373- const auto * const src_end = &*actual_end ;
315+ const auto * const src_end = &*it ;
374316
317+ // Pass 2: Process content with line folding and '' escapes
375318 while (src < src_end) {
376- const auto src_remaining = static_cast <size_t >(src_end - src);
319+ // Check for newline - needs line folding
320+ if (*src == ' \n ' || *src == ' \r ' ) {
321+ // Trim trailing whitespace from output before processing newline
322+ while (dst > dst_start && (*(dst - 1 ) == ' ' || *(dst - 1 ) == ' \t ' )) {
323+ --dst;
324+ }
377325
378- // Try to copy 8 bytes at a time
379- if (src_remaining >= 8 ) {
380- uint64_t chunk;
381- std::memcpy (&chunk, src, 8 );
382- if constexpr (std::endian::native == std::endian::big) {
383- chunk = std::byteswap (chunk) ;
326+ // Skip the newline
327+ if (*src == ' \r ' && (src + 1 ) < src_end && *(src + 1 ) == ' \n ' ) {
328+ src += 2 ; // CRLF
329+ }
330+ else {
331+ ++src ;
384332 }
385333
386- const uint64_t has_sq = glz::has_char<' \' ' >(chunk);
387- if (!has_sq) {
388- std::memcpy (dst, src, 8 );
389- src += 8 ;
390- dst += 8 ;
391- continue ;
334+ // Skip leading whitespace on next line
335+ while (src < src_end && (*src == ' ' || *src == ' \t ' )) {
336+ ++src;
392337 }
393338
394- const auto offset = countr_zero (has_sq) >> 3 ;
395- if (offset > 0 ) {
396- std::memcpy (dst, src, offset);
397- dst += offset;
398- src += offset;
339+ // Check if this is a blank line (another newline follows)
340+ if (src < src_end && (*src == ' \n ' || *src == ' \r ' )) {
341+ // Blank line(s) - output newlines for each blank line
342+ while (src < src_end && (*src == ' \n ' || *src == ' \r ' )) {
343+ *dst++ = ' \n ' ;
344+ // Skip the newline
345+ if (*src == ' \r ' && (src + 1 ) < src_end && *(src + 1 ) == ' \n ' ) {
346+ src += 2 ; // CRLF
347+ }
348+ else {
349+ ++src;
350+ }
351+ // Skip leading whitespace on next line
352+ while (src < src_end && (*src == ' ' || *src == ' \t ' )) {
353+ ++src;
354+ }
355+ }
356+ // Don't add space - we're now at content after blank line(s)
399357 }
358+ else {
359+ // Single newline - fold to space
360+ *dst++ = ' ' ;
361+ }
362+ continue ;
400363 }
401364
402- if (src >= src_end) break ;
403-
404365 if (*src == ' \' ' ) {
405- // Must be '' (escaped quote)
366+ // Must be '' (escaped quote) - we validated this in pass 1
406367 *dst++ = ' \' ' ;
407368 src += 2 ;
408369 }
0 commit comments