Skip to content

Commit 23030af

Browse files
authored
YAML fixes (#2302)
1 parent b9c2f3e commit 23030af

File tree

2 files changed

+257
-146
lines changed

2 files changed

+257
-146
lines changed

include/glaze/yaml/read.hpp

Lines changed: 107 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -68,34 +68,7 @@ namespace glz
6868
++it; // skip opening quote
6969
auto start = it;
7070

71-
// Pass 1: Find closing quote using SWAR
72-
// Process 8 bytes at a time looking for quote or backslash
73-
const auto remaining = static_cast<size_t>(end - it);
74-
if (remaining >= 8) {
75-
const auto* end8 = &*(end - 7); // Safe to read 8 bytes up to here
76-
while (it < end8) {
77-
uint64_t chunk;
78-
std::memcpy(&chunk, &*it, 8);
79-
if constexpr (std::endian::native == std::endian::big) {
80-
chunk = std::byteswap(chunk);
81-
}
82-
83-
// Check for quote or backslash
84-
const uint64_t has_quote_mask = glz::has_quote(chunk);
85-
const uint64_t has_backslash_mask = glz::has_escape(chunk);
86-
const uint64_t special = has_quote_mask | has_backslash_mask;
87-
88-
if (special) {
89-
// Found a special character - process byte by byte from here
90-
const auto offset = countr_zero(special) >> 3;
91-
it += offset;
92-
break;
93-
}
94-
it += 8;
95-
}
96-
}
97-
98-
// Finish finding the closing quote byte-by-byte
71+
// Pass 1: Find closing quote byte-by-byte (need to handle newlines and escapes)
9972
while (it != end && *it != '"') {
10073
if (*it == '\\') {
10174
++it;
@@ -119,42 +92,56 @@ namespace glz
11992
auto* dst = value.data();
12093
auto* const dst_start = dst;
12194

122-
// Pass 2: Copy and process escapes using SWAR
95+
// Pass 2: Copy and process escapes and line folding
12396
auto src = start;
12497
const auto* const src_end = &*it;
12598

12699
while (src < src_end) {
127-
const auto src_remaining = static_cast<size_t>(src_end - src);
128-
129-
// Try to copy 8 bytes at a time when no escapes
130-
if (src_remaining >= 8) {
131-
uint64_t chunk;
132-
std::memcpy(&chunk, src, 8);
133-
if constexpr (std::endian::native == std::endian::big) {
134-
chunk = std::byteswap(chunk);
135-
}
136-
137-
// Check for backslash using SWAR
138-
const uint64_t has_backslash_mask = glz::has_escape(chunk);
139-
if (!has_backslash_mask) {
140-
// No backslash in this chunk - copy all 8 bytes
141-
std::memcpy(dst, src, 8);
142-
src += 8;
143-
dst += 8;
144-
continue;
100+
// Check for newline - needs line folding
101+
if (*src == '\n' || *src == '\r') {
102+
// Trim trailing whitespace from output before processing newline
103+
while (dst > dst_start && (*(dst - 1) == ' ' || *(dst - 1) == '\t')) {
104+
--dst;
145105
}
146106

147-
// Found a backslash - copy bytes up to it
148-
const auto offset = countr_zero(has_backslash_mask) >> 3;
149-
if (offset > 0) {
150-
std::memcpy(dst, src, offset);
151-
dst += offset;
152-
src += offset;
107+
// Skip the newline
108+
if (*src == '\r' && (src + 1) < src_end && *(src + 1) == '\n') {
109+
src += 2; // CRLF
110+
}
111+
else {
112+
++src;
153113
}
154-
}
155114

156-
// Process one character (possibly an escape)
157-
if (src >= src_end) break;
115+
// Skip leading whitespace on next line
116+
while (src < src_end && (*src == ' ' || *src == '\t')) {
117+
++src;
118+
}
119+
120+
// Check if this is a blank line (another newline follows)
121+
if (src < src_end && (*src == '\n' || *src == '\r')) {
122+
// Blank line(s) - output newlines for each blank line
123+
while (src < src_end && (*src == '\n' || *src == '\r')) {
124+
*dst++ = '\n';
125+
// Skip the newline
126+
if (*src == '\r' && (src + 1) < src_end && *(src + 1) == '\n') {
127+
src += 2; // CRLF
128+
}
129+
else {
130+
++src;
131+
}
132+
// Skip leading whitespace on next line
133+
while (src < src_end && (*src == ' ' || *src == '\t')) {
134+
++src;
135+
}
136+
}
137+
// Don't add space - we're now at content after blank line(s)
138+
}
139+
else {
140+
// Single newline - fold to space
141+
*dst++ = ' ';
142+
}
143+
continue;
144+
}
158145

159146
if (*src == '\\') {
160147
++src;
@@ -166,6 +153,24 @@ namespace glz
166153

167154
const unsigned char esc = static_cast<unsigned char>(*src);
168155

156+
// Check for escaped newline (line continuation - no space)
157+
if (esc == '\n' || esc == '\r') {
158+
// Skip the newline
159+
if (esc == '\r' && (src + 1) < src_end && *(src + 1) == '\n') {
160+
src += 2; // CRLF
161+
}
162+
else {
163+
++src;
164+
}
165+
166+
// Skip leading whitespace on next line
167+
while (src < src_end && (*src == ' ' || *src == '\t')) {
168+
++src;
169+
}
170+
// No output - this is line continuation without space
171+
continue;
172+
}
173+
169174
// Check simple escape table first
170175
if (yaml_escape_is_simple[esc]) {
171176
*dst++ = yaml_unescape_table[esc];
@@ -266,8 +271,9 @@ namespace glz
266271
++it; // skip closing quote
267272
}
268273

269-
// SWAR-optimized single-quoted string parsing
274+
// Single-quoted string parsing with line folding
270275
// The only escape sequence is '' -> ' (a doubled single quote)
276+
// Line breaks are folded: a single line break becomes one space, each blank line becomes one newline; whitespace adjacent to the break is dropped
271277
template <class Ctx, class It, class End>
272278
GLZ_ALWAYS_INLINE void parse_single_quoted_string(std::string& value, Ctx& ctx, It& it, End end)
273279
{
@@ -281,65 +287,11 @@ namespace glz
281287
++it; // skip opening quote
282288
auto start = it;
283289

284-
// Pass 1: Find closing quote using SWAR
285-
const auto remaining = static_cast<size_t>(end - it);
286-
if (remaining >= 8) {
287-
const auto* end8 = &*(end - 7);
288-
while (it < end8) {
289-
uint64_t chunk;
290-
std::memcpy(&chunk, &*it, 8);
291-
if constexpr (std::endian::native == std::endian::big) {
292-
chunk = std::byteswap(chunk);
293-
}
294-
295-
const uint64_t has_sq = glz::has_char<'\''>(chunk);
296-
if (has_sq) {
297-
const auto offset = countr_zero(has_sq) >> 3;
298-
it += offset;
299-
break;
300-
}
301-
it += 8;
302-
}
303-
}
304-
305-
// Finish finding quote byte-by-byte
306-
while (it != end && *it != '\'') {
307-
++it;
308-
}
309-
310-
if (it == end) [[unlikely]] {
311-
ctx.error = error_code::unexpected_end;
312-
return;
313-
}
314-
315-
// Now scan for '' escapes and find actual end
316-
auto scan = start;
317-
auto actual_end = it;
318-
bool has_escapes = false;
319-
320-
while (scan < actual_end) {
321-
if (*scan == '\'') {
322-
if (scan + 1 < end && *(scan + 1) == '\'') {
323-
has_escapes = true;
324-
scan += 2;
325-
}
326-
else {
327-
actual_end = scan;
328-
break;
329-
}
330-
}
331-
else {
332-
++scan;
333-
}
334-
}
335-
336-
// Continue scanning past '' escapes to find true end
337-
it = scan;
290+
// Pass 1: Find closing quote (handling '' escapes)
338291
while (it != end) {
339292
if (*it == '\'') {
340293
if ((it + 1) != end && *(it + 1) == '\'') {
341-
has_escapes = true;
342-
it += 2;
294+
it += 2; // Skip escaped quote
343295
}
344296
else {
345297
break; // Found closing quote
@@ -355,54 +307,63 @@ namespace glz
355307
return;
356308
}
357309

358-
actual_end = it;
359-
const auto input_len = static_cast<size_t>(actual_end - start);
360-
361-
if (!has_escapes) {
362-
// Fast path: no escapes, direct assign
363-
value.assign(&*start, input_len);
364-
++it; // skip closing quote
365-
return;
366-
}
367-
368-
// Has escapes: need to process
310+
const auto input_len = static_cast<size_t>(it - start);
369311
value.resize(input_len + string_padding_bytes);
370312
auto* dst = value.data();
371313
auto* const dst_start = dst;
372314
auto src = start;
373-
const auto* const src_end = &*actual_end;
315+
const auto* const src_end = &*it;
374316

317+
// Pass 2: Process content with line folding and '' escapes
375318
while (src < src_end) {
376-
const auto src_remaining = static_cast<size_t>(src_end - src);
319+
// Check for newline - needs line folding
320+
if (*src == '\n' || *src == '\r') {
321+
// Trim trailing whitespace from output before processing newline
322+
while (dst > dst_start && (*(dst - 1) == ' ' || *(dst - 1) == '\t')) {
323+
--dst;
324+
}
377325

378-
// Try to copy 8 bytes at a time
379-
if (src_remaining >= 8) {
380-
uint64_t chunk;
381-
std::memcpy(&chunk, src, 8);
382-
if constexpr (std::endian::native == std::endian::big) {
383-
chunk = std::byteswap(chunk);
326+
// Skip the newline
327+
if (*src == '\r' && (src + 1) < src_end && *(src + 1) == '\n') {
328+
src += 2; // CRLF
329+
}
330+
else {
331+
++src;
384332
}
385333

386-
const uint64_t has_sq = glz::has_char<'\''>(chunk);
387-
if (!has_sq) {
388-
std::memcpy(dst, src, 8);
389-
src += 8;
390-
dst += 8;
391-
continue;
334+
// Skip leading whitespace on next line
335+
while (src < src_end && (*src == ' ' || *src == '\t')) {
336+
++src;
392337
}
393338

394-
const auto offset = countr_zero(has_sq) >> 3;
395-
if (offset > 0) {
396-
std::memcpy(dst, src, offset);
397-
dst += offset;
398-
src += offset;
339+
// Check if this is a blank line (another newline follows)
340+
if (src < src_end && (*src == '\n' || *src == '\r')) {
341+
// Blank line(s) - output newlines for each blank line
342+
while (src < src_end && (*src == '\n' || *src == '\r')) {
343+
*dst++ = '\n';
344+
// Skip the newline
345+
if (*src == '\r' && (src + 1) < src_end && *(src + 1) == '\n') {
346+
src += 2; // CRLF
347+
}
348+
else {
349+
++src;
350+
}
351+
// Skip leading whitespace on next line
352+
while (src < src_end && (*src == ' ' || *src == '\t')) {
353+
++src;
354+
}
355+
}
356+
// Don't add space - we're now at content after blank line(s)
399357
}
358+
else {
359+
// Single newline - fold to space
360+
*dst++ = ' ';
361+
}
362+
continue;
400363
}
401364

402-
if (src >= src_end) break;
403-
404365
if (*src == '\'') {
405-
// Must be '' (escaped quote)
366+
// Must be '' (escaped quote) - we validated this in pass 1
406367
*dst++ = '\'';
407368
src += 2;
408369
}

0 commit comments

Comments
 (0)