Skip to content

Commit 53a7fda

Browse files
committed
Implement Block Scalar support
1 parent a6c2cd8 commit 53a7fda

File tree

4 files changed

+274
-0
lines changed

4 files changed

+274
-0
lines changed

examples/block_scalar_demo.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Block Scalar Demonstration
2+
3+
# Literal style (|) - preserves line breaks
4+
poem: |
5+
Roses are red,
6+
Violets are blue,
7+
YAML is great,
8+
And so are you!
9+
10+
# Folded style (>) - folds single line breaks into spaces
11+
paragraph: >
12+
This is a long paragraph that spans
13+
multiple lines in the source file,
14+
but will be folded into a single
15+
line with spaces between words.
16+
17+
# Practical use case - embedded JSON
18+
api_response: |
19+
{
20+
"status": "success",
21+
"data": {
22+
"user": "alice",
23+
"id": 123
24+
}
25+
}
26+
27+
# Practical use case - shell script
28+
setup_script: |
29+
#!/bin/bash
30+
echo "Setting up environment..."
31+
export PATH=$PATH:/usr/local/bin
32+
echo "Done!"
33+
34+
# Practical use case - SQL query
35+
database_query: |
36+
SELECT users.name, orders.total
37+
FROM users
38+
INNER JOIN orders ON users.id = orders.user_id
39+
WHERE orders.status = 'completed'
40+
ORDER BY orders.total DESC;

examples/block_scalars.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Simple configuration using block scalars
2+
service: popshop
3+
version: "0.1.0"
4+
5+
# Using a literal block scalar for multi-line text
6+
description: |
7+
This is a multi-line description.
8+
It preserves line breaks.
9+
Each line is on its own line.
10+
11+
# Using a folded block scalar
12+
summary: >
13+
This text will be folded into a single line.
14+
Line breaks become spaces.
15+
Unless there are blank lines.
16+
17+
# Simple values
18+
host: localhost
19+
port: 8080
20+
21+
# List of features
22+
features:
23+
- fast
24+
- reliable
25+
- scalable

src/Parser.zig

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ fn value(self: *Parser, gpa: Allocator) ParseError!Node.OptionalIndex {
168168
self.token_it.seekBy(-1);
169169
return self.listBracketed(gpa);
170170
},
171+
.block_literal, .block_folded => {
172+
// block scalar
173+
self.token_it.seekBy(-1);
174+
return self.blockScalar(gpa);
175+
},
171176
else => return .none,
172177
}
173178
}
@@ -574,6 +579,197 @@ fn leafValue(self: *Parser, gpa: Allocator) ParseError!Node.OptionalIndex {
574579
return error.MalformedYaml;
575580
}
576581

582+
/// Parses a block scalar (`|` literal or `>` folded) into a `string_value` node.
///
/// Expects the token iterator to be positioned on the block indicator token
/// (the caller seeks back by one before calling). A node is reserved up front,
/// the header line is skipped, and every following token that sits deeper than
/// the column of the first non-blank token on the indicator's line is consumed
/// as content. The collected source text is then split on '\n', up to
/// `base_indent` leading space/tab characters are removed from each line, and
/// the lines are re-joined with '\n' (literal) or a single space (folded,
/// simplified: blank lines are not preserved). Literal results additionally get
/// one trailing '\n'. The processed bytes are appended to `self.string_bytes`.
///
/// Returns the index of the new node. An empty string node is returned when no
/// content line is found. Fails with `error.UnexpectedEof` when there is no
/// indicator token, or with an allocation error from `gpa`.
fn blockScalar(self: *Parser, gpa: Allocator) ParseError!Node.OptionalIndex {
    const node_index: Node.Index = @enumFromInt(try self.nodes.addOne(gpa));
    const node_start = self.token_it.pos;

    // Get the block indicator (| or >)
    const indicator_tok = self.token_it.next() orelse return error.UnexpectedEof;
    const is_literal = indicator_tok.id == .block_literal;

    log.debug("(block_scalar) begin {s}@{d}", .{ @tagName(indicator_tok.id), node_start });

    // Parent indentation: column of the first non-blank token on the line that
    // holds the block indicator. Walk back to the previous new_line token, then
    // forward to the first token that is neither a space nor a tab. When the
    // indicator is on the very first line (no preceding new_line), the column
    // stays 0.
    var parent_col: usize = 0;
    {
        const toks = self.tokens.items(.token);
        const start_pos: usize = @intFromEnum(node_start);
        var scan: usize = start_pos;
        while (scan > 0) {
            scan -= 1;
            if (toks[scan].id != .new_line) continue;

            var line_pos = scan + 1;
            while (line_pos < start_pos) : (line_pos += 1) {
                const id = toks[line_pos].id;
                if (id != .space and id != .tab) {
                    parent_col = self.getCol(@enumFromInt(line_pos));
                    break;
                }
            }
            break;
        }
    }

    log.debug("(block_scalar) parent_col = {d}", .{parent_col});

    // Skip optional chomping indicator and/or indentation indicator.
    // For simplicity, any literal that immediately follows is skipped and its
    // value is ignored.
    self.eatCommentsAndSpace(&.{.new_line});
    _ = self.eatToken(.literal, &.{ .new_line, .comment });

    // A newline is expected after the block header; its absence is only logged.
    // NOTE(review): what exactly the two eat* helpers consume with an empty
    // exclusion list is not visible here - confirm they do not swallow content.
    self.eatCommentsAndSpace(&.{});
    if (self.token_it.peek()) |tok| {
        if (tok.id != .new_line) {
            log.debug("(block_scalar) expected newline but got {s}", .{@tagName(tok.id)});
        }
    }
    _ = self.eatToken(.new_line, &.{});

    // Column of the first content token; only meaningful once content_start is set.
    var base_indent: usize = 0;
    var content_start: ?Token.Index = null;
    var content_end: Token.Index = node_start;

    // Collect all indented lines
    while (self.token_it.peek()) |tok| {
        switch (tok.id) {
            // Indentation, blank lines and line ends are always consumed.
            .space, .tab, .new_line => {},
            // Document markers end the block scalar
            .doc_start, .doc_end, .eof => break,
            else => {
                const col = self.getCol(self.token_it.pos);

                // Anything (content or comment) at or left of the parent column
                // belongs to the enclosing structure. This check must come
                // before a content start is recorded: otherwise an empty block
                // scalar followed by a sibling key would record the sibling as
                // content_start and hand rawString an inverted range.
                if (col <= parent_col) {
                    log.debug("(block_scalar) ending: line_col={d} <= parent_col={d}", .{ col, parent_col });
                    break;
                }

                // The first included token (deeper-indented comments count too)
                // establishes the base indentation and the start of the text.
                if (content_start == null) {
                    base_indent = col;
                    content_start = self.token_it.pos;
                    log.debug("(block_scalar) base_indent = {d}", .{base_indent});
                }
            },
        }

        content_end = self.token_it.pos;
        _ = self.token_it.next();
    }

    // If no content was found, return empty string
    const first_content = content_start orelse {
        self.nodes.set(@intFromEnum(node_index), .{
            .tag = .string_value,
            .scope = .{
                .start = node_start,
                .end = content_end,
            },
            .data = .{ .string = .{ .index = @enumFromInt(0), .len = 0 } },
        });
        return node_index.toOptional();
    };

    // Extract the raw text.
    // NOTE(review): content_end may point at trailing space/new_line tokens
    // consumed after the last content token; whether those end up in `raw`
    // depends on rawString, which is not visible here - confirm.
    const raw = self.rawString(first_content, content_end);

    // Process the content based on type
    var result_bytes: std.ArrayListUnmanaged(u8) = .empty;
    defer result_bytes.deinit(gpa);

    // Split into lines and process
    var lines = std.mem.splitScalar(u8, raw, '\n');
    var first_line = true;

    while (lines.next()) |line| {
        // Strip at most base_indent leading whitespace characters; a tab
        // counts as a single column, just like a space.
        var skip: usize = 0;
        while (skip < line.len and skip < base_indent and
            (line[skip] == ' ' or line[skip] == '\t'))
        {
            skip += 1;
        }
        const stripped = line[skip..];

        if (!first_line) {
            if (is_literal) {
                // Literal style: preserve line breaks
                try result_bytes.append(gpa, '\n');
            } else if (result_bytes.items.len > 0 and stripped.len > 0) {
                // Folded style: replace with space (simplified)
                try result_bytes.append(gpa, ' ');
            }
        }
        first_line = false;

        try result_bytes.appendSlice(gpa, stripped);
    }

    // Add final newline for literal style
    if (is_literal and result_bytes.items.len > 0) {
        try result_bytes.append(gpa, '\n');
    }

    // Store the string
    const string_index: u32 = @intCast(self.string_bytes.items.len);
    try self.string_bytes.appendSlice(gpa, result_bytes.items);

    log.debug("(block_scalar) end content: {s}", .{result_bytes.items});

    self.nodes.set(@intFromEnum(node_index), .{
        .tag = .string_value,
        .scope = .{
            .start = node_start,
            .end = content_end,
        },
        .data = .{ .string = .{
            .index = @enumFromInt(string_index),
            .len = @intCast(result_bytes.items.len),
        } },
    });

    return node_index.toOptional();
}
772+
577773
fn eatCommentsAndSpace(self: *Parser, comptime exclusions: []const Token.Id) void {
578774
log.debug("eatCommentsAndSpace", .{});
579775
outer: while (self.token_it.next()) |tok| {

src/Tokenizer.zig

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ const testing = std.testing;
77
buffer: []const u8,
88
index: usize = 0,
99
in_flow: usize = 0,
10+
block_scalar_indent: ?usize = null,
1011

1112
pub const Token = struct {
1213
id: Id,
@@ -42,6 +43,8 @@ pub const Token = struct {
4243
single_quoted, // '...'
4344
double_quoted, // "..."
4445
literal,
46+
block_literal, // |
47+
block_folded, // >
4548
// zig fmt: on
4649
};
4750

@@ -222,6 +225,16 @@ pub fn next(self: *Tokenizer) Token {
222225
'"' => {
223226
state = .double_quoted;
224227
},
228+
'|' => {
229+
result.id = .block_literal;
230+
self.index += 1;
231+
break;
232+
},
233+
'>' => {
234+
result.id = .block_folded;
235+
self.index += 1;
236+
break;
237+
},
225238
else => {
226239
state = .literal;
227240
},

0 commit comments

Comments
 (0)