Skip to content

Commit 8d0f401

Browse files
chqrliebvdberg
authored and committed
Yaml: simplify and improve parser
* make `Data.text_cur` and `Data.nodes_cur` integers to avoid fixups
* enforce enum and names synchronisation
* remove redundant casts
* simplify tokenizer
* remove redundant enum prefixes
1 parent fe37d4f commit 8d0f401

File tree

5 files changed

+143
-192
lines changed

5 files changed

+143
-192
lines changed

common/yaml/yaml_data.c2

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ import string local;
1919
import stdlib local;
2020

2121
const u32 MaxDepth = 8;
22-
22+
const u32 MinText = 256;
23+
const u32 MinNodes = 32;
2324

2425
type NodeKind enum u8 {
2526
Unknown,
@@ -29,7 +30,12 @@ type NodeKind enum u8 {
2930
}
3031

3132
// NOTE: keep in sync with NodeKind
32-
const char*[NodeKind] node_names = { "UNK", "SCA", "MAP", "SEQ" }
33+
const char*[NodeKind] node_names = {
34+
[Unknown] = "UNK",
35+
[Scalar] = "SCA",
36+
[Map] = "MAP",
37+
[Sequence] = "SEQ",
38+
}
3339

3440
public type Node struct @(opaque) {
3541
NodeKind kind;
@@ -41,7 +47,6 @@ public type Node struct @(opaque) {
4147
}
4248
}
4349

44-
4550
type StackLevel struct {
4651
i32 indent; // -1 for root node
4752
Node* node;
@@ -52,26 +57,28 @@ type Data struct {
5257
// text
5358
char* text;
5459
u32 text_size;
55-
char* text_cur;
60+
u32 text_cur;
5661

5762
// nodes
5863
Node* nodes;
5964
u32 nodes_count;
60-
Node* nodes_cur;
65+
u32 nodes_cur;
6166

6267
// needed for node resize
6368
StackLevel* stack;
6469
}
6570

6671
fn void Data.init(Data* d, u32 text_size, u32 nodes_count, StackLevel* stack) {
72+
if (text_size < MinText) text_size = MinText;
6773
d.text = malloc(text_size);
6874
d.text_size = text_size;
69-
d.text_cur = d.text + 1; // reserve first byte for empty text
70-
d.text[0] = 0;
75+
d.text_cur = 1; // reserve first byte for empty text
76+
d.text[0] = '\0';
7177

78+
if (nodes_count < MinNodes) nodes_count = MinNodes;
7279
d.nodes = malloc(nodes_count * sizeof(Node));
7380
d.nodes_count = nodes_count;
74-
d.nodes_cur = &d.nodes[1]; // reserve first node
81+
d.nodes_cur = 1; // reserve first node
7582
memset(&d.nodes[0], 0, sizeof(Node));
7683

7784
d.stack = stack;
@@ -83,46 +90,39 @@ fn void Data.destroy(Data* d) {
8390
}
8491

8592
fn void Data.resize_nodes(Data* d) {
86-
u32 idx = (u32)(d.nodes_cur - d.nodes);
87-
8893
d.nodes_count *= 2;
8994
Node* nodes2 = malloc(d.nodes_count * sizeof(Node));
90-
memcpy(nodes2, d.nodes, idx * sizeof(Node));
95+
memcpy(nodes2, d.nodes, d.nodes_cur * sizeof(Node));
9196

9297
// fix-up stack pointers
9398
for (u32 i=0; i<MaxDepth; i++) {
9499
StackLevel* sl = &d.stack[i];
95100
if (sl.node) {
96-
u32 node_idx = (u32)(sl.node - d.nodes);
101+
isize node_idx = sl.node - d.nodes;
97102
sl.node = &nodes2[node_idx];
98103
}
99104
if (sl.last_child) {
100-
u32 last_child_idx = (u32)(sl.last_child - d.nodes);
105+
isize last_child_idx = sl.last_child - d.nodes;
101106
sl.last_child = &nodes2[last_child_idx];
102107
}
103108
}
104109

105110
free(d.nodes);
106111
d.nodes = nodes2;
107-
d.nodes_cur = &d.nodes[idx];
108112
}
109113

110114
fn void Data.resize_text(Data* d) {
111-
u32 idx = (u32)(d.text_cur - d.text);
112-
113115
d.text_size *= 2;
114116
char* text2 = malloc(d.text_size);
115-
memcpy(text2, d.text, idx + 1); // also copy 0-termination
117+
memcpy(text2, d.text, d.text_cur);
116118
free(d.text);
117119
d.text = text2;
118-
d.text_cur = &d.text[idx];
119120
}
120121

121122
fn Node* Data.add_node(Data* d, NodeKind kind, u32 name_idx) {
122-
u32 idx = (u32)(d.nodes_cur - d.nodes);
123-
if (idx >= d.nodes_count -1) d.resize_nodes();
123+
if (d.nodes_cur >= d.nodes_count - 1) d.resize_nodes();
124124

125-
Node* result = d.nodes_cur;
125+
Node* result = &d.nodes[d.nodes_cur];
126126
d.nodes_cur++;
127127
result.kind = kind;
128128
result.next_idx = 0;
@@ -136,12 +136,12 @@ fn u32 Data.node2idx(const Data* d, const Node* n) @(inline) {
136136
}
137137

138138
fn u32 Data.add_text(Data* d, const char* text, u32 len) {
139-
u32 idx = (u32)(d.text_cur - d.text);
139+
u32 idx = d.text_cur;
140140
while (idx + len + 1 >= d.text_size) d.resize_text();
141141

142-
memcpy(d.text_cur, text, len);
143-
d.text_cur[len] = 0;
144-
d.text_cur += len+1; // add 0-terminator
142+
memcpy(d.text + idx, text, len);
143+
d.text[idx + len] = '\0';
144+
d.text_cur += len + 1; // skip 0-terminator
145145
return idx;
146146
}
147147

common/yaml/yaml_dump.c2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ public fn void Parser.dump(const Parser* p, bool verbose) @(unused) {
2323
}
2424

2525
fn void Data.dump(const Data* d, bool verbose) {
26-
u32 node_count = (u32)(d.nodes_cur - d.nodes);
26+
u32 node_count = d.nodes_cur;
2727
if (verbose) {
28-
printf("Text %d/%d\n", (u32)(d.text_cur - d.text), d.text_size);
28+
printf("Text %d/%d\n", d.text_cur, d.text_size);
2929
const char* cp = d.text + 1;
30-
while (cp < d.text_cur) {
30+
const char* end = d.text + d.text_cur;
31+
while (cp < end) {
3132
u32 len = (u32)strlen(cp);
3233
u32 offset = (u32)(cp - d.text);
3334
printf(" [%3d] %s\n", offset, cp);

common/yaml/yaml_iterator.c2

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@ module yaml;
1717

1818
import string;
1919

20-
public fn bool Node.isMap(const Node* n) @(unused) { return n.kind == NodeKind.Map; }
20+
public fn bool Node.isMap(const Node* n) @(unused) { return n.kind == Map; }
2121

22-
public fn bool Node.isSequence(const Node* n) @(unused) { return n.kind == NodeKind.Sequence; }
22+
public fn bool Node.isSequence(const Node* n) @(unused) { return n.kind == Sequence; }
2323

24-
fn bool Node.isScalar(const Node* n) @(unused) { return n.kind == NodeKind.Scalar; }
24+
fn bool Node.isScalar(const Node* n) @(unused) { return n.kind == Scalar; }
2525

2626

2727
// TODO only pass iterators? (that way we dont need Parser* anymore)
2828
public fn const Node* Parser.getRoot(const Parser* p) {
29-
u32 node_count = (u32)(p.data.nodes_cur - p.data.nodes) - 1;
30-
if (node_count == 0) return nil;
29+
u32 node_count = p.data.nodes_cur;
30+
if (node_count <= 1) return nil;
3131
return &p.data.nodes[1];
3232
}
3333

@@ -42,10 +42,10 @@ public fn const Node* Parser.findNode(const Parser* p, const char* path) {
4242
}
4343

4444
fn const Node* Data.findNode(const Data* d, const char* path) {
45-
u32 node_count = (u32)(d.nodes_cur - d.nodes) - 1;
46-
if (node_count == 0) return nil;
45+
u32 node_count = d.nodes_cur;
46+
if (node_count <= 1) return nil;
4747
const Node* root = &d.nodes[1];
48-
if (root.kind == NodeKind.Sequence) return nil;
48+
if (root.kind == Sequence) return nil;
4949
return d.findChildNode(path, root.child_idx);
5050
}
5151

@@ -59,7 +59,7 @@ fn const Node* Data.findChildNode(const Data* d, const char* path, u32 next) {
5959
if (rest) { // match
6060
path = rest;
6161
if (path[0] == 0) return node; // found node
62-
if (node.kind == NodeKind.Sequence) return nil; // dont search in sequence
62+
if (node.kind == Sequence) return nil; // dont search in sequence
6363
next = node.child_idx;
6464
continue;
6565
}
@@ -77,7 +77,7 @@ public type Iter struct {
7777

7878
public fn Iter Parser.getNodeChildIter(const Parser* p, const Node* n) {
7979
Iter iter = { .data = &p.data, .node = nil }
80-
if (n && n.kind != NodeKind.Scalar && n.child_idx) {
80+
if (n && n.kind != Scalar && n.child_idx) {
8181
iter.node = p.data.idx2node(n.child_idx);
8282
}
8383
return iter;
@@ -103,7 +103,7 @@ public fn const char* Iter.getName(const Iter* iter) {
103103

104104
public fn const char* Iter.getValue(const Iter* iter) {
105105
const Data* d = (Data*)iter.data;
106-
if (iter.node && iter.node.kind == NodeKind.Scalar) return &d.text[iter.node.text_idx];
106+
if (iter.node && iter.node.kind == Scalar) return &d.text[iter.node.text_idx];
107107
return nil;
108108
}
109109

@@ -112,7 +112,7 @@ public fn Iter Iter.getChildIter(Iter* parent) @(unused) {
112112
if (parent.node == nil) return iter;
113113

114114
const Node* n = parent.node;
115-
if (n.kind != NodeKind.Scalar && n.child_idx) {
115+
if (n.kind != Scalar && n.child_idx) {
116116
const Data* d = (Data*)iter.data;
117117
iter.node = d.idx2node(n.child_idx);
118118
}
@@ -122,7 +122,7 @@ public fn Iter Iter.getChildIter(Iter* parent) @(unused) {
122122
public fn const char* Iter.getChildScalarValue(Iter* iter, const char* path) {
123123
if (!iter.node) return nil;
124124

125-
if (iter.node.kind == NodeKind.Sequence) return nil;
125+
if (iter.node.kind == Sequence) return nil;
126126
const Data* d = (Data*)iter.data;
127127
const Node* n = d.findChildNode(path, iter.node.child_idx);
128128
if (n && n.isScalar()) return &d.text[n.text_idx];

common/yaml/yaml_parser.c2

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,13 @@ public fn void Parser.destroy(Parser* p) {
5252
public fn bool Parser.parse(Parser* p, const char* input) {
5353
p.tokenizer.init(input, &p.data, p.message);
5454

55-
p.token.kind = TokenKind.None;
55+
p.token.kind = None;
5656

5757
i32 res = setjmp(&p.jmp_err);
5858
if (res == 0) {
5959
p.consumeToken();
6060

61-
while (p.token.kind != TokenKind.Eof) p.parse_doc();
61+
while (p.token.kind != Eof) p.parse_doc();
6262
} else {
6363
// got error, error_msg should be set
6464
return false;
@@ -75,18 +75,20 @@ fn void Parser.error(Parser* p, const char* format @(printf_format), ...) {
7575
va_list args;
7676
va_start(args, format);
7777
char* cp = p.message;
78-
cp += vsnprintf(cp, MaxDiag-1, format, args);
78+
i32 len = vsnprintf(cp, MaxDiag, format, args);
7979
va_end(args);
80-
sprintf(cp, " %s", p.token.loc.str());
80+
if ((u32)len < MaxDiag) {
81+
snprintf(cp + len, MaxDiag - len, "at line %d:%d", p.token.loc.line, p.token.loc.column);
82+
}
8183
longjmp(&p.jmp_err, 1);
8284
}
8385

8486
fn void Parser.consumeToken(Parser* p) {
8587
p.tokenizer.lex(&p.token);
8688
#if YamlPrintToken
87-
printf("%s %s %d\n", p.token.str(), p.token.loc.str(), p.token.same_line);
89+
printf("%s pos %d:%d %d\n", p.token.str(), p.token.loc.line, p.token.loc.column, p.token.same_line);
8890
#endif
89-
if (p.token.kind == TokenKind.Error) longjmp(&p.jmp_err, 1);
91+
if (p.token.kind == Error) longjmp(&p.jmp_err, 1);
9092
}
9193

9294
fn void Parser.expectAndConsume(Parser* p, TokenKind kind) {
@@ -130,16 +132,16 @@ fn void Parser.parse_node(Parser* p) {
130132
case Plain_Scalar:
131133
case Single_Quoted_Scalar:
132134
case Double_Quoted_Scalar:
133-
Node* n = p.data.add_node(NodeKind.Unknown, p.token.text_idx);
134-
p.push_node(n, NodeKind.Unknown, p.cur_indent);
135+
Node* n = p.data.add_node(Unknown, p.token.text_idx);
136+
p.push_node(n, Unknown, p.cur_indent);
135137
p.consumeToken();
136-
p.expectAndConsume(TokenKind.Colon);
138+
p.expectAndConsume(Colon);
137139
p.parse_value();
138140
break;
139141
case Dash:
140142
p.consumeToken();
141-
Node* n = p.data.add_node(NodeKind.Unknown, 0);
142-
p.push_node(n, NodeKind.Sequence, p.cur_indent + 1);
143+
Node* n = p.data.add_node(Unknown, 0);
144+
p.push_node(n, Sequence, p.cur_indent + 1);
143145
p.parse_node_or_value();
144146
break;
145147
case Indent:
@@ -175,8 +177,8 @@ fn void Parser.parse_value(Parser* p) {
175177
return;
176178
case Dash:
177179
p.consumeToken();
178-
Node* n = p.data.add_node(NodeKind.Unknown, 0);
179-
p.push_node(n, NodeKind.Sequence, p.cur_indent + 1);
180+
Node* n = p.data.add_node(Unknown, 0);
181+
p.push_node(n, Sequence, p.cur_indent + 1);
180182
p.parse_node_or_value();
181183
return;
182184
case Indent:
@@ -208,7 +210,7 @@ fn void Parser.parse_node_or_value(Parser* p) {
208210
case Single_Quoted_Scalar:
209211
case Double_Quoted_Scalar:
210212
Token* next = p.tokenizer.lex_next();
211-
if (next.kind == TokenKind.Colon) {
213+
if (next.kind == Colon) {
212214
// NOTE: this doesn't work, because tokenizer doesn't know (and doesn't give DEDENT)
213215
p.cur_indent += 2; // one for dash, one for node
214216
// TEMP DIRTY HACK, how to do properly?
@@ -231,8 +233,8 @@ fn void Parser.doc_start(Parser* p) {
231233

232234
fn void Parser.doc_end(Parser* p) {
233235
p.cur_indent = -1;
234-
if (p.stack_size == 1 && p.stack[0].node.kind == NodeKind.Unknown) {
235-
p.stack[0].node.kind = NodeKind.Map;
236+
if (p.stack_size == 1 && p.stack[0].node.kind == Unknown) {
237+
p.stack[0].node.kind = Map;
236238
}
237239
p.pop();
238240
p.cur_indent = 0;
@@ -242,11 +244,11 @@ fn void Parser.doc_end(Parser* p) {
242244
fn void Parser.add_scalar_value(Parser* p, u32 value_idx) {
243245
StackLevel* top = &p.stack[p.stack_size-1];
244246
Node* n = top.node;
245-
if (n.kind != NodeKind.Unknown) {
247+
if (n.kind != Unknown) {
246248
//p.error("%s() cannot add scalar to node", __func__);
247249
p.error("%s() cannot add scalar to node", "add_scalar_value");
248250
}
249-
n.kind = NodeKind.Scalar;
251+
n.kind = Scalar;
250252
n.text_idx = value_idx;
251253
}
252254

@@ -260,7 +262,7 @@ fn void Parser.pop(Parser* p) {
260262
StackLevel* prev = &p.stack[p.stack_size-2];
261263
prev.last_child = top.node;
262264
}
263-
if (top.node.kind == NodeKind.Unknown) top.node.kind = NodeKind.Scalar;
265+
if (top.node.kind == Unknown) top.node.kind = Scalar;
264266

265267
top.indent = 0;
266268
top.node = nil;
@@ -270,7 +272,7 @@ fn void Parser.pop(Parser* p) {
270272
}
271273

272274
fn void Parser.push_root(Parser* p) {
273-
Node* root = p.data.add_node(NodeKind.Unknown, 0);
275+
Node* root = p.data.add_node(Unknown, 0);
274276
StackLevel* top = &p.stack[0];
275277
if (p.stack_size) {
276278
top.node.next_idx = p.data.node2idx(root);
@@ -295,7 +297,7 @@ fn void Parser.push_node(Parser* p, Node* n, NodeKind parent_kind, i32 indent) {
295297
if (top.indent == indent) { // same level
296298
if (top.node) {
297299
// close old node as SCALAR with empty data
298-
if (top.node.kind == NodeKind.Unknown) top.node.kind = NodeKind.Scalar;
300+
if (top.node.kind == Unknown) top.node.kind = Scalar;
299301
top.node.next_idx = n_idx;
300302
}
301303
top.last_child = nil;
@@ -304,9 +306,9 @@ fn void Parser.push_node(Parser* p, Node* n, NodeKind parent_kind, i32 indent) {
304306
assert(indent > top.indent);
305307
Node* parent = top.node;
306308

307-
if (parent.kind == NodeKind.Unknown) {
309+
if (parent.kind == Unknown) {
308310
// just assign it
309-
if (parent_kind == NodeKind.Unknown) parent_kind = NodeKind.Map;
311+
if (parent_kind == Unknown) parent_kind = Map;
310312
parent.kind = parent_kind;
311313
}
312314
if (top.last_child) {
@@ -326,9 +328,8 @@ fn void Parser.push_node(Parser* p, Node* n, NodeKind parent_kind, i32 indent) {
326328
StackLevel* prev = &p.stack[p.stack_size-2];
327329
Node* parent = prev.node;
328330

329-
if (parent.kind != parent_kind
330-
&& !(parent.kind == NodeKind.Map && parent_kind == NodeKind.Unknown)) {
331-
if (parent.kind == NodeKind.Sequence) {
331+
if (parent.kind != parent_kind && !(parent.kind == Map && parent_kind == Unknown)) {
332+
if (parent.kind == Sequence) {
332333
p.error("invalid scalar after sequence");
333334
} else {
334335
p.error("invalid scalar after %s", node_names[parent.kind]);

0 commit comments

Comments
 (0)