Commit ba0039f

Add postlexer to support multiline binary operators and ternary expressions
1 parent a80e8e2 commit ba0039f

12 files changed: +370 −18 lines

CLAUDE.md

Lines changed: 19 additions & 4 deletions

@@ -3,16 +3,20 @@
 ## Pipeline
 
 ```
-Forward: HCL2 Text → Lark Parse Tree → LarkElement Tree → Python Dict/JSON
+Forward: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Python Dict/JSON
 Reverse: Python Dict/JSON → LarkElement Tree → Lark Tree → HCL2 Text
+Direct: HCL2 Text → [PostLexer] → Lark Parse Tree → LarkElement Tree → Lark Tree → HCL2 Text
 ```
 
+The **Direct** pipeline (`parse_to_tree` → `transform` → `to_lark` → `reconstruct`) skips serialization to dict, so all IR nodes (including `NewLineOrCommentRule` nodes for whitespace/comments) directly influence the reconstructed output. Any information discarded before the IR is lost in this pipeline.
+
 ## Module Map
 
 | Module | Role |
 |---|---|
 | `hcl2/hcl2.lark` | Lark grammar definition |
 | `hcl2/api.py` | Public API (`load/loads/dump/dumps` + intermediate stages) |
+| `hcl2/postlexer.py` | Token stream transforms between lexer and parser |
 | `hcl2/parser.py` | Lark parser factory with caching |
 | `hcl2/transformer.py` | Lark parse tree → LarkElement tree |
 | `hcl2/deserializer.py` | Python dict → LarkElement tree |
@@ -73,6 +77,16 @@ jsontohcl2 --indent 4 --no-align file.json
 
 Add new options as `parser.add_argument()` calls in the relevant entry point module.
 
+## PostLexer (`postlexer.py`)
+
+Lark's `postlex` parameter accepts a single object with a `process(stream)` method that transforms the token stream between the lexer and LALR parser. The `PostLexer` class is designed for extensibility: each transformation is a private method that accepts and yields tokens, and `process()` chains them together.
+
+Current passes:
+
+- `_merge_newlines_into_operators`
+
+To add a new pass: create a private method with the same `(self, stream) -> generator` signature, and add a `yield from` call in `process()`.
+
 ## Hard Rules
 
 These are project-specific constraints that must not be violated:
@@ -88,6 +102,7 @@ These are project-specific constraints that must not be violated:
 ## Adding a New Language Construct
 
 1. Add grammar rules to `hcl2.lark`
+1. If the new construct creates LALR ambiguities with `NL_OR_COMMENT`, add a postlexer pass in `postlexer.py`
 1. Create rule class(es) in the appropriate `rules/` file
 1. Add transformer method(s) in `transformer.py`
 1. Implement `serialize()` in the rule class
@@ -103,9 +118,9 @@ python -m unittest discover -s test -p "test_*.py" -v
 
 **Unit tests** (`test/unit/`): instantiate rule objects directly (no parsing).
 
-- `test/unit/rules/` — one file per rules module
-- `test/unit/cli/` — one file per CLI module
-- `test/unit/test_api.py`, `test_builder.py`, `test_deserializer.py`, `test_formatter.py`, `test_reconstructor.py`, `test_utils.py`
+- `rules/` — one file per rules module
+- `cli/` — one file per CLI module
+- `test_*.py` — tests for corresponding files from `hcl2/` directory
 
 Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`).

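The pass-chaining convention described in the PostLexer section above can be sketched as follows. This is a hypothetical skeleton, not the project's code: the second pass and its name are invented to show where a new one would plug in, and the bodies are placeholders.

```python
from typing import Iterator


class SketchPostLexer:
    """Hypothetical skeleton of the pass-chaining convention.

    Only _merge_newlines_into_operators exists in the commit; the second
    pass here is invented to show where a new one would be added.
    """

    def process(self, stream: Iterator[str]) -> Iterator[str]:
        # Feed each pass's output into the next; adding a pass is one line here.
        stream = self._merge_newlines_into_operators(stream)
        stream = self._example_future_pass(stream)
        yield from stream

    def _merge_newlines_into_operators(self, stream: Iterator[str]) -> Iterator[str]:
        yield from stream  # placeholder; the real logic lives in hcl2/postlexer.py

    def _example_future_pass(self, stream: Iterator[str]) -> Iterator[str]:
        yield from stream  # placeholder for a hypothetical new pass
```

Because every pass shares the `(self, stream) -> generator` shape, passes compose without knowing about each other.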
hcl2/parser.py

Lines changed: 3 additions & 0 deletions

@@ -4,6 +4,8 @@
 
 from lark import Lark
 
+from hcl2.postlexer import PostLexer
+
 
 PARSER_FILE = Path(__file__).absolute().resolve().parent / ".lark_cache.bin"
 
@@ -17,4 +19,5 @@ def parser() -> Lark:
         cache=str(PARSER_FILE),  # Disable/Delete file to effect changes to the grammar
         rel_to=__file__,
         propagate_positions=True,
+        postlex=PostLexer(),
     )

hcl2/postlexer.py

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+"""Postlexer that transforms the token stream between the Lark lexer and parser.
+
+Each transformation is implemented as a private method that accepts and yields
+tokens. The public ``process`` method chains them together, making it easy to
+add new passes without touching existing logic.
+"""
+
+from collections.abc import Iterator
+from typing import FrozenSet, Optional, Tuple
+
+from lark import Token
+
+# Type alias for a token stream consumed and produced by each pass.
+TokenStream = Iterator[Token]
+
+# Operator token types that may legally follow a line-continuation newline.
+# MINUS is excluded — it is also the unary negation operator, and merging a
+# newline into MINUS would incorrectly consume statement-separating newlines
+# before negative literals (e.g. "a = 1\nb = -2").
+OPERATOR_TYPES: FrozenSet[str] = frozenset(
+    {
+        "DOUBLE_EQ",
+        "NEQ",
+        "LT",
+        "GT",
+        "LEQ",
+        "GEQ",
+        "ASTERISK",
+        "SLASH",
+        "PERCENT",
+        "DOUBLE_AMP",
+        "DOUBLE_PIPE",
+        "PLUS",
+        "QMARK",
+    }
+)
+
+
+class PostLexer:
+    """Transform the token stream before it reaches the LALR parser."""
+
+    def process(self, stream: TokenStream) -> TokenStream:
+        """Chain all postlexer passes over the token stream."""
+        yield from self._merge_newlines_into_operators(stream)
+
+    def _merge_newlines_into_operators(self, stream: TokenStream) -> TokenStream:
+        """Merge NL_OR_COMMENT tokens into immediately following operator tokens.
+
+        LALR parsers cannot distinguish a statement-ending newline from a
+        line-continuation newline before a binary operator. This pass resolves
+        the ambiguity by merging NL_OR_COMMENT into the operator token's value
+        when the next token is a binary operator or QMARK. The transformer
+        later extracts the newline prefix and creates a NewLineOrCommentRule
+        node, preserving round-trip fidelity.
+        """
+        pending_nl: Optional[Token] = None
+        for token in stream:
+            if token.type == "NL_OR_COMMENT":
+                if pending_nl is not None:
+                    yield pending_nl
+                pending_nl = token
+            else:
+                if pending_nl is not None:
+                    if token.type in OPERATOR_TYPES:
+                        token = token.update(value=str(pending_nl) + str(token))
+                    else:
+                        yield pending_nl
+                    pending_nl = None
+                yield token
+        if pending_nl is not None:
+            yield pending_nl
+
+    @property
+    def always_accept(self) -> Tuple[()]:
+        """Terminal names the parser must accept even when not expected by LALR.
+
+        Lark requires this property on postlexer objects. An empty tuple
+        means no extra terminals are injected.
+        """
+        return ()

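The merging pass above can be exercised in isolation. The sketch below re-implements its core loop over a minimal stand-in token type (a dataclass, not `lark.Token`, so it runs without lark installed) and an abbreviated operator set; it is an illustration of the technique, not the project's class.

```python
from dataclasses import dataclass
from typing import Iterator, List, Optional


@dataclass
class Tok:
    """Minimal stand-in for lark.Token."""
    type: str
    value: str


# Abbreviated set; the real OPERATOR_TYPES lists every binary operator plus QMARK.
OPERATOR_TYPES = {"PLUS", "QMARK"}


def merge_newlines(stream: Iterator[Tok]) -> Iterator[Tok]:
    """Re-implementation of _merge_newlines_into_operators' core loop."""
    pending: Optional[Tok] = None
    for tok in stream:
        if tok.type == "NL_OR_COMMENT":
            if pending is not None:
                yield pending          # two newline tokens in a row: flush the first
            pending = tok
        else:
            if pending is not None:
                if tok.type in OPERATOR_TYPES:
                    # Line-continuation: fold the newline into the operator's value.
                    tok = Tok(tok.type, pending.value + tok.value)
                else:
                    yield pending      # statement-separating newline: keep it
                pending = None
            yield tok
    if pending is not None:
        yield pending


# "a = 1\n+ 2": the newline precedes a binary operator, so it merges into PLUS.
toks = [Tok("ID", "a"), Tok("EQ", "="), Tok("INT", "1"),
        Tok("NL_OR_COMMENT", "\n"), Tok("PLUS", "+"), Tok("INT", "2")]
merged: List[Tok] = list(merge_newlines(toks))
```

The NL_OR_COMMENT token disappears from the stream, and the PLUS token's value becomes `"\n+"`, which the transformer later splits back apart.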
hcl2/reconstructor.py

Lines changed: 5 additions & 0 deletions

@@ -183,6 +183,11 @@ def _reconstruct_tree(
         # Check spacing BEFORE processing children, while _last_rule_name
         # still reflects the previous sibling (not a child of this tree).
         needs_space = self._should_add_space_before(tree, parent_rule_name)
+        if needs_space:
+            # A space will be inserted before this tree's output, so tell
+            # children that the last character was a space to prevent the
+            # first child from adding a duplicate leading space.
+            self._last_was_space = True
 
         if rule_name == UnaryOpRule.lark_name():
             for i, child in enumerate(tree.children):

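The purpose of setting the flag before recursing can be shown with a toy emitter (hypothetical, not the project's API): a boolean tracking whether the last emitted character was a space stops a parent and its first child from both inserting one.

```python
def join_with_spacing(tokens, wants_space):
    """Toy emitter: wants_space[i] marks tokens that request a leading space."""
    out = ""
    last_was_space = False
    for tok, wants in zip(tokens, wants_space):
        if wants and not last_was_space:
            out += " "
        out += tok
        # Record whether output ended on a space, as the reconstructor's
        # _last_was_space flag does, so the next token can skip its own.
        last_was_space = tok.endswith(" ")
    return out
```

Without the `last_was_space` check, a token ending in a space (like `"? "`) followed by a child that also requests a space would produce a double space in the output.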
hcl2/rules/expressions.py

Lines changed: 8 additions & 6 deletions

@@ -122,6 +122,7 @@ class ConditionalRule(ExpressionRule):
 
     _children_layout: Tuple[
         ExpressionRule,
+        Optional[NewLineOrCommentRule],
        QMARK,
         Optional[NewLineOrCommentRule],
         ExpressionRule,
@@ -137,7 +138,7 @@ def lark_name() -> str:
         return "conditional"
 
     def __init__(self, children, meta: Optional[Meta] = None):
-        self._insert_optionals(children, [2, 4, 6])
+        self._insert_optionals(children, [1, 3, 5, 7])
         super().__init__(children, meta)
 
     @property
@@ -148,12 +149,12 @@ def condition(self) -> ExpressionRule:
     @property
     def if_true(self) -> ExpressionRule:
         """Return the true-branch expression."""
-        return self._children[3]
+        return self._children[4]
 
     @property
     def if_false(self) -> ExpressionRule:
         """Return the false-branch expression."""
-        return self._children[7]
+        return self._children[8]
 
     def serialize(
         self, options=SerializationOptions(), context=SerializationContext()
@@ -179,6 +180,7 @@ class BinaryTermRule(ExpressionRule):
     """Rule for the operator+operand portion of a binary operation."""
 
     _children_layout: Tuple[
+        Optional[NewLineOrCommentRule],
         BinaryOperatorRule,
         Optional[NewLineOrCommentRule],
         ExprTermRule,
@@ -190,18 +192,18 @@ def lark_name() -> str:
         return "binary_term"
 
     def __init__(self, children, meta: Optional[Meta] = None):
-        self._insert_optionals(children, [1])
+        self._insert_optionals(children, [0, 2])
         super().__init__(children, meta)
 
     @property
     def binary_operator(self) -> BinaryOperatorRule:
         """Return the binary operator."""
-        return self._children[0]
+        return self._children[1]
 
     @property
     def expr_term(self) -> ExprTermRule:
         """Return the right-hand operand."""
-        return self._children[2]
+        return self._children[3]
 
     def serialize(
         self, options=SerializationOptions(), context=SerializationContext()

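The index shifts in this diff (`if_true` moving from `_children[3]` to `_children[4]`, and so on) follow from the optional-padding convention: every optional slot holds either a rule or `None`, so required children keep fixed positions. A hypothetical re-implementation of that padding, applied to the new conditional layout, shows why:

```python
from typing import Callable, List


def insert_optionals(children: List, optional_indices: List[int],
                     is_optional: Callable[[object], bool]) -> None:
    """Pad `children` with None at each optional slot that is not occupied.

    Hypothetical re-implementation for illustration; the real
    _insert_optionals lives on the project's rule base class.
    """
    for idx in optional_indices:
        if idx >= len(children) or not is_optional(children[idx]):
            children.insert(idx, None)


class NL:
    """Stands in for NewLineOrCommentRule."""


# New conditional layout:
# [cond, NL?, QMARK, NL?, if_true, NL?, COLON, NL?, if_false]
children = ["cond", "?", "true_expr", ":", "false_expr"]
insert_optionals(children, [1, 3, 5, 7], lambda c: isinstance(c, NL))
```

After padding, the true branch always sits at index 4 and the false branch at index 8, matching the updated property accessors, whether or not any newline nodes were present.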
hcl2/transformer.py

Lines changed: 39 additions & 3 deletions

@@ -81,9 +81,15 @@ def __init__(self, discard_new_line_or_comments: bool = False):
 
     def __default_token__(self, token: Token) -> StringToken:
         # TODO make this return StaticStringToken where applicable
-        if token.value in StaticStringToken.classes_by_value:
-            return StaticStringToken.classes_by_value[token.value]()
-        return StringToken[token.type](token.value)  # type: ignore[misc]
+        value = token.value
+        # The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace.
+        # Strip it so parsed and deserialized tokens produce the same "=" value,
+        # preventing the reconstructor from emitting double spaces.
+        if token.type == "EQ":
+            value = value.lstrip(" \t")
+        if value in StaticStringToken.classes_by_value:
+            return StaticStringToken.classes_by_value[value]()
+        return StringToken[token.type](value)  # type: ignore[misc]
 
     # pylint: disable=C0103
     def FLOAT_LITERAL(self, token: Token) -> FloatLiteral:
@@ -164,8 +170,32 @@ def heredoc_template_trim(self, meta: Meta, args) -> HeredocTrimTemplateRule:
     def expr_term(self, meta: Meta, args) -> ExprTermRule:
         return ExprTermRule(args, meta)
 
+    def _extract_nl_prefix(self, token):
+        """Strip leading newlines from a token value.
+
+        If the token contains a newline prefix (from the postlexer merging a
+        line-continuation newline into the operator token), strip it and
+        return a NewLineOrCommentRule. Otherwise return None.
+        """
+        value = str(token.value)
+        stripped = value.lstrip("\n \t")
+        if len(stripped) == len(value):
+            return None
+        nl_text = value[: len(value) - len(stripped)]
+        token.set_value(stripped)
+        if self.discard_new_line_or_comments:
+            return None
+        return NewLineOrCommentRule.from_string(nl_text)
+
     @v_args(meta=True)
     def conditional(self, meta: Meta, args) -> ConditionalRule:
+        # args: [condition, QMARK, NL?, if_true, NL?, COLON, NL?, if_false]
+        # QMARK is at index 1 — check for NL prefix from the postlexer
+        qmark_token = args[1]
+        nl_rule = self._extract_nl_prefix(qmark_token)
+        if nl_rule is not None:
+            args = list(args)
+            args.insert(1, nl_rule)
         return ConditionalRule(args, meta)
 
     @v_args(meta=True)
@@ -174,6 +204,12 @@ def binary_operator(self, meta: Meta, args) -> BinaryOperatorRule:
 
     @v_args(meta=True)
     def binary_term(self, meta: Meta, args) -> BinaryTermRule:
+        # args: [BinaryOperatorRule, NL?, ExprTermRule]
+        # The operator's token may contain a NL prefix from the postlexer
+        op_rule = args[0]
+        nl_rule = self._extract_nl_prefix(op_rule.token)
+        if nl_rule is not None:
+            args = [nl_rule] + list(args)
         return BinaryTermRule(args, meta)
 
     @v_args(meta=True)

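The string handling inside `_extract_nl_prefix` can be sketched as a pure function. This is an illustration only: the real method also mutates the token in place via `set_value` and returns `None` when comments are being discarded.

```python
from typing import Optional, Tuple


def split_nl_prefix(value: str) -> Tuple[Optional[str], str]:
    """Split a merged operator value into (newline_prefix, operator_text).

    Pure-function sketch of _extract_nl_prefix's string logic: strip
    leading newlines/whitespace, and recover the prefix as whatever
    lstrip removed.
    """
    stripped = value.lstrip("\n \t")
    if len(stripped) == len(value):
        return None, value          # no prefix: the postlexer did not merge anything
    return value[: len(value) - len(stripped)], stripped
```

Applied to a PLUS token whose value became `"\n  +"` after the postlexer pass, this recovers the `"\n  "` prefix (which becomes a `NewLineOrCommentRule`) and the bare `"+"` operator.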
test/integration/hcl2_original/smoke.tf

Lines changed: 22 additions & 0 deletions

@@ -43,6 +43,28 @@ block label1 label2 {
   }
 }
 
+block multiline_ternary {
+  foo = (
+    bar
+    ? baz(foo)
+    : foo == "bar"
+      ? "baz"
+      : foo
+  )
+}
+
+block multiline_binary_ops {
+  expr = {
+    for k, v in local.map_a : k => v
+    if lookup(local.map_b[v.id
+    ], "enabled", false)
+    || (
+      contains(local.map_c, v.id)
+      && contains(local.map_d, v.id)
+    )
+  }
+}
+
 block {
   route53_forwarding_rule_shares = {
     for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) :

test/integration/hcl2_reconstructed/smoke.tf

Lines changed: 14 additions & 0 deletions

@@ -39,6 +39,20 @@ block label1 label2 {
 }
 
 
+block multiline_ternary {
+  foo = (bar ? baz(foo) : foo == "bar" ? "baz" : foo)
+}
+
+
+block multiline_binary_ops {
+  expr = {
+    for k, v in local.map_a :
+    k => v
+    if lookup(local.map_b[v.id], "enabled", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))
+  }
+}
+
+
 block {
   route53_forwarding_rule_shares = {
     for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) :

test/integration/json_reserialized/smoke.json

Lines changed: 12 additions & 0 deletions

@@ -48,6 +48,18 @@
       }
     }
   },
+  {
+    "multiline_ternary": {
+      "foo": "${(bar ? baz(foo) : foo == \"bar\" ? \"baz\" : foo)}",
+      "__is_block__": true
+    }
+  },
+  {
+    "multiline_binary_ops": {
+      "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}",
+      "__is_block__": true
+    }
+  },
   {
     "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... if substr(bucket_name, 0, 1) == \"l\"}}",
     "__is_block__": true

test/integration/json_serialized/smoke.json

Lines changed: 12 additions & 0 deletions

@@ -48,6 +48,18 @@
       }
     }
   },
+  {
+    "multiline_ternary": {
+      "foo": "${(bar ? baz(foo) : foo == \"bar\" ? \"baz\" : foo)}",
+      "__is_block__": true
+    }
+  },
+  {
+    "multiline_binary_ops": {
+      "expr": "${{for k, v in local.map_a : k => v if lookup(local.map_b[v.id], \"enabled\", false) || (contains(local.map_c, v.id) && contains(local.map_d, v.id))}}",
+      "__is_block__": true
+    }
+  },
   {
     "route53_forwarding_rule_shares": "${{for forwarding_rule_key in keys(var.route53_resolver_forwarding_rule_shares) : \"${forwarding_rule_key}\" => {aws_account_ids = [for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names : module.remote_state_subaccounts.map[account_name].outputs[\"aws_account_id\"]]}... if substr(bucket_name, 0, 1) == \"l\"}}",
     "__is_block__": true
