more robust whitespace handling in reconstruction (#271)

kkozik-amplify · web-flow · commit 43dd96fee3e3 · 2026-03-11T12:09:48.000+01:00
* Add postlexer to support multiline binary operators and ternary expressions

* CLAUED.md - add pre-commit section
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -131,6 +131,14 @@ Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`).
 
 Always run round-trip full test suite after any modification.
 
+## Pre-commit Checks
+
+Hooks are defined in `.pre-commit-config.yaml` (includes black, mypy, pylint, and others). All changed files must pass these checks before committing. When writing or modifying code:
+
+- Format Python with **black** (Python 3.8 target).
+- Ensure **mypy** and **pylint** pass. Pylint config is in `pylintrc`, scoped to `hcl2/` and `test/`.
+- End files with a newline; strip trailing whitespace (except under `test/integration/(hcl2_reconstructed|specialized)/`).
+
 ## Keeping Docs Current
 
 Update this file when architecture, modules, API surface, or testing conventions change. Also update `README.md` and `docs/usage.md` when changes affect the public API, CLI flags, or option fields.
diff --git a/hcl2/formatter.py b/hcl2/formatter.py
@@ -280,8 +280,8 @@ def _align_attributes_sequence(self, attributes_sequence: List[AttributeRule]):
         for attribute in attributes_sequence:
             name_length = len(attribute.identifier.token.value)
             spaces_to_add = max_length - name_length
-            base = attribute.children[1].value.lstrip(" ")
-            attribute.children[1].set_value(" " * spaces_to_add + base)
+            base = attribute.children[1].value.lstrip(" \t")
+            attribute.children[1].set_value(" " * (spaces_to_add + 1) + base)
 
     def _vertically_align_object_elems(self, rule: ObjectRule):
         max_length = max(self._key_text_width(elem.key) for elem in rule.elements)
@@ -294,8 +294,12 @@ def _vertically_align_object_elems(self, rule: ObjectRule):
             if isinstance(separator, COLON):  # type: ignore[misc]
                 spaces_to_add += 1
 
-            base = separator.value.lstrip(" ")
-            elem.children[1].set_value(" " * spaces_to_add + base)
+            base = separator.value.lstrip(" \t")
+            # EQ gets +1 for the base space (reconstructor no longer adds it
+            # when the token already has leading whitespace); COLON already
+            # has its own +1 above and the reconstructor doesn't add space.
+            extra = 1 if not isinstance(separator, COLON) else 0  # type: ignore[misc]
+            elem.children[1].set_value(" " * (spaces_to_add + extra) + base)
 
     @staticmethod
     def _key_text_width(key: LarkElement) -> int:
diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark
@@ -50,7 +50,7 @@ RSQB : "]"
 COMMA : ","
 DOT : "."
 EQ : /[ \t]*=(?!=|>)/
-COLON : ":"
+COLON : /[ \t]*:(?!:)/
 DBLQUOTE : "\""
 
 // Interpolation
diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py
@@ -4,6 +4,7 @@
 from lark import Tree, Token
 from hcl2.rules import tokens
 from hcl2.rules.base import BlockRule
+from hcl2.rules.containers import ObjectElemRule
 from hcl2.rules.for_expressions import ForIntroRule, ForTupleExprRule, ForObjectExprRule
 from hcl2.rules.literal_rules import IdentifierRule
 from hcl2.rules.strings import StringRule
@@ -76,14 +77,20 @@ def _should_add_space_before(
                 or self._last_token_name
                 in [tokens.COLON.lark_name(), tokens.QMARK.lark_name()]
             ):
+                # COLON may already carry leading whitespace from the grammar
+                if token_type == tokens.COLON.lark_name() and str(
+                    current_node
+                ).startswith((" ", "\t")):
+                    return False
                 return True
 
-            # Space after
+            # Space before colon in for_intro
             if (
                 parent_rule_name == ForIntroRule.lark_name()
                 and token_type == tokens.COLON.lark_name()
             ):
-
+                if str(current_node).startswith((" ", "\t")):
+                    return False
                 return True
 
             # Space after commas in tuples and function arguments...
@@ -120,10 +127,28 @@ def _should_add_space_before(
                 return True
 
             # Space after ellipsis in function arguments
+            # ... except before newlines which provide their own whitespace
             if self._last_token_name == tokens.ELLIPSIS.lark_name():
+                if token_type == "NL_OR_COMMENT":
+                    return False
                 return True
 
-            if tokens.EQ.lark_name() in [token_type, self._last_token_name]:
+            # Space around EQ and COLON separators in attributes/object elements.
+            # Both terminals may carry leading whitespace from the original
+            # source (e.g. "   =" for aligned attributes, " :" for object
+            # elements).  Skip the automatic space when the token already
+            # provides it.  COLON only gets space if it already has leading
+            # whitespace (unlike EQ which always gets at least one space).
+            if token_type == tokens.EQ.lark_name():
+                if str(current_node).startswith((" ", "\t")):
+                    return False
+                return True
+            if token_type == tokens.COLON.lark_name():
+                return False
+            if self._last_token_name == tokens.EQ.lark_name():
+                # Don't add space before newlines which provide their own whitespace
+                if token_type == "NL_OR_COMMENT":
+                    return False
                 return True
 
             # Don't add space around operator tokens inside unary_op
@@ -158,14 +183,16 @@ def _should_add_space_before(
             ):
                 return True
 
-            # Space after colon in for expressions (before value expression,
-            # but not before newline/comment which provides its own whitespace)
+            # Space after colon in for expressions and object elements
+            # (before value expression, but not before newline/comment
+            # which provides its own whitespace)
             if (
                 self._last_token_name == tokens.COLON.lark_name()
                 and parent_rule_name
                 in [
                     ForTupleExprRule.lark_name(),
                     ForObjectExprRule.lark_name(),
+                    ObjectElemRule.lark_name(),
                 ]
                 and rule_name != "new_line_or_comment"
             ):
@@ -253,6 +280,14 @@ def reconstruct(self, tree: Tree, postproc=None) -> str:
         if postproc:
             result = postproc(result)
 
+        # The grammar's body rule ends with an optional new_line_or_comment
+        # which captures the final newline.  The parser often produces two
+        # NL_OR_COMMENT tokens for a single trailing newline (the statement
+        # separator plus the EOF newline), resulting in a spurious blank line.
+        # Strip exactly one trailing newline when there are two or more.
+        if result.endswith("\n\n"):
+            result = result[:-1]
+
         # Ensure file ends with newline
         if result and not result.endswith("\n"):
             result += "\n"
diff --git a/hcl2/rules/expressions.py b/hcl2/rules/expressions.py
@@ -248,7 +248,9 @@ def serialize(
         """Serialize to 'lhs operator rhs' string."""
         with context.modify(inside_dollar_string=True):
             lhs = self.expr_term.serialize(options, context)
-            operator = self.binary_term.binary_operator.serialize(options, context)
+            operator = str(
+                self.binary_term.binary_operator.serialize(options, context)
+            ).strip()
             rhs = self.binary_term.expr_term.serialize(options, context)
 
         result = f"{lhs} {operator} {rhs}"
@@ -286,7 +288,8 @@ def serialize(
     ) -> Any:
         """Serialize to 'operator operand' string."""
         with context.modify(inside_dollar_string=True):
-            result = f"{self.operator}{self.expr_term.serialize(options, context)}"
+            operator = self.operator.rstrip()
+            result = f"{operator}{self.expr_term.serialize(options, context)}"
 
         if not context.inside_dollar_string:
             result = to_dollar_string(result)
diff --git a/hcl2/transformer.py b/hcl2/transformer.py
@@ -83,10 +83,10 @@ def __default_token__(self, token: Token) -> StringToken:
         # TODO make this return StaticStringToken where applicable
         value = token.value
         # The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace.
-        # Strip it so parsed and deserialized tokens produce the same "=" value,
-        # preventing the reconstructor from emitting double spaces.
-        if token.type == "EQ":
-            value = value.lstrip(" \t")
+        # Preserve it so the direct pipeline (to_lark → reconstruct) retains
+        # original alignment.  The reconstructor skips its own space insertion
+        # when the EQ token already carries leading whitespace.
+
         if value in StaticStringToken.classes_by_value:
             return StaticStringToken.classes_by_value[value]()
         return StringToken[token.type](value)  # type: ignore[misc]
diff --git a/test/integration/hcl2_original/smoke.tf b/test/integration/hcl2_original/smoke.tf
@@ -3,7 +3,7 @@ block label1 label2 {
   a = 5
   b = 1256.5
   c = 15 + (10 * 12)
-  d = (- a)
+  d = (-a)
   e = (
     a == b
     ? true : false
diff --git a/test/integration/hcl2_reconstructed/smoke.tf b/test/integration/hcl2_reconstructed/smoke.tf
@@ -61,7 +61,7 @@ block {
         for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names :
         module.remote_state_subaccounts.map[account_name].outputs["aws_account_id"]
       ]
-    } ... 
+    } ...
     if substr(bucket_name, 0, 1) == "l"
   }
 }
diff --git a/test/integration/test_round_trip.py b/test/integration/test_round_trip.py
@@ -79,6 +79,14 @@ def _parse_and_serialize(hcl_text: str, options=None) -> dict:
     return rules.serialize()
 
 
+def _direct_reconstruct(hcl_text: str) -> str:
+    """Parse HCL text, transform to IR, convert to Lark tree, and reconstruct."""
+    parsed_tree = parses_to_tree(hcl_text)
+    rules = RuleTransformer().transform(parsed_tree)
+    lark_tree = rules.to_lark()
+    return HCLReconstructor().reconstruct(lark_tree)
+
+
 def _deserialize_and_reserialize(serialized: dict) -> dict:
     """Deserialize a Python dict back through the rule tree and reserialize."""
     deserializer = BaseDeserializer()
@@ -120,6 +128,33 @@ def test_hcl_to_json(self):
                 )
 
 
+class TestDirectReconstruction(TestCase):
+    """Test HCL2 → IR → HCL2 direct pipeline.
+
+    Parse HCL, transform to IR, convert directly to Lark tree (skipping
+    serialization to dict), reconstruct HCL, then verify the result
+    re-parses to the same JSON as the original.
+    """
+
+    maxDiff = None
+
+    def test_direct_reconstruct(self):
+        for suite in _get_suites():
+            with self.subTest(suite=suite):
+                hcl_path = _get_suite_file(suite, SuiteStep.ORIGINAL)
+                original_hcl = hcl_path.read_text()
+
+                # Direct: HCL → IR → Lark → HCL
+                reconstructed_hcl = _direct_reconstruct(original_hcl)
+
+                self.assertMultiLineEqual(
+                    reconstructed_hcl,
+                    original_hcl,
+                    f"Direct reconstruction mismatch for suite {suite}: "
+                    f"HCL → IR → HCL did not match original HCL",
+                )
+
+
 class TestRoundTripReserialization(TestCase):
     """Test JSON → JSON reserialization.
 

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ block {`
`61`	`61`	`for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names :`
`62`	`62`	`module.remote_state_subaccounts.map[account_name].outputs["aws_account_id"]`
`63`	`63`	`]`
`64`		`- } ...`
	`64`	`+ } ...`
`65`	`65`	`if substr(bucket_name, 0, 1) == "l"`
`66`	`66`	`}`
`67`	`67`	`}`