Skip to content

Commit 43dd96f

Browse files
more robust whitespace handling in reconstruction (#271)
* Add postlexer to support multiline binary operators and ternary expressions * CLAUED.md - add pre-commit section
1 parent 2dd72b7 commit 43dd96f

File tree

9 files changed

+103
-18
lines changed

9 files changed

+103
-18
lines changed

CLAUDE.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,14 @@ Use concrete stubs when testing ABCs (e.g., `StubExpression(ExpressionRule)`).
131131

132132
Always run round-trip full test suite after any modification.
133133

134+
## Pre-commit Checks
135+
136+
Hooks are defined in `.pre-commit-config.yaml` (includes black, mypy, pylint, and others). All changed files must pass these checks before committing. When writing or modifying code:
137+
138+
- Format Python with **black** (Python 3.8 target).
139+
- Ensure **mypy** and **pylint** pass. Pylint config is in `pylintrc`, scoped to `hcl2/` and `test/`.
140+
- End files with a newline; strip trailing whitespace (except under `test/integration/(hcl2_reconstructed|specialized)/`).
141+
134142
## Keeping Docs Current
135143

136144
Update this file when architecture, modules, API surface, or testing conventions change. Also update `README.md` and `docs/usage.md` when changes affect the public API, CLI flags, or option fields.

hcl2/formatter.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,8 @@ def _align_attributes_sequence(self, attributes_sequence: List[AttributeRule]):
280280
for attribute in attributes_sequence:
281281
name_length = len(attribute.identifier.token.value)
282282
spaces_to_add = max_length - name_length
283-
base = attribute.children[1].value.lstrip(" ")
284-
attribute.children[1].set_value(" " * spaces_to_add + base)
283+
base = attribute.children[1].value.lstrip(" \t")
284+
attribute.children[1].set_value(" " * (spaces_to_add + 1) + base)
285285

286286
def _vertically_align_object_elems(self, rule: ObjectRule):
287287
max_length = max(self._key_text_width(elem.key) for elem in rule.elements)
@@ -294,8 +294,12 @@ def _vertically_align_object_elems(self, rule: ObjectRule):
294294
if isinstance(separator, COLON): # type: ignore[misc]
295295
spaces_to_add += 1
296296

297-
base = separator.value.lstrip(" ")
298-
elem.children[1].set_value(" " * spaces_to_add + base)
297+
base = separator.value.lstrip(" \t")
298+
# EQ gets +1 for the base space (reconstructor no longer adds it
299+
# when the token already has leading whitespace); COLON already
300+
# has its own +1 above and the reconstructor doesn't add space.
301+
extra = 1 if not isinstance(separator, COLON) else 0 # type: ignore[misc]
302+
elem.children[1].set_value(" " * (spaces_to_add + extra) + base)
299303

300304
@staticmethod
301305
def _key_text_width(key: LarkElement) -> int:

hcl2/hcl2.lark

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ RSQB : "]"
5050
COMMA : ","
5151
DOT : "."
5252
EQ : /[ \t]*=(?!=|>)/
53-
COLON : ":"
53+
COLON : /[ \t]*:(?!:)/
5454
DBLQUOTE : "\""
5555

5656
// Interpolation

hcl2/reconstructor.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from lark import Tree, Token
55
from hcl2.rules import tokens
66
from hcl2.rules.base import BlockRule
7+
from hcl2.rules.containers import ObjectElemRule
78
from hcl2.rules.for_expressions import ForIntroRule, ForTupleExprRule, ForObjectExprRule
89
from hcl2.rules.literal_rules import IdentifierRule
910
from hcl2.rules.strings import StringRule
@@ -76,14 +77,20 @@ def _should_add_space_before(
7677
or self._last_token_name
7778
in [tokens.COLON.lark_name(), tokens.QMARK.lark_name()]
7879
):
80+
# COLON may already carry leading whitespace from the grammar
81+
if token_type == tokens.COLON.lark_name() and str(
82+
current_node
83+
).startswith((" ", "\t")):
84+
return False
7985
return True
8086

81-
# Space after
87+
# Space before colon in for_intro
8288
if (
8389
parent_rule_name == ForIntroRule.lark_name()
8490
and token_type == tokens.COLON.lark_name()
8591
):
86-
92+
if str(current_node).startswith((" ", "\t")):
93+
return False
8794
return True
8895

8996
# Space after commas in tuples and function arguments...
@@ -120,10 +127,28 @@ def _should_add_space_before(
120127
return True
121128

122129
# Space after ellipsis in function arguments
130+
# ... except before newlines which provide their own whitespace
123131
if self._last_token_name == tokens.ELLIPSIS.lark_name():
132+
if token_type == "NL_OR_COMMENT":
133+
return False
124134
return True
125135

126-
if tokens.EQ.lark_name() in [token_type, self._last_token_name]:
136+
# Space around EQ and COLON separators in attributes/object elements.
137+
# Both terminals may carry leading whitespace from the original
138+
# source (e.g. " =" for aligned attributes, " :" for object
139+
# elements). Skip the automatic space when the token already
140+
# provides it. COLON only gets space if it already has leading
141+
# whitespace (unlike EQ which always gets at least one space).
142+
if token_type == tokens.EQ.lark_name():
143+
if str(current_node).startswith((" ", "\t")):
144+
return False
145+
return True
146+
if token_type == tokens.COLON.lark_name():
147+
return False
148+
if self._last_token_name == tokens.EQ.lark_name():
149+
# Don't add space before newlines which provide their own whitespace
150+
if token_type == "NL_OR_COMMENT":
151+
return False
127152
return True
128153

129154
# Don't add space around operator tokens inside unary_op
@@ -158,14 +183,16 @@ def _should_add_space_before(
158183
):
159184
return True
160185

161-
# Space after colon in for expressions (before value expression,
162-
# but not before newline/comment which provides its own whitespace)
186+
# Space after colon in for expressions and object elements
187+
# (before value expression, but not before newline/comment
188+
# which provides its own whitespace)
163189
if (
164190
self._last_token_name == tokens.COLON.lark_name()
165191
and parent_rule_name
166192
in [
167193
ForTupleExprRule.lark_name(),
168194
ForObjectExprRule.lark_name(),
195+
ObjectElemRule.lark_name(),
169196
]
170197
and rule_name != "new_line_or_comment"
171198
):
@@ -253,6 +280,14 @@ def reconstruct(self, tree: Tree, postproc=None) -> str:
253280
if postproc:
254281
result = postproc(result)
255282

283+
# The grammar's body rule ends with an optional new_line_or_comment
284+
# which captures the final newline. The parser often produces two
285+
# NL_OR_COMMENT tokens for a single trailing newline (the statement
286+
# separator plus the EOF newline), resulting in a spurious blank line.
287+
# Strip exactly one trailing newline when there are two or more.
288+
if result.endswith("\n\n"):
289+
result = result[:-1]
290+
256291
# Ensure file ends with newline
257292
if result and not result.endswith("\n"):
258293
result += "\n"

hcl2/rules/expressions.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,9 @@ def serialize(
248248
"""Serialize to 'lhs operator rhs' string."""
249249
with context.modify(inside_dollar_string=True):
250250
lhs = self.expr_term.serialize(options, context)
251-
operator = self.binary_term.binary_operator.serialize(options, context)
251+
operator = str(
252+
self.binary_term.binary_operator.serialize(options, context)
253+
).strip()
252254
rhs = self.binary_term.expr_term.serialize(options, context)
253255

254256
result = f"{lhs} {operator} {rhs}"
@@ -286,7 +288,8 @@ def serialize(
286288
) -> Any:
287289
"""Serialize to 'operator operand' string."""
288290
with context.modify(inside_dollar_string=True):
289-
result = f"{self.operator}{self.expr_term.serialize(options, context)}"
291+
operator = self.operator.rstrip()
292+
result = f"{operator}{self.expr_term.serialize(options, context)}"
290293

291294
if not context.inside_dollar_string:
292295
result = to_dollar_string(result)

hcl2/transformer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,10 @@ def __default_token__(self, token: Token) -> StringToken:
8383
# TODO make this return StaticStringToken where applicable
8484
value = token.value
8585
# The EQ terminal /[ \t]*=(?!=|>)/ captures leading whitespace.
86-
# Strip it so parsed and deserialized tokens produce the same "=" value,
87-
# preventing the reconstructor from emitting double spaces.
88-
if token.type == "EQ":
89-
value = value.lstrip(" \t")
86+
# Preserve it so the direct pipeline (to_lark → reconstruct) retains
87+
# original alignment. The reconstructor skips its own space insertion
88+
# when the EQ token already carries leading whitespace.
89+
9090
if value in StaticStringToken.classes_by_value:
9191
return StaticStringToken.classes_by_value[value]()
9292
return StringToken[token.type](value) # type: ignore[misc]

test/integration/hcl2_original/smoke.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ block label1 label2 {
33
a = 5
44
b = 1256.5
55
c = 15 + (10 * 12)
6-
d = (- a)
6+
d = (-a)
77
e = (
88
a == b
99
? true : false

test/integration/hcl2_reconstructed/smoke.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ block {
6161
for account_name in var.route53_resolver_forwarding_rule_shares[forwarding_rule_key].aws_account_names :
6262
module.remote_state_subaccounts.map[account_name].outputs["aws_account_id"]
6363
]
64-
} ...
64+
} ...
6565
if substr(bucket_name, 0, 1) == "l"
6666
}
6767
}

test/integration/test_round_trip.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ def _parse_and_serialize(hcl_text: str, options=None) -> dict:
7979
return rules.serialize()
8080

8181

82+
def _direct_reconstruct(hcl_text: str) -> str:
83+
"""Parse HCL text, transform to IR, convert to Lark tree, and reconstruct."""
84+
parsed_tree = parses_to_tree(hcl_text)
85+
rules = RuleTransformer().transform(parsed_tree)
86+
lark_tree = rules.to_lark()
87+
return HCLReconstructor().reconstruct(lark_tree)
88+
89+
8290
def _deserialize_and_reserialize(serialized: dict) -> dict:
8391
"""Deserialize a Python dict back through the rule tree and reserialize."""
8492
deserializer = BaseDeserializer()
@@ -120,6 +128,33 @@ def test_hcl_to_json(self):
120128
)
121129

122130

131+
class TestDirectReconstruction(TestCase):
132+
"""Test HCL2 → IR → HCL2 direct pipeline.
133+
134+
Parse HCL, transform to IR, convert directly to Lark tree (skipping
135+
serialization to dict), reconstruct HCL, then verify the result
136+
re-parses to the same JSON as the original.
137+
"""
138+
139+
maxDiff = None
140+
141+
def test_direct_reconstruct(self):
142+
for suite in _get_suites():
143+
with self.subTest(suite=suite):
144+
hcl_path = _get_suite_file(suite, SuiteStep.ORIGINAL)
145+
original_hcl = hcl_path.read_text()
146+
147+
# Direct: HCL → IR → Lark → HCL
148+
reconstructed_hcl = _direct_reconstruct(original_hcl)
149+
150+
self.assertMultiLineEqual(
151+
reconstructed_hcl,
152+
original_hcl,
153+
f"Direct reconstruction mismatch for suite {suite}: "
154+
f"HCL → IR → HCL did not match original HCL",
155+
)
156+
157+
123158
class TestRoundTripReserialization(TestCase):
124159
"""Test JSON → JSON reserialization.
125160

0 commit comments

Comments
 (0)