python-hcl2/hcl2/reconstructor.py at 1a620b09ce6a72a58fb634738d9927b1866b1eea · eranor/python-hcl2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
"""A reconstructor for HCL2 implemented using Lark's experimental reconstruction functionality"""

import re
import json
from typing import List, Dict, Callable, Optional, Union, Any, Tuple

from lark import Lark, Tree
from lark.grammar import Terminal, Symbol
from lark.lexer import Token, PatternStr, TerminalDef
from lark.reconstruct import Reconstructor
from lark.tree_matcher import is_discarded_terminal
from lark.visitors import Transformer_InPlace

from hcl2.const import START_LINE_KEY, END_LINE_KEY
from hcl2.parser import reconstruction_parser


def _find_interpolation_end(text: str, start: int) -> int:
    r"""
    Locate the `}` that closes an interpolation.

    `start` is the index just past the opening `${`. Nested braces are
    balanced, and braces inside quoted strings are ignored. Both bare (`"`)
    and backslash-escaped (`\"`) quotes are treated as string delimiters,
    because the text handed to us has usually been quote-escaped already.

    Returns the index of the closing brace, or -1 if the interpolation is
    never closed.
    """
    depth = 1
    in_string = False
    pos = start
    length = len(text)
    while pos < length:
        char = text[pos]
        if char == "\\" and pos + 1 < length:
            # a backslash always consumes the character that follows it
            if text[pos + 1] == '"':
                in_string = not in_string
            pos += 2
            continue
        if char == '"':
            in_string = not in_string
        elif not in_string:
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return pos
        pos += 1
    return -1


# function to remove the backslashes within interpolated portions
def reverse_quotes_within_interpolation(interp_s: str) -> str:
    """
    A common operation is to `json.dumps(s)` where s is a string to output in
    HCL. This is useful for automatically escaping any quotes within the
    string, but this escapes quotes within interpolation incorrectly. This
    method removes any erroneous escapes within interpolated segments of a
    string.

    Each `${ ... }` segment is located separately by balancing braces, so
    escaped quotes in the literal text *between* two interpolations are left
    escaped. If a `${` is never closed, the remainder of the string is
    handled with the previous greedy-regex behaviour.
    """
    pieces = []
    cursor = 0
    while True:
        start = interp_s.find("${", cursor)
        if start == -1:
            break

        end = _find_interpolation_end(interp_s, start + 2)
        if end == -1:
            # unbalanced interpolation: keep the legacy behaviour for the
            # rest of the string rather than guessing where it ends
            pieces.append(
                re.sub(
                    r"\$\{(.*)}",
                    lambda m: m.group(0).replace('\\"', '"'),
                    interp_s[cursor:],
                )
            )
            return "".join(pieces)

        # literal text before the interpolation is copied untouched
        pieces.append(interp_s[cursor:start])
        # only the interpolation itself has its quotes un-escaped
        pieces.append(interp_s[start : end + 1].replace('\\"', '"'))
        cursor = end + 1

    pieces.append(interp_s[cursor:])
    return "".join(pieces)


class WriteTokensAndMetaTransformer(Transformer_InPlace):
    """
    Re-inserts the terminals that were discarded during parsing at the
    positions the grammar dictates, and tags every emitted leaf with
    metadata: the rule it was matched under and the terminal that produced
    it.

    This is a modification of lark.reconstruct.WriteTokensTransformer
    """

    tokens: Dict[str, TerminalDef]
    term_subs: Dict[str, Callable[[Symbol], str]]

    def __init__(
        self,
        tokens: Dict[str, TerminalDef],
        term_subs: Dict[str, Callable[[Symbol], str]],
    ) -> None:
        super().__init__()
        self.term_subs = term_subs
        self.tokens = tokens

    def __default__(self, data, children, meta):
        """
        Fallback callback used for tree nodes visited by the transformer.

        Nodes without a `match_tree` marker on their meta are rebuilt
        unchanged. Otherwise the node is flattened into a list whose leaves
        are `(rule, terminal, text)` tuples, following `meta.orig_expansion`.
        """
        if not getattr(meta, "match_tree", False):
            return Tree(data, children)

        # children that are already annotated tuples contribute only their text
        pending = (
            child[2] if isinstance(child, tuple) else child for child in children
        )

        annotated = []
        for sym in meta.orig_expansion:
            if not is_discarded_terminal(sym):
                item = next(pending)
                if isinstance(item, list):
                    # an already-flattened subtree: splice its leaves in
                    annotated.extend(item)
                elif isinstance(item, Token):
                    # tag the token with its rule (data) and terminal (sym)
                    annotated.append((data, sym, item))
                else:
                    annotated.append(item)
                continue

            # the terminal was dropped by the parser, so regenerate its text
            try:
                text = self.term_subs[sym.name](sym)
            except KeyError as exc:
                terminal_def = self.tokens[sym.name]
                if isinstance(terminal_def.pattern, PatternStr):
                    text = terminal_def.pattern.value
                else:
                    raise NotImplementedError(
                        f"Reconstructing regexps not supported yet: {terminal_def}"
                    ) from exc

            # tag the regenerated leaf with its rule (data) and terminal (sym)
            annotated.append((data, sym, text))

        return annotated


class HCLReconstructor(Reconstructor):
    """This class converts a Lark.Tree AST back into a string representing the underlying HCL code."""

    def __init__(
        self,
        parser: Lark,
        term_subs: Optional[Dict[str, Callable[[Symbol], str]]] = None,
    ):
        """
        Set up reconstruction for the grammar held by `parser`.

        `term_subs` optionally maps a terminal name to a callable that
        produces the text to write for that terminal when it was discarded
        during parsing.
        """
        Reconstructor.__init__(self, parser, term_subs)

        self.write_tokens: WriteTokensAndMetaTransformer = (
            WriteTokensAndMetaTransformer(
                {token.name: token for token in self.tokens}, term_subs or {}
            )
        )

        # these variables track state during reconstruction to enable us to make
        # informed decisions about formatting output. They are primarily used
        # by the _should_add_space(...) method.
        self._last_char_space = True
        self._last_terminal: Union[Terminal, None] = None
        self._last_rule: Union[Tree, Token, None] = None
        self._deferred_item = None

    def _reset_state(self) -> None:
        """Return the formatting state to its initial, start-of-output values."""
        self._last_char_space = True
        self._last_terminal = None
        self._last_rule = None
        self._deferred_item = None

    def should_be_wrapped_in_spaces(self, terminal: Terminal) -> bool:
        """Whether given terminal should be wrapped in spaces"""
        return terminal.name in {
            "IF",
            "IN",
            "FOR",
            "FOR_EACH",
            "FOR_OBJECT_ARROW",
            "COLON",
            "QMARK",
            "BINARY_OP",
        }

    def _is_equals_sign(self, terminal) -> bool:
        """
        Whether the previously written token was the `EQ` of an attribute or
        object element and `terminal` (the one about to be written) is not a
        newline/comment.
        """
        return (
            isinstance(self._last_rule, Token)
            and self._last_rule.value in ("attribute", "object_elem")
            and self._last_terminal == Terminal("EQ")
            and terminal != Terminal("NL_OR_COMMENT")
        )

    # pylint: disable=too-many-branches, too-many-return-statements
    def _should_add_space(self, rule, current_terminal):
        """
        This method documents the situations in which we add space around
        certain tokens while reconstructing the generated HCL.

        Additional rules can be added here if the generated HCL has
        improper whitespace (affecting parse OR affecting ability to perfectly
        reconstruct a file down to the whitespace level.)

        It has the following information available to make its decision:

          - the last token (terminal) we output
          - the last rule that token belonged to
          - the current token (terminal) we're about to output
          - the rule the current token belongs to

        This should be sufficient to make a spacing decision.

        The checks run in order and the first one that applies decides.
        """
        # we don't need to add multiple spaces
        if self._last_char_space:
            return False

        # we don't add a space at the start of the file
        if not self._last_terminal or not self._last_rule:
            return False

        if self._is_equals_sign(current_terminal):
            return True

        # if we're in a ternary or binary operator, add space around the operator
        if (
            isinstance(rule, Token)
            and rule.value
            in [
                "conditional",
                "binary_operator",
            ]
            and self.should_be_wrapped_in_spaces(current_terminal)
        ):
            return True

        # if we just left a ternary or binary operator, add space around the
        # operator unless there's a newline already
        if (
            isinstance(self._last_rule, Token)
            and self._last_rule.value
            in [
                "conditional",
                "binary_operator",
            ]
            and self.should_be_wrapped_in_spaces(self._last_terminal)
            and current_terminal != Terminal("NL_OR_COMMENT")
        ):
            return True

        # if we're in a for or if statement and find a keyword, add a space
        if (
            isinstance(rule, Token)
            and rule.value
            in [
                "for_object_expr",
                "for_cond",
                "for_intro",
            ]
            and self.should_be_wrapped_in_spaces(current_terminal)
        ):
            return True

        # if we've just left a for or if statement and find a keyword, add a
        # space, unless we have a newline
        if (
            isinstance(self._last_rule, Token)
            and self._last_rule.value
            in [
                "for_object_expr",
                "for_cond",
                "for_intro",
            ]
            and self.should_be_wrapped_in_spaces(self._last_terminal)
            and current_terminal != Terminal("NL_OR_COMMENT")
        ):
            return True

        # if we're in a block
        if (isinstance(rule, Token) and rule.value == "block") or (
            isinstance(rule, str) and re.match(r"^__block_(star|plus)_.*", rule)
        ):
            # always add space before the starting brace
            if current_terminal == Terminal("LBRACE"):
                return True

            # always add space before the closing brace
            if current_terminal == Terminal(
                "RBRACE"
            ) and self._last_terminal != Terminal("LBRACE"):
                return True

            # always add space between string literals
            if current_terminal == Terminal("STRING_LIT"):
                return True

        # if we just opened a block, add a space, unless the block is empty
        # or has a newline
        if (
            isinstance(self._last_rule, Token)
            and self._last_rule.value == "block"
            and self._last_terminal == Terminal("LBRACE")
            and current_terminal not in [Terminal("RBRACE"), Terminal("NL_OR_COMMENT")]
        ):
            return True

        # if we're in a tuple or function arguments (this rule matches commas between items)
        if isinstance(self._last_rule, str) and re.match(
            r"^__(tuple|arguments)_(star|plus)_.*", self._last_rule
        ):

            # string literals, decimals, and identifiers should always be
            # preceded by a space if they're following a comma in a tuple or
            # function arg
            if current_terminal in [
                Terminal("STRING_LIT"),
                Terminal("DECIMAL"),
                Terminal("NAME"),
            ]:
                return True

        # the catch-all case, we're not sure, so don't add a space
        return False

    def _reconstruct(self, tree):
        """
        Yield, piece by piece, the text for `tree`.

        Commas are not written immediately: each one is held back until the
        next leaf is seen, and is dropped if that leaf closes a parenthesis
        or brace. Formatting state on `self` is updated as pieces are
        written so that `_should_add_space` can decide about spacing.
        """
        unreduced_tree = self.match_tree(tree, tree.data)
        res = self.write_tokens.transform(unreduced_tree)
        for item in res:
            # any time we encounter a child tree, we recurse
            if isinstance(item, Tree):
                yield from self._reconstruct(item)

            # every leaf should be a tuple, which contains information about
            # which terminal the leaf represents
            elif isinstance(item, tuple):
                rule, terminal, value = item

                # first, handle any deferred items
                if self._deferred_item is not None:
                    (
                        deferred_rule,
                        deferred_terminal,
                        deferred_value,
                    ) = self._deferred_item

                    # if we deferred a comma and the next character ends a
                    # parenthesis or block, we can throw it out
                    if deferred_terminal == Terminal("COMMA") and terminal in [
                        Terminal("RPAR"),
                        Terminal("RBRACE"),
                    ]:
                        pass
                    # in any other case, we print the deferred item
                    else:
                        yield deferred_value

                        # and do our bookkeeping
                        self._last_terminal = deferred_terminal
                        self._last_rule = deferred_rule
                        if deferred_value and not deferred_value[-1].isspace():
                            self._last_char_space = False

                    # clear the deferred item
                    self._deferred_item = None

                # potentially add a space before the next token
                if self._should_add_space(rule, terminal):
                    yield " "
                    self._last_char_space = True

                # potentially defer the item if needed
                if terminal in [Terminal("COMMA")]:
                    self._deferred_item = item
                else:
                    # otherwise print the next token
                    yield value

                    # and do our bookkeeping so we can make an informed
                    # decision about formatting next time
                    self._last_terminal = terminal
                    self._last_rule = rule
                    if value:
                        self._last_char_space = value[-1].isspace()

            else:
                raise RuntimeError(f"Unknown bare token type: {item}")

    def reconstruct(self, tree, postproc=None, insert_spaces=False):
        """Convert a Lark.Tree AST back into a string representation of HCL."""
        # start every run from a clean slate so that a reused instance does
        # not carry spacing decisions or a held-back comma over from the
        # previous tree
        self._reset_state()
        return Reconstructor.reconstruct(
            self,
            tree,
            postproc,
            insert_spaces,
        )


class HCLReverseTransformer:
    """
    The reverse of hcl2.transformer.DictTransformer. This class attempts to
    convert a dict back into a working AST, which can be written back out.
    """

    @staticmethod
    def _name_to_identifier(name: str) -> Tree:
        """Converts a string to a NAME token within an identifier rule."""
        return Tree(Token("RULE", "identifier"), [Token("NAME", name)])

    @staticmethod
    def _escape_interpolated_str(interp_s: str) -> str:
        """
        Render a Python string as the text of an HCL string literal.

        Strings that (after stripping whitespace) start with `<<` are passed
        through without quoting or escaping; only escaped quotes inside
        interpolations are reverted. Any other string is escaped, wrapped in
        double quotes, and then has the quote escapes inside interpolations
        reverted.
        """
        # `<<` also covers the `<<-` form
        if interp_s.strip().startswith("<<"):
            # For heredoc strings, preserve their format exactly
            return reverse_quotes_within_interpolation(interp_s)
        # Escape backslashes first (very important to do this first), so the
        # backslashes added by the later replacements are not doubled
        escaped = interp_s.replace("\\", "\\\\")
        # Escape quotes
        escaped = escaped.replace('"', '\\"')
        # Escape control characters
        escaped = escaped.replace("\n", "\\n")
        escaped = escaped.replace("\r", "\\r")
        escaped = escaped.replace("\t", "\\t")
        escaped = escaped.replace("\b", "\\b")
        escaped = escaped.replace("\f", "\\f")
        # find each interpolation within the string and remove the backslashes
        return reverse_quotes_within_interpolation(f'"{escaped}"')

    @staticmethod
    def _block_has_label(block: dict) -> bool:
        """Whether `block` has exactly one key."""
        return len(block.keys()) == 1

    def __init__(self):
        pass

    def transform(self, hcl_dict: dict) -> Tree:
        """Given a dict, return a Lark.Tree representing the HCL AST."""
        level = 0
        body = self._transform_dict_to_body(hcl_dict, level)
        start = Tree(Token("RULE", "start"), [body])
        return start

    @staticmethod
    def _is_string_wrapped_tf(interp_s: str) -> bool:
        """
        Determines whether a string is a complex HCL data structure
        wrapped in ${ interpolation } characters.
        """
        if not interp_s.startswith("${") or not interp_s.endswith("}"):
            return False

        nested_tokens = []
        for match in re.finditer(r"\$?\{|}", interp_s):
            if match.group(0) in ["${", "{"]:
                nested_tokens.append(match.group(0))
            elif match.group(0) == "}":
                nested_tokens.pop()

            # if we exit ${ interpolation } before the end of the string,
            # this interpolated string has string parts and can't represent
            # a valid HCL expression on its own (without quotes)
            if len(nested_tokens) == 0 and match.end() != len(interp_s):
                return False

        return True

    @classmethod
    def _unwrap_interpolation(cls, value: str) -> str:
        """Strip the surrounding `${` and `}` if `value` is fully wrapped in them."""
        if cls._is_string_wrapped_tf(value):
            return value[2:-1]
        return value

    def _newline(self, level: int, count: int = 1) -> Tree:
        """
        Build a `new_line_or_comment` tree holding `count` newline tokens,
        each followed by two spaces of indentation per `level`.
        """
        return Tree(
            Token("RULE", "new_line_or_comment"),
            [Token("NL_OR_COMMENT", f"\n{'  ' * level}") for _ in range(count)],
        )

    def _is_block(self, value: Any) -> bool:
        """
        Whether `value` looks like a block rather than a plain attribute value.

        A dict counts as a block when it carries line metadata, or when the
        value under its first key does (recursively). A non-empty list is
        judged by its first element. Everything else is not a block.
        """
        if isinstance(value, dict):
            block_body = value
            if (
                START_LINE_KEY in block_body.keys()
                or END_LINE_KEY in block_body.keys()
            ):
                return True

            try:
                # if block is labeled, actual body might be nested
                # pylint: disable=W0612
                block_label, block_body = next(iter(value.items()))
            except StopIteration:
                # no more potential labels = nothing more to check
                return False

            return self._is_block(block_body)

        if isinstance(value, list):
            if len(value) > 0:
                return self._is_block(value[0])

        return False

    def _calculate_block_labels(self, block: dict) -> Tuple[List[str], dict]:
        """
        Split a block dict into its labels and its body.

        Single-key dicts are peeled off as labels, outermost first, until a
        dict carrying line metadata (or one without exactly one key) is
        reached; that dict is returned as the body.
        """
        # if block doesn't have a label
        if len(block.keys()) != 1:
            return [], block

        # otherwise, find the label
        curr_label = list(block)[0]
        potential_body = block[curr_label]

        # __start_line__ and __end_line__ metadata are not labels
        if (
            START_LINE_KEY in potential_body.keys()
            or END_LINE_KEY in potential_body.keys()
        ):
            return [curr_label], potential_body

        # recurse and append the label
        next_label, block_body = self._calculate_block_labels(potential_body)
        return [curr_label] + next_label, block_body

    # pylint:disable=R0914
    def _transform_dict_to_body(self, hcl_dict: dict, level: int) -> Tree:
        """
        Build a `body` tree from a dict of attributes and blocks.

        `level` is the nesting depth and controls the indentation written
        after each newline.
        """
        # we add a newline at the top of a body within a block, not the root
        # body; the __start_line__ and __end_line__ metadata do not count as
        # content, however many of them are present
        has_content = any(
            key not in (START_LINE_KEY, END_LINE_KEY) for key in hcl_dict
        )
        if level > 0 and has_content:
            children = [self._newline(level)]
        else:
            children = []

        # iterate through each attribute or sub-block of this block
        for key, value in hcl_dict.items():
            if key in [START_LINE_KEY, END_LINE_KEY]:
                continue

            # construct the identifier, whether that be a block type name or an attribute key
            identifier_name = self._name_to_identifier(key)

            # first, check whether the value is a "block"
            if self._is_block(value):
                for block_v in value:
                    block_labels, block_body_dict = self._calculate_block_labels(
                        block_v
                    )
                    block_label_tokens = [
                        Token("STRING_LIT", f'"{block_label}"')
                        for block_label in block_labels
                    ]
                    block_body = self._transform_dict_to_body(
                        block_body_dict, level + 1
                    )

                    # create our actual block to add to our own body
                    block = Tree(
                        Token("RULE", "block"),
                        [identifier_name] + block_label_tokens + [block_body],
                    )
                    children.append(block)
                    # add empty line after block
                    new_line = self._newline(level - 1)
                    # add empty line with indentation for next element in the block
                    new_line.children.append(self._newline(level).children[0])

                    children.append(new_line)

            # if the value isn't a block, it's an attribute
            else:
                expr_term = self._transform_value_to_expr_term(value, level)
                attribute = Tree(
                    Token("RULE", "attribute"),
                    [identifier_name, Token("EQ", " ="), expr_term],
                )
                children.append(attribute)
                children.append(self._newline(level))

        # since we're leaving a block body here, reduce the indentation of the
        # final newline if it exists
        if (
            len(children) > 0
            and isinstance(children[-1], Tree)
            and children[-1].data.type == "RULE"
            and children[-1].data.value == "new_line_or_comment"
        ):
            children[-1] = self._newline(level - 1)

        return Tree(Token("RULE", "body"), children)

    # pylint: disable=too-many-branches, too-many-return-statements
    def _transform_value_to_expr_term(self, value, level) -> Union[Token, Tree]:
        """Transforms a value from a dictionary into an "expr_term" (a value in HCL2)

        Anything passed to this function is treated "naively". Any lists passed
        are assumed to be tuples, and any dicts passed are assumed to be objects.
        No more checks will be performed for either to see if they are "blocks"
        as this check happens in `_transform_dict_to_body`.

        Raises RuntimeError for a value of an unsupported type, or when a
        `${...}`-wrapped string does not parse into the expected tree shape.
        """

        # for lists, recursively turn the child elements into expr_terms and
        # store within a tuple
        if isinstance(value, list):
            tuple_tree = Tree(
                Token("RULE", "tuple"),
                [
                    self._transform_value_to_expr_term(tuple_v, level)
                    for tuple_v in value
                ],
            )
            return Tree(Token("RULE", "expr_term"), [tuple_tree])

        if value is None:
            return Tree(
                Token("RULE", "expr_term"),
                [Tree(Token("RULE", "identifier"), [Token("NAME", "null")])],
            )

        # for dicts, recursively turn the child k/v pairs into object elements
        # and store within an object
        if isinstance(value, dict):
            # drop line metadata up front so that the "is this the last
            # element?" test below counts real elements only
            items = [
                (k, dict_v)
                for k, dict_v in value.items()
                if k not in (START_LINE_KEY, END_LINE_KEY)
            ]
            elements = []

            # if the object has elements, put it on a newline
            if items:
                elements.append(self._newline(level + 1))

            # iterate through the items and add them to the object
            last_index = len(items) - 1
            for i, (k, dict_v) in enumerate(items):
                value_expr_term = self._transform_value_to_expr_term(dict_v, level + 1)
                k = self._unwrap_interpolation(k)
                elements.append(
                    Tree(
                        Token("RULE", "object_elem"),
                        [
                            Tree(
                                Token("RULE", "object_elem_key"),
                                [Tree(Token("RULE", "identifier"), [Token("NAME", k)])],
                            ),
                            Token("EQ", " ="),
                            value_expr_term,
                        ],
                    )
                )

                # add indentation appropriately: stay at the element level
                # between elements, drop back for the closing brace
                if i < last_index:
                    elements.append(self._newline(level + 1))
                else:
                    elements.append(self._newline(level))
            return Tree(
                Token("RULE", "expr_term"), [Tree(Token("RULE", "object"), elements)]
            )

        # treat booleans appropriately (must precede the int check, since
        # bool is a subclass of int)
        if isinstance(value, bool):
            return Tree(
                Token("RULE", "expr_term"),
                [
                    Tree(
                        Token("RULE", "identifier"),
                        [Token("NAME", "true" if value else "false")],
                    )
                ],
            )

        # store integers as literals, digit by digit
        if isinstance(value, int):
            return Tree(
                Token("RULE", "expr_term"),
                [
                    Tree(
                        Token("RULE", "int_lit"),
                        [Token("DECIMAL", digit) for digit in str(value)],
                    )
                ],
            )

        # store strings as single literals
        if isinstance(value, str):
            # potentially unpack a complex syntax structure
            if self._is_string_wrapped_tf(value):
                # we have to unpack it by parsing it. The check above
                # guarantees the leading `${` and trailing `}`, so slice them
                # off; a `.`-based regex would stop at the first newline
                wrapped_value = value[2:-1]
                ast = reconstruction_parser().parse(f"value = {wrapped_value}")

                if ast.data != Token("RULE", "start"):
                    raise RuntimeError("Token must be `start` RULE")

                body = ast.children[0]
                if body.data != Token("RULE", "body"):
                    raise RuntimeError("Token must be `body` RULE")

                attribute = body.children[0]
                if attribute.data != Token("RULE", "attribute"):
                    raise RuntimeError("Token must be `attribute` RULE")

                if attribute.children[1] != Token("EQ", " ="):
                    raise RuntimeError("Token must be `EQ (=)` rule")

                parsed_value = attribute.children[2]
                return parsed_value

            # otherwise it's just a string.
            return Tree(
                Token("RULE", "expr_term"),
                [Token("STRING_LIT", self._escape_interpolated_str(value))],
            )

        # otherwise, we don't know the type
        raise RuntimeError(f"Unknown type to transform {type(value)}")