Fix block scalar mangling bug #231

wrouesnel · wrouesnel · commit 0b4c13b286c0 · 2023-06-22T10:05:53.000+10:00
The regex based parsing for fixing comments was breaking block scalars.
By using the ruyaml round trip handler, instead the comment formatting
now can correctly identify block-scalars and avoid mangling them.
diff --git a/src/yamlfix/adapters.py b/src/yamlfix/adapters.py
@@ -1,5 +1,5 @@
 """Define adapter / helper classes to hide unrelated functionality in."""
-
+import io
 import logging
 import re
 from functools import partial
@@ -12,6 +12,7 @@
 from ruyaml.tokens import CommentToken
 
 from yamlfix.model import YamlfixConfig, YamlNodeStyle
+from yamlfix.util import walk_object
 
 log = logging.getLogger(__name__)
 
@@ -334,18 +335,12 @@ def __init__(self, yaml: Yaml, config: Optional[YamlfixConfig]) -> None:
         self.yaml = yaml.yaml
         self.config = config or YamlfixConfig()
 
-    def fix(self, source_code: str) -> str:
-        """Run all yaml source code fixers.
-
-        Args:
-            source_code: Source code to be corrected.
-
-        Returns:
-            Corrected source code.
-        """
-        log.debug("Running source code fixers...")
-
-        fixers = [
+        # The list of fixers to run, and the index of the currently called fixer.
+        # This allows fixers which might need to reinvoke other fixers to reinvoke the
+        # previous fixers (currently fix comments does this as it needs to re-emit the
+        # YAML).
+        self._fixer_idx = -1
+        self._fixers = [
             self._fix_truthy_strings,
             self._fix_jinja_variables,
             self._ruamel_yaml_fixer,
@@ -359,8 +354,40 @@ def fix(self, source_code: str) -> str:
             self._add_newline_at_end_of_file,
         ]
 
-        for fixer in fixers:
+    def _rerun_previous_fixers(self, source_code: str) -> str:
+        """Run all the previous source code fixers except the currently running one.
+
+        This is re-entrant safe and will correctly restore the fixer idx once it's
+        complete.
+
+        Args:
+            source_code: Source code to be corrected.
+
+        Returns:
+            Corrected source code.
+        """
+        cur_fixer_idx = self._fixer_idx
+        for fixer_idx, fixer in enumerate(self._fixers[:cur_fixer_idx]):
+            self._fixer_idx = fixer_idx
             source_code = fixer(source_code)
+        # Restore fixer idx
+        self._fixer_idx = cur_fixer_idx
+        return source_code
+
+    def fix(self, source_code: str) -> str:
+        """Run all yaml source code fixers.
+
+        Args:
+            source_code: Source code to be corrected.
+
+        Returns:
+            Corrected source code.
+        """
+        log.debug("Running source code fixers...")
+
+        # Just use the re-run system do it.
+        self._fixer_idx = len(self._fixers)
+        source_code = self._rerun_previous_fixers(source_code)
 
         return source_code
 
@@ -573,24 +600,102 @@ def _restore_truthy_strings(source_code: str) -> str:
     def _fix_comments(self, source_code: str) -> str:
         log.debug("Fixing comments...")
         config = self.config
-        comment_start = " " * config.comments_min_spaces_from_content + "#"
 
-        fixed_source_lines = []
+        # We need the source lines for the comment fixers to analyze whitespace easily
+        source_lines = source_code.splitlines()
 
-        for line in source_code.splitlines():
-            # Comment at the start of the line
-            if config.comments_require_starting_space and re.search(r"(^|\s)#\w", line):
-                line = line.replace("#", "# ")
-            # Comment in the middle of the line, but it's not part of a string
-            if (
-                config.comments_min_spaces_from_content > 1
-                and " #" in line
-                and line[-1] not in ["'", '"']
-            ):
-                line = re.sub(r"(.+\S)(\s+?)#", rf"\1{comment_start}", line)
-            fixed_source_lines.append(line)
+        yaml = YAML(typ="rt")
+        # Hijack config options from the regular fixer
+        yaml.explicit_start = self.yaml.explicit_start
+        yaml.width = self.yaml.width
+        # preserve_quotes however must always be true, otherwise we change output
+        # unexpectedly.
+        yaml.preserve_quotes = True  # type: ignore
+        yaml_documents = list(yaml.load_all(source_code))
 
-        return "\n".join(fixed_source_lines)
+        handled_comments: List[CommentToken] = []
+
+        def _comment_token_cb(target: Any) -> None:  # noqa:W0613,ANN401
+            """This function is a callback for walk_object.
+
+            This implements the actual comment fixing, and is used because comment
+            objects are structured in nested-lists and can repeat.
+            """
+            if not isinstance(target, CommentToken):
+                return
+            if any(target is e for e in handled_comments):
+                # This comment was handled at a higher level already.
+                return
+            if target.value is None:
+                return
+            comment_lines = target.value.split("\n")
+            fixed_comment_lines = []
+            for line in comment_lines:
+                if config.comments_require_starting_space and re.search(
+                    r"(^|\s)#\w", line
+                ):
+                    line = line.replace("#", "# ")
+                fixed_comment_lines.append(line)
+
+            # Update the comment with the fixed lines
+            target.value = "\n".join(fixed_comment_lines)
+
+            if config.comments_min_spaces_from_content > 1:
+                # It's hard to reconstruct exactly where the content is, but since we
+                # have the line numbers what we do is lookup the literal source line
+                # here and check where the whitespace is compared to where we know the
+                # comment starts.
+                source_line = source_lines[target.start_mark.line]
+                content_part = source_line[0 : target.start_mark.column]  # noqa: E203
+                # Find the non-whitespace position in the content part
+                rx_match = re.match(r"^.*\S", content_part)
+                if (
+                    rx_match is not None
+                ):  # If no match then nothing to do - no content to be away from
+                    _, content_end = rx_match.span()
+                    # If there's less than min-spaces from content, we're going to add
+                    # some.
+                    if (
+                        target.start_mark.column - content_end
+                        < config.comments_min_spaces_from_content
+                    ):
+                        # Handled
+                        target.start_mark.column = (
+                            content_end + config.comments_min_spaces_from_content
+                        )
+            # Some ruyaml objects will return attached comments at multiple levels (but
+            # not all). Keep track of which comments we've already processed to avoid
+            # double processing them (important because we use raw source lines to
+            # determine content position above).
+            handled_comments.append(target)
+
+        def _comment_fixer(target: Any) -> None:  # noqa:W0613,ANN401
+            """This function is a callback for walk_object.
+
+            walk_object calls it for every object it finds, and then will walk the
+            mapping/sequence subvalues and call this function on those too. This gives
+            us direct access to all round tripped comments.
+            """
+            if not hasattr(target, "ca"):
+                # Scalar or other object with no comment parameter.
+                return
+            # Find all comment tokens and fix them
+            walk_object(target.ca.comment, _comment_token_cb)
+            walk_object(target.ca.end, _comment_token_cb)
+            walk_object(target.ca.items, _comment_token_cb)
+            walk_object(target.ca.pre, _comment_token_cb)
+
+        # Walk the object and invoke the comment fixer
+        walk_object(yaml_documents, _comment_fixer)
+
+        # Dump out the YAML documents
+        stream = io.StringIO()
+        yaml.dump_all(yaml_documents, stream=stream)
+
+        fixed_source_code = stream.getvalue()
+        # Reinvoke the previous fixers to ensure we fix the new output we just created.
+        fixed_source_code = self._rerun_previous_fixers(fixed_source_code)
+        return fixed_source_code
 
     def _fix_whitelines(self, source_code: str) -> str:
         """Fixes number of consecutive whitelines.
diff --git a/src/yamlfix/util.py b/src/yamlfix/util.py
@@ -0,0 +1,18 @@
+"""Define utility helpers."""
+from typing import Any, Callable, Iterable, Mapping
+
+
+def walk_object(
+    target: Any, callback_fn: Callable[[Any], None]  # noqa: ANN401
+) -> None:
+    """Walk a YAML/JSON-like object and call a function on all values."""
+    callback_fn(target)  # Call the callback and whatever we received.
+
+    if isinstance(target, Mapping):
+        # Map type
+        for _, value in target.items():
+            walk_object(value, callback_fn)
+    elif isinstance(target, Iterable) and not isinstance(target, (bytes, str)):
+        # List type
+        for value in target:
+            walk_object(value, callback_fn)
diff --git a/tests/unit/test_adapter_yaml.py b/tests/unit/test_adapter_yaml.py
@@ -939,3 +939,51 @@ def test_enforcing_flow_style_together_with_adjustable_newlines(self) -> None:
         result = fix_code(source, config)
 
         assert result == fixed_source
+
+    def test_block_scalar_whitespace_is_preserved(self) -> None:
+        """Check that multi-line blocks with #'s in them are not treated as comments.
+        Regression test for https://github.com/lyz-code/yamlfix/issues/231
+        """
+        source = dedent(
+            """\
+            ---
+            addn_doc_key: |-
+              #######################################
+              #      This would also be broken      #
+              #######################################
+            ---
+            #Comment above the key
+            key: |-
+              ###########################################
+              #     Value with lots of whitespace       #
+              #         Some     More    Whitespace     #
+              ###########################################
+            #Comment below
+            
+            #Comment with some whitespace below
+            """  # noqa: W293
+        )
+        fixed_source = dedent(
+            """\
+            ---
+            addn_doc_key: |-
+              #######################################
+              #      This would also be broken      #
+              #######################################
+            ---
+            # Comment above the key
+            key: |-
+              ###########################################
+              #     Value with lots of whitespace       #
+              #         Some     More    Whitespace     #
+              ###########################################
+            # Comment below
+            
+            # Comment with some whitespace below
+            """  # noqa: W293
+        )
+        config = YamlfixConfig()
+
+        result = fix_code(source, config)
+
+        assert result == fixed_source
diff --git a/tests/unit/test_services.py b/tests/unit/test_services.py
@@ -481,6 +481,14 @@ def test_fix_code_functions_emit_debug_logs(
             "Restoring jinja2 variables...",
             "Restoring double exclamations...",
             "Fixing comments...",
+            # Fixing comments causes a re-run of fixers, so we get duplicates from here
+            "Fixing truthy strings...",
+            "Fixing jinja2 variables...",
+            "Running ruamel yaml fixer...",
+            "Restoring truthy strings...",
+            "Restoring jinja2 variables...",
+            "Restoring double exclamations...",
+            # End fixing comments duplicates
             "Fixing top level lists...",
             "Fixing flow-style lists...",
         }