Skip to content

Commit 0b4c13b

Browse files
committed
Fix block scalar mangling bug #231
The regex based parsing for fixing comments was breaking block scalars. By using the ruyaml round trip handler, instead the comment formatting now can correctly identify block-scalars and avoid mangling them.
1 parent 51410d6 commit 0b4c13b

File tree

4 files changed

+208
-29
lines changed

4 files changed

+208
-29
lines changed

src/yamlfix/adapters.py

Lines changed: 134 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""Define adapter / helper classes to hide unrelated functionality in."""
2-
2+
import io
33
import logging
44
import re
55
from functools import partial
@@ -12,6 +12,7 @@
1212
from ruyaml.tokens import CommentToken
1313

1414
from yamlfix.model import YamlfixConfig, YamlNodeStyle
15+
from yamlfix.util import walk_object
1516

1617
log = logging.getLogger(__name__)
1718

@@ -334,18 +335,12 @@ def __init__(self, yaml: Yaml, config: Optional[YamlfixConfig]) -> None:
334335
self.yaml = yaml.yaml
335336
self.config = config or YamlfixConfig()
336337

337-
def fix(self, source_code: str) -> str:
338-
"""Run all yaml source code fixers.
339-
340-
Args:
341-
source_code: Source code to be corrected.
342-
343-
Returns:
344-
Corrected source code.
345-
"""
346-
log.debug("Running source code fixers...")
347-
348-
fixers = [
338+
# The list of fixers to run, and the index of the currently called fixer.
339+
# This allows fixers which might need to reinvoke other fixers to reinvoke the
340+
# previous fixers (currently fix comments does this as it needs to re-emit the
341+
# YAML).
342+
self._fixer_idx = -1
343+
self._fixers = [
349344
self._fix_truthy_strings,
350345
self._fix_jinja_variables,
351346
self._ruamel_yaml_fixer,
@@ -359,8 +354,40 @@ def fix(self, source_code: str) -> str:
359354
self._add_newline_at_end_of_file,
360355
]
361356

362-
for fixer in fixers:
357+
def _rerun_previous_fixers(self, source_code: str) -> str:
358+
"""Run all the previous source code fixers except the currently running one.
359+
360+
This is re-entrant safe and will correctly restore the fixer idx once it's
361+
complete.
362+
363+
Args:
364+
source_code: Source code to be corrected.
365+
366+
Returns:
367+
Corrected source code.
368+
"""
369+
cur_fixer_idx = self._fixer_idx
370+
for fixer_idx, fixer in enumerate(self._fixers[:cur_fixer_idx]):
371+
self._fixer_idx = fixer_idx
363372
source_code = fixer(source_code)
373+
# Restore fixer idx
374+
self._fixer_idx = cur_fixer_idx
375+
return source_code
376+
377+
def fix(self, source_code: str) -> str:
378+
"""Run all yaml source code fixers.
379+
380+
Args:
381+
source_code: Source code to be corrected.
382+
383+
Returns:
384+
Corrected source code.
385+
"""
386+
log.debug("Running source code fixers...")
387+
388+
# Just use the re-run system do it.
389+
self._fixer_idx = len(self._fixers)
390+
source_code = self._rerun_previous_fixers(source_code)
364391

365392
return source_code
366393

@@ -573,24 +600,102 @@ def _restore_truthy_strings(source_code: str) -> str:
573600
def _fix_comments(self, source_code: str) -> str:
574601
log.debug("Fixing comments...")
575602
config = self.config
576-
comment_start = " " * config.comments_min_spaces_from_content + "#"
577603

578-
fixed_source_lines = []
604+
# We need the source lines for the comment fixers to analyze whitespace easily
605+
source_lines = source_code.splitlines()
579606

580-
for line in source_code.splitlines():
581-
# Comment at the start of the line
582-
if config.comments_require_starting_space and re.search(r"(^|\s)#\w", line):
583-
line = line.replace("#", "# ")
584-
# Comment in the middle of the line, but it's not part of a string
585-
if (
586-
config.comments_min_spaces_from_content > 1
587-
and " #" in line
588-
and line[-1] not in ["'", '"']
589-
):
590-
line = re.sub(r"(.+\S)(\s+?)#", rf"\1{comment_start}", line)
591-
fixed_source_lines.append(line)
607+
yaml = YAML(typ="rt")
608+
# Hijack config options from the regular fixer
609+
yaml.explicit_start = self.yaml.explicit_start
610+
yaml.width = self.yaml.width
611+
# preserve_quotes however must always be true, otherwise we change output
612+
# unexpectedly.
613+
yaml.preserve_quotes = True # type: ignore
614+
yaml_documents = list(yaml.load_all(source_code))
592615

593-
return "\n".join(fixed_source_lines)
616+
handled_comments: List[CommentToken] = []
617+
618+
def _comment_token_cb(target: Any) -> None: # noqa:W0613,ANN401
619+
"""This function is a callback for walk_object.
620+
621+
This implements the actual comment fixing, and is used because comment
622+
objects are structured in nested-lists and can repeat.
623+
"""
624+
if not isinstance(target, CommentToken):
625+
return
626+
if any(target is e for e in handled_comments):
627+
# This comment was handled at a higher level already.
628+
return
629+
if target.value is None:
630+
return
631+
comment_lines = target.value.split("\n")
632+
fixed_comment_lines = []
633+
for line in comment_lines:
634+
if config.comments_require_starting_space and re.search(
635+
r"(^|\s)#\w", line
636+
):
637+
line = line.replace("#", "# ")
638+
fixed_comment_lines.append(line)
639+
640+
# Update the comment with the fixed lines
641+
target.value = "\n".join(fixed_comment_lines)
642+
643+
if config.comments_min_spaces_from_content > 1:
644+
# It's hard to reconstruct exactly where the content is, but since we
645+
# have the line numbers what we do is lookup the literal source line
646+
# here and check where the whitespace is compared to where we know the
647+
# comment starts.
648+
source_line = source_lines[target.start_mark.line]
649+
content_part = source_line[0 : target.start_mark.column] # noqa: E203
650+
# Find the non-whitespace position in the content part
651+
rx_match = re.match(r"^.*\S", content_part)
652+
if (
653+
rx_match is not None
654+
): # If no match then nothing to do - no content to be away from
655+
_, content_end = rx_match.span()
656+
# If there's less than min-spaces from content, we're going to add
657+
# some.
658+
if (
659+
target.start_mark.column - content_end
660+
< config.comments_min_spaces_from_content
661+
):
662+
# Handled
663+
target.start_mark.column = (
664+
content_end + config.comments_min_spaces_from_content
665+
)
666+
# Some ruyaml objects will return attached comments at multiple levels (but
667+
# not all). Keep track of which comments we've already processed to avoid
668+
# double processing them (important because we use raw source lines to
669+
# determine content position above).
670+
handled_comments.append(target)
671+
672+
def _comment_fixer(target: Any) -> None: # noqa:W0613,ANN401
673+
"""This function is a callback for walk_object.
674+
675+
walk_object calls it for every object it finds, and then will walk the
676+
mapping/sequence subvalues and call this function on those too. This gives
677+
us direct access to all round tripped comments.
678+
"""
679+
if not hasattr(target, "ca"):
680+
# Scalar or other object with no comment parameter.
681+
return
682+
# Find all comment tokens and fix them
683+
walk_object(target.ca.comment, _comment_token_cb)
684+
walk_object(target.ca.end, _comment_token_cb)
685+
walk_object(target.ca.items, _comment_token_cb)
686+
walk_object(target.ca.pre, _comment_token_cb)
687+
688+
# Walk the object and invoke the comment fixer
689+
walk_object(yaml_documents, _comment_fixer)
690+
691+
# Dump out the YAML documents
692+
stream = io.StringIO()
693+
yaml.dump_all(yaml_documents, stream=stream)
694+
695+
fixed_source_code = stream.getvalue()
696+
# Reinvoke the previous fixers to ensure we fix the new output we just created.
697+
fixed_source_code = self._rerun_previous_fixers(fixed_source_code)
698+
return fixed_source_code
594699

595700
def _fix_whitelines(self, source_code: str) -> str:
596701
"""Fixes number of consecutive whitelines.

src/yamlfix/util.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""Define utility helpers."""
2+
from typing import Any, Callable, Iterable, Mapping
3+
4+
5+
def walk_object(
6+
target: Any, callback_fn: Callable[[Any], None] # noqa: ANN401
7+
) -> None:
8+
"""Walk a YAML/JSON-like object and call a function on all values."""
9+
callback_fn(target) # Call the callback and whatever we received.
10+
11+
if isinstance(target, Mapping):
12+
# Map type
13+
for _, value in target.items():
14+
walk_object(value, callback_fn)
15+
elif isinstance(target, Iterable) and not isinstance(target, (bytes, str)):
16+
# List type
17+
for value in target:
18+
walk_object(value, callback_fn)

tests/unit/test_adapter_yaml.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -939,3 +939,51 @@ def test_enforcing_flow_style_together_with_adjustable_newlines(self) -> None:
939939
result = fix_code(source, config)
940940

941941
assert result == fixed_source
942+
943+
def test_block_scalar_whitespace_is_preserved(self) -> None:
944+
"""Check that multi-line blocks with #'s in them are not treated as comments.
945+
Regression test for https://github.com/lyz-code/yamlfix/issues/231
946+
"""
947+
source = dedent(
948+
"""\
949+
---
950+
addn_doc_key: |-
951+
#######################################
952+
# This would also be broken #
953+
#######################################
954+
---
955+
#Comment above the key
956+
key: |-
957+
###########################################
958+
# Value with lots of whitespace #
959+
# Some More Whitespace #
960+
###########################################
961+
#Comment below
962+
963+
#Comment with some whitespace below
964+
""" # noqa: W293
965+
)
966+
fixed_source = dedent(
967+
"""\
968+
---
969+
addn_doc_key: |-
970+
#######################################
971+
# This would also be broken #
972+
#######################################
973+
---
974+
# Comment above the key
975+
key: |-
976+
###########################################
977+
# Value with lots of whitespace #
978+
# Some More Whitespace #
979+
###########################################
980+
# Comment below
981+
982+
# Comment with some whitespace below
983+
""" # noqa: W293
984+
)
985+
config = YamlfixConfig()
986+
987+
result = fix_code(source, config)
988+
989+
assert result == fixed_source

tests/unit/test_services.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,14 @@ def test_fix_code_functions_emit_debug_logs(
481481
"Restoring jinja2 variables...",
482482
"Restoring double exclamations...",
483483
"Fixing comments...",
484+
# Fixing comments causes a re-run of fixers, so we get duplicates from here
485+
"Fixing truthy strings...",
486+
"Fixing jinja2 variables...",
487+
"Running ruamel yaml fixer...",
488+
"Restoring truthy strings...",
489+
"Restoring jinja2 variables...",
490+
"Restoring double exclamations...",
491+
# End fixing comments duplicates
484492
"Fixing top level lists...",
485493
"Fixing flow-style lists...",
486494
}

0 commit comments

Comments
 (0)