diff --git a/docs/changes.rst b/docs/changes.rst index 4298f68a5306..27b730b06fbc 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -32,6 +32,7 @@ Weblate 5.17 * Improved API access control for pending tasks. * Faster category and project removals. * Project backup restore no longer trusts repository-local VCS configuration and hooks from the uploaded archive. +* :ref:`check-punctuation-spacing` check no longer triggers false positives for placeholders. * :doc:`/admin/machine` now falls back to the default API URL when base URL is empty. * :ref:`mt-deepl` maps plain Portuguese to European Portuguese. * Push branches are no longer updated with upstream-only commits in multi-branch workflows. diff --git a/weblate/checks/chars.py b/weblate/checks/chars.py index 1a2c02f2d2bd..ba8d581c1306 100644 --- a/weblate/checks/chars.py +++ b/weblate/checks/chars.py @@ -14,7 +14,7 @@ from weblate.checks.base import CountingCheck, TargetCheck, TargetCheckParametrized from weblate.checks.markup import strip_entities from weblate.checks.parser import single_value_flag -from weblate.checks.same import strip_format +from weblate.checks.utils import highlight_string if TYPE_CHECKING: from collections.abc import Iterable @@ -533,15 +533,31 @@ def should_skip(self, unit: Unit) -> bool: return super().should_skip(unit) def check_single(self, source: str, target: str, unit: Unit) -> bool: - # Remove possible markup - target = strip_format(target, unit.all_flags) - # Remove XML/HTML entities to simplify parsing + # Remove XML/HTML entities first (indices must match the string we iterate over) target = strip_entities(target) - + # Skip punctuation inside placeables (e.g XLIFF equiv-text, RST) + highlighted_ranges = [ + (start, end) for start, end, _ in highlight_string(target, unit) + ] + highlighted_ranges.sort() whitespace = {" ", "\u00a0", "\u202f", "\u2009"} - total = len(target) + range_index = 0 + current_start, current_end = ( + highlighted_ranges[0] if highlighted_ranges else (None, None) + ) for i, char in enumerate(target): + # Advance to the next highlighted range if we've passed the current one. + while current_start is not None and i >= current_end: + range_index += 1 + if range_index < len(highlighted_ranges): + current_start, current_end = highlighted_ranges[range_index] + else: + current_start = current_end = None + break + # Skip characters that fall inside a highlighted range. + if current_start is not None and current_start <= i < current_end: + continue if char in FRENCH_PUNCTUATION: if i == 0: # Trigger if punctionation at beginning of the string @@ -559,6 +575,10 @@ def check_single(self, source: str, target: str, unit: Unit) -> bool: return False def get_fixup(self, unit: Unit) -> Iterable[FixupType] | None: + # If there are placeables in target, skip Fix button and rely on save-time + # autofix which has position-aware checks. + if highlight_string(unit.target, unit): + return None return [ # First fix possibly wrong whitespace ( diff --git a/weblate/checks/duplicate.py b/weblate/checks/duplicate.py index d4310a01d405..c05ca1510fff 100644 --- a/weblate/checks/duplicate.py +++ b/weblate/checks/duplicate.py @@ -11,7 +11,7 @@ from django.utils.translation import gettext_lazy, ngettext from weblate.checks.base import TargetCheck -from weblate.checks.same import replace_format_placeholder, strip_format +from weblate.checks.utils import placeholder_replacement, replace_highlighted from weblate.utils.html import format_html_join_comma from weblate.utils.unicodechars import NON_WORD_CHARS @@ -74,15 +74,11 @@ def check_single(self, source: str, target: str, unit: Unit): lang_code = unit.translation.language.base_code source_groups, source_words = self.extract_groups( - strip_format( - source, unit.all_flags, replacement=replace_format_placeholder - ), + replace_highlighted(source, unit, placeholder_replacement), source_code, ) target_groups, target_words = self.extract_groups( - strip_format( - target, unit.all_flags, replacement=replace_format_placeholder - ), + replace_highlighted(target, unit, placeholder_replacement), lang_code, ) diff --git a/weblate/checks/same.py b/weblate/checks/same.py index c74443d3f4c6..1d8c71ebad8e 100644 --- a/weblate/checks/same.py +++ b/weblate/checks/same.py @@ -13,15 +13,9 @@ from weblate.checks.base import TargetCheck from weblate.checks.data import IGNORE_WORDS -from weblate.checks.format import FLAG_RULES, PERCENT_MATCH -from weblate.checks.markup import BBCODE_MATCH -from weblate.checks.qt import QT_FORMAT_MATCH, QT_PLURAL_MATCH -from weblate.checks.ruby import RUBY_FORMAT_MATCH +from weblate.checks.utils import replace_highlighted if TYPE_CHECKING: - from collections.abc import Callable - - from weblate.checks.flags import Flags from weblate.trans.models import Unit # Email address to ignore @@ -49,8 +43,6 @@ TEMPLATE_RE = re.compile(r"{[a-z_-]+}|@[A-Z_]@", re.IGNORECASE) -RST_MATCH = re.compile(r"(:[a-z:]+:`[^`]+`|``[^`]+``)") - SPLIT_RE = re.compile( r"(?:\&(?:nbsp|rsaquo|lt|gt|amp|ldquo|rdquo|times|quot);|" r'[() ,.^`"\'\\/_<>!?;:|{}*@%#&~=+\r\n✓—‑…\[\]0-9-])+', @@ -63,39 +55,6 @@ DB_TAGS = ("screen", "indexterm", "programlisting") -def replace_format_placeholder(match: re.Match) -> str: - return f"x-weblate-{match.start(0)}" - - -def strip_format( - msg: str, flags: Flags, replacement: str | Callable[[re.Match], str] = "" -) -> str: - """ - Remove format strings from the strings. - - These are quite often not changed by translators. - """ - for format_flag, (regex, _is_position_based, _extract_string) in FLAG_RULES.items(): - if format_flag in flags: - return regex.sub("", msg) - - if "qt-format" in flags: - regex = QT_FORMAT_MATCH - elif "qt-plural-format" in flags: - regex = QT_PLURAL_MATCH - elif "ruby-format" in flags: - regex = RUBY_FORMAT_MATCH - elif "rst-text" in flags: - regex = RST_MATCH - elif "percent-placeholders" in flags: - regex = PERCENT_MATCH - elif "bbcode-text" in flags: - regex = BBCODE_MATCH - else: - return msg - return regex.sub(replacement, msg) - - def strip_string(msg: str) -> str: """Strip (usually) untranslated parts from the string.""" # Strip HTML markup @@ -133,17 +92,6 @@ def test_word(word, extra_ignore): ) -def strip_placeholders(msg: str, unit: Unit) -> str: - return re.sub( - "|".join( - re.escape(param) if isinstance(param, str) else param.pattern - for param in unit.all_flags.get_value("placeholders") - ), - "", - msg, - ) - - class SameCheck(TargetCheck): """Check for untranslated entries.""" @@ -153,7 +101,6 @@ class SameCheck(TargetCheck): def should_ignore(self, source: str, unit: Unit) -> bool: """Check whether given unit should be ignored.""" - from weblate.checks.flags import TYPED_FLAGS from weblate.glossary.models import get_glossary_terms # Ignore some strings based on notes (typically from gettext PO file) @@ -167,12 +114,8 @@ def should_ignore(self, source: str, unit: Unit) -> bool: stripped = source flags = unit.all_flags - # Strip format strings - stripped = strip_format(stripped, flags) - - # Strip placeholder strings - if "placeholders" in TYPED_FLAGS and "placeholders" in flags: - stripped = strip_placeholders(stripped, unit) + # Strip all highlighted placeables and format spans. + stripped = replace_highlighted(stripped, unit) if "strict-same" in flags: return not stripped diff --git a/weblate/checks/tests/test_chars_checks.py b/weblate/checks/tests/test_chars_checks.py index 5d555aa8fd55..6227a8ed3438 100644 --- a/weblate/checks/tests/test_chars_checks.py +++ b/weblate/checks/tests/test_chars_checks.py @@ -521,6 +521,30 @@ def test_restructured_text(self) -> None: "fr", ) + def test_angular_fr_placeholders(self) -> None: + # XLIFF placeholder regex so highlight_string skips equiv-text content + xliff_placeholder = r'placeholders:r"]*/>"' + # Check should not fire when punctuation is inside placeholder equiv-text + self.do_test( + False, + ( + 'Orangutan has banana.\n', + 'Orangutan a banane.\n', + xliff_placeholder, + ), + "fr", + ) + # Check should fire when punctuation is outside placeholder + self.do_test( + True, + ( + 'Orangutan has: banana.\n', + 'Orangutan a: banane.\n', + xliff_placeholder, + ), + "fr", + ) + def test_cdata(self) -> None: self.do_test( False, diff --git a/weblate/checks/tests/test_duplicate_checks.py b/weblate/checks/tests/test_duplicate_checks.py index b9a9e6d0e5cd..c854f44315ae 100644 --- a/weblate/checks/tests/test_duplicate_checks.py +++ b/weblate/checks/tests/test_duplicate_checks.py @@ -163,3 +163,23 @@ def test_rst_markup(self) -> None: ), {}, ) + + def test_xliff_placeholders(self) -> None: + xliff_flag = r'placeholders:r"]*/>"' + unit = MockUnit(code="fr", flags=xliff_flag) + # no warning triggered if duplicated words are inside placeholders + self.assertFalse( + self.check.check_single( + "", + 'Limite par jour', # codespell:ignore + unit, + ) + ) + # warning triggered if duplicated words are outside placeholders + self.assertTrue( + self.check.check_single( + "", + 'limite limite par jour', # codespell:ignore + unit, + ) + ) diff --git a/weblate/checks/tests/test_same_checks.py b/weblate/checks/tests/test_same_checks.py index 18365a4b3f17..fee97a12c524 100644 --- a/weblate/checks/tests/test_same_checks.py +++ b/weblate/checks/tests/test_same_checks.py @@ -244,6 +244,14 @@ def test_same_placeholders(self) -> None: r'placeholders:r"%\w+%",strict-same', ), ) + self.do_test( + False, + ( + '', + '', + r'placeholders:r"]*/>"', + ), + ) def test_same_project(self) -> None: self.do_test(False, ("MockProject", "MockProject", "")) diff --git a/weblate/checks/tests/test_utils.py b/weblate/checks/tests/test_utils.py index 6f65cda007db..076bcd5fe239 100644 --- a/weblate/checks/tests/test_utils.py +++ b/weblate/checks/tests/test_utils.py @@ -5,7 +5,7 @@ from django.test import SimpleTestCase from weblate.checks.tests.test_checks import MockUnit -from weblate.checks.utils import highlight_string +from weblate.checks.utils import highlight_string, replace_highlighted class HighlightTestCase(SimpleTestCase): @@ -75,3 +75,21 @@ def test_escaped_markup(self) -> None: (44, 59, "</strong>"), ], ) + + def test_replace_highlighted(self) -> None: + unit = MockUnit( + source="simple {format} %d string", + flags="python-brace-format, python-format", + ) + self.assertEqual( + replace_highlighted(unit.source, unit), + "simple string", + ) + self.assertEqual( + replace_highlighted( + unit.source, + unit, + lambda start: f"x-weblate-{start}", + ), + "simple x-weblate-7 x-weblate-16 string", + ) diff --git a/weblate/checks/utils.py b/weblate/checks/utils.py index 7e958187a8eb..63e752e6b2af 100644 --- a/weblate/checks/utils.py +++ b/weblate/checks/utils.py @@ -9,7 +9,7 @@ from weblate.checks.models import CHECKS if TYPE_CHECKING: - from collections.abc import Generator + from collections.abc import Callable, Generator from weblate.trans.models import Unit @@ -88,3 +88,32 @@ def highlight_string( hl_idx_next += 1 return highlights + + +def replace_highlighted( + source: str, + unit: Unit, + replacement: str | Callable[[int], str] = "", +) -> str: + """Replace highlighted ranges in source string.""" + highlights = highlight_string(source, unit) + if not highlights: + return source + + result = [] + last_end = 0 + for start, end, _text in highlights: + if start < last_end: + continue + result.append(source[last_end:start]) + if callable(replacement): + result.append(replacement(start)) + else: + result.append(replacement) + last_end = end + result.append(source[last_end:]) + return "".join(result) + + +def placeholder_replacement(start_index: int) -> str: + return f"x-weblate-{start_index}" diff --git a/weblate/trans/autofixes/chars.py b/weblate/trans/autofixes/chars.py index 2677667f6513..d565532be3b9 100644 --- a/weblate/trans/autofixes/chars.py +++ b/weblate/trans/autofixes/chars.py @@ -16,7 +16,7 @@ PunctuationSpacingCheck, ZeroWidthSpaceCheck, ) -from weblate.checks.same import RST_MATCH +from weblate.checks.utils import highlight_string from weblate.formats.helpers import CONTROLCHARS_TRANS from weblate.trans.autofixes.base import AutoFix @@ -107,26 +107,38 @@ def get_related_checks(): def fix_single_target( self, target: str, source: str, unit: Unit ) -> tuple[str, bool]: - def spacing_replace(matchobj: re.Match) -> str: - if "rst-text" in unit.all_flags: - offset = matchobj.start(2) - rst_position = RST_MATCH.search(target, offset) - if rst_position is not None and rst_position.start(0) == offset: - # Skip escaping inside rst tag - return matchobj.group(0) - return f"\u00a0{matchobj.group(2)}" - if ( unit.translation.language.is_base({"fr"}) and unit.translation.language.code != "fr_CA" and "ignore-punctuation-spacing" not in unit.all_flags ): + highlights = highlight_string(target, unit) + highlight_ranges = [(start, end) for start, end, _ in highlights] + + def is_highlighted(pos: int) -> bool: + for start, end in highlight_ranges: + if start > pos: + break + if start <= pos < end: + return True + return False + + def make_replacer(replacement_char: str): + def replacer(matchobj: re.Match) -> str: + if is_highlighted(matchobj.start(2)): + return matchobj.group(0) + return f"{replacement_char}{matchobj.group(2)}" + + return replacer + # Fix existing new_target = re.sub( - FRENCH_PUNCTUATION_FIXUP_RE_NBSP, spacing_replace, target + FRENCH_PUNCTUATION_FIXUP_RE_NBSP, make_replacer("\u00a0"), target ) new_target = re.sub( - FRENCH_PUNCTUATION_FIXUP_RE_NNBSP, "\u202f\\2", new_target + FRENCH_PUNCTUATION_FIXUP_RE_NNBSP, + make_replacer("\u202f"), + new_target, ) # Do not add missing as that is likely to trigger issues with other content # such as URLs or Markdown syntax. diff --git a/weblate/trans/tests/test_autofix.py b/weblate/trans/tests/test_autofix.py index 86b5b7555f89..f545eb57832a 100644 --- a/weblate/trans/tests/test_autofix.py +++ b/weblate/trans/tests/test_autofix.py @@ -204,3 +204,46 @@ def test_punctuation_spacing_rst(self) -> None: self.assertEqual( fix.fix_target(["This :"], fr_rst_unit), (["This\u00a0:"], True) ) + + def test_punctuation_spacing_xliff(self) -> None: + fix = PunctuationSpacing() + xliff_flag = r'placeholders:r"]*/>"' + # target with ' :' inside equiv-text, should not be modified + fr_xliff_unit = MockUnit( + source='Quota par jour', + code="fr", + flags=xliff_flag, + ) + self.assertEqual( + fix.fix_target( + [ + 'Quota par jour' + ], + fr_xliff_unit, + ), + ( + [ + 'Quota par jour' + ], + False, + ), + ) + + # ' :' outside a placeholder must still be fixed. + fr_xliff_unit2 = MockUnit( + source='Quota: items', + code="fr", + flags=xliff_flag, + ) + self.assertEqual( + fix.fix_target( + ['Quota : éléments'], + fr_xliff_unit2, + ), + ( + [ + 'Quota\u00a0: éléments' + ], + True, + ), + )