Test pygettext --escape option

tomasr8 · tomasr8 · commit c9f657c744c1 · 2025-03-30T14:09:27.000+02:00
diff --git a/Lib/test/test_tools/i18n_data/escapes.pot b/Lib/test/test_tools/i18n_data/escapes.pot
@@ -0,0 +1,33 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: escapes.py:4
+msgid "ascii"
+msgstr ""
+
+#: escapes.py:6
+msgid "\304\233 \305\241 \304\215 \305\231"
+msgstr ""
+
+#: escapes.py:8
+msgid "\316\261 \316\262 \316\263 \316\264"
+msgstr ""
+
+#: escapes.py:10
+msgid "\343\204\261 \343\204\262 \343\204\264 \343\204\267"
+msgstr ""
+
diff --git a/Lib/test/test_tools/i18n_data/escapes.py b/Lib/test/test_tools/i18n_data/escapes.py
@@ -0,0 +1,10 @@
+import gettext as _
+
+
+_('ascii')
+
+_('ě š č ř')
+
+_('α β γ δ')
+
+_('ㄱ ㄲ ㄴ ㄷ')
diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot
@@ -97,3 +97,7 @@ msgid_plural "worlds"
 msgstr[0] ""
 msgstr[1] ""
 
+#: messages.py:122
+msgid "α β γ δ"
+msgstr ""
+
diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py
@@ -117,3 +117,6 @@ def _(x="don't extract me"):
 # f-strings
 f"Hello, {_('world')}!"
 f"Hello, {ngettext('world', 'worlds', 3)}!"
+
+# non-ascii
+_("α β γ δ")
diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py
@@ -18,7 +18,8 @@
 
 
 with imports_under_tool("i18n"):
-    from pygettext import parse_spec
+    import pygettext
+    from pygettext import make_escapes, parse_spec
 
 
 def normalize_POT_file(pot):
@@ -517,6 +518,110 @@ def test_parse_keyword_spec(self):
                 self.assertEqual(str(cm.exception), message)
 
 
+class TestCharacterEscapes(unittest.TestCase):
+    # Pygettext always escapes the following characters:
+    special_chars = {
+        '\\': r'\\',
+        '\t': r'\t',
+        '\r': r'\r',
+        '\n': r'\n',
+        '\"': r'\"',
+    }
+
+    @classmethod
+    def tearDownClass(cls):
+        # Reset the global 'escapes' dict to the default
+        make_escapes(pass_nonascii=True)
+
+    def test_special_chars(self):
+        # special_chars are always escaped regardless of the
+        # --escape option
+        for pass_nonascii in (True, False):
+            make_escapes(pass_nonascii=pass_nonascii)
+            with self.subTest(pass_nonascii=pass_nonascii):
+                for char in self.special_chars:
+                    self.assertEqual(pygettext.escape(char, encoding='utf-8'),
+                                     self.special_chars[char])
+
+    def _char_to_octal_escape(self, char):
+        """Convert a character to its octal escape representation."""
+        return r"\%03o" % ord(char)
+
+    def _octal_escape_to_string(self, escaped):
+        """Convert an octal escape representation to string."""
+        octal_escapes = re.findall(r'\\([0-7]{3})', escaped)
+        bytestr = bytes([int(n, 8) for n in octal_escapes])
+        return bytestr.decode('utf-8')
+
+    def test_not_escaped(self):
+        """
+        Test escaping when the --escape is not used.
+
+        When --escape is not used, only some characters within the ASCII
+        range are escaped. Characters >= 128 are not escaped.
+        """
+        # This is the same as invoking pygettext without
+        # the --escape option (the default behavior).
+        make_escapes(pass_nonascii=True)
+        # The encoding option is not used when --escape is not passed
+        encoding = 'foo'
+
+        # First 32 characters use octal escapes (except for special chars)
+        for i in range(32):
+            char = chr(i)
+            if char in self.special_chars:
+                continue
+            self.assertEqual(pygettext.escape(char, encoding=encoding),
+                             self._char_to_octal_escape(char))
+
+        # Characters 32-126 are not escaped (except for special chars)
+        for i in range(32, 127):
+            char = chr(i)
+            if char in self.special_chars:
+                continue
+            self.assertEqual(pygettext.escape(char, encoding=encoding), char)
+
+        # chr(127) uses octal escape
+        self.assertEqual(pygettext.escape(chr(127), encoding=encoding),
+                         '\\177')
+
+        # All characters >= 128 are not escaped
+        for i in range(128, 256):
+            char = chr(i)
+            self.assertEqual(pygettext.escape(char, encoding=encoding), char)
+
+    def test_escaped(self):
+        """
+        Test escaping when --escape is used.
+
+        When --escape is used, non-ASCII characters (>= 128) are escaped too.
+        """
+        make_escapes(pass_nonascii=False)
+        encoding = 'utf-8'
+
+        # First 32 characters use octal escapes (except for special chars)
+        for i in range(32):
+            char = chr(i)
+            if char in self.special_chars:
+                continue
+            self.assertEqual(pygettext.escape(char, encoding=encoding),
+                             self._char_to_octal_escape(char))
+
+        # Characters 32-126 are not escaped (except for special chars)
+        for i in range(32, 127):
+            char = chr(i)
+            if char in self.special_chars:
+                continue
+            self.assertEqual(pygettext.escape(char, encoding=encoding), char)
+
+        # Characters >= 127 are escaped
+        for i in range(127, 256):
+            char = chr(i)
+            escaped = pygettext.escape(char, encoding=encoding)
+            decoded_char = self._octal_escape_to_string(escaped)
+            self.assertEqual(char, decoded_char)
+
+
 def extract_from_snapshots():
     snapshots = {
         'messages.py': (),
@@ -526,6 +631,8 @@ def extract_from_snapshots():
         'custom_keywords.py': ('--keyword=foo', '--keyword=nfoo:1,2',
                                '--keyword=pfoo:1c,2',
                                '--keyword=npfoo:1c,2,3', '--keyword=_:1,2'),
+        # Test escaping non-ASCII characters
+        'escapes.py': ('--escape',),
     }
 
     for filename, args in snapshots.items():
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
@@ -188,7 +188,7 @@ def make_escapes(pass_nonascii):
     global escapes, escape
     if pass_nonascii:
         # Allow non-ascii characters to pass through so that e.g. 'msgid
-        # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
+        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
         # escape any character outside the 32..126 range.
         mod = 128
         escape = escape_ascii