Do not extract messages from function and class definitions.

tomasr8 · tomasr8 · commit 899413076184 · 2024-11-13T22:27:06.000+01:00
Fixes a bug where pygettext would attempt
to extract a message from code like this:

def _(x): pass

This is because pygettext only looks at one
token at a time and '_(x)' looks like a
function call.

However, since 'x' is not a string literal,
pygettext would erroneously issue a warning.

This commit fixes that by keeping track
of the previous token and checking if it's
'def' or 'class'.
diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py
@@ -87,17 +87,26 @@ def assert_POT_equal(self, expected, actual):
         self.maxDiff = None
         self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual))
 
-    def extract_docstrings_from_str(self, module_content):
-        """ utility: return all msgids extracted from module_content """
-        filename = 'test_docstrings.py'
-        with temp_cwd(None) as cwd:
+    def extract_from_str(self, module_content, *, args=(), strict=True):
+        filename = 'test.py'
+        with temp_cwd(None):
             with open(filename, 'w', encoding='utf-8') as fp:
                 fp.write(module_content)
-            assert_python_ok('-Xutf8', self.script, '-D', filename)
+            res = assert_python_ok('-Xutf8', self.script, *args, filename)
+            if strict:
+                self.assertEqual(res.err, b'')
             with open('messages.pot', encoding='utf-8') as fp:
                 data = fp.read()
         return self.get_msgids(data)
 
+    def extract_docstrings_from_str(self, module_content):
+        """Return all docstrings extracted from module_content."""
+        return self.extract_from_str(module_content, args=('--docstrings',), strict=False)
+
+    def extract_messages_from_str(self, module_content):
+        """Return all msgids extracted from module_content."""
+        return self.extract_from_str(module_content)
+
     def test_header(self):
         """Make sure the required fields are in the header, according to:
            http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry
@@ -344,6 +353,23 @@ def test_calls_in_fstring_with_partially_wrong_expression(self):
         self.assertNotIn('foo', msgids)
         self.assertIn('bar', msgids)
 
+    def test_function_and_class_names(self):
+        """Test that function and class names are not mistakenly extracted."""
+        msgids = self.extract_messages_from_str(dedent('''\
+        def _(x):
+            pass
+
+        def _(x="foo"):
+            pass
+
+        async def _(x):
+            pass
+
+        class _(object):
+            pass
+        '''))
+        self.assertEqual(msgids, [''])
+
     def test_pygettext_output(self):
         """Test that the pygettext output exactly matches snapshots."""
         for input_file in DATA_DIR.glob('*.py'):
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
@@ -5,7 +5,7 @@
 # Minimally patched to make it even more xgettext compatible
 # by Peter Funk <pf@artcom-gmbh.de>
 #
-# 2002-11-22 J�rgen Hermann <jh@web.de>
+# 2002-11-22 Jürgen Hermann <jh@web.de>
 # Added checks that _() only contains string literals, and
 # command line args are resolved to module lists, i.e. you
 # can now pass a filename, a module or package name, or a
@@ -207,7 +207,7 @@ def make_escapes(pass_nonascii):
     global escapes, escape
     if pass_nonascii:
         # Allow non-ascii characters to pass through so that e.g. 'msgid
-        # "H�he"' would result not result in 'msgid "H\366he"'.  Otherwise we
+        # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
         # escape any character outside the 32..126 range.
         mod = 128
         escape = escape_ascii
@@ -306,6 +306,11 @@ def getFilesForName(name):
     return []
 
 
+def _is_def_or_class_keyword(token):
+    ttype, tstring, *_ = token
+    return ttype == tokenize.NAME and tstring in ('def', 'class')
+
+
 class TokenEater:
     def __init__(self, options):
         self.__options = options
@@ -316,13 +321,11 @@ def __init__(self, options):
         self.__freshmodule = 1
         self.__curfile = None
         self.__enclosurecount = 0
+        self.__prev_token = None
 
     def __call__(self, ttype, tstring, stup, etup, line):
-        # dispatch
-##        import token
-##        print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
-##              file=sys.stderr)
         self.__state(ttype, tstring, stup[0])
+        self.__prev_token = (ttype, tstring, stup, etup, line)
 
     def __waiting(self, ttype, tstring, lineno):
         opts = self.__options
@@ -341,7 +344,10 @@ def __waiting(self, ttype, tstring, lineno):
             if ttype == tokenize.NAME and tstring in ('class', 'def'):
                 self.__state = self.__suiteseen
                 return
-        if ttype == tokenize.NAME and tstring in opts.keywords:
+        if (
+            ttype == tokenize.NAME and tstring in opts.keywords
+            and (not self.__prev_token or not _is_def_or_class_keyword(self.__prev_token))
+        ):
             self.__state = self.__keywordseen
             return
         if ttype == tokenize.STRING: