From 7df8e4173d34ce2fa3672559904a340cb5b99f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 30 Jun 2024 18:21:59 +0200 Subject: [PATCH 1/5] cache failures in `linecache` --- Lib/linecache.py | 105 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 7 deletions(-) diff --git a/Lib/linecache.py b/Lib/linecache.py index 3462f1c451ba29..2c9b3987483010 100644 --- a/Lib/linecache.py +++ b/Lib/linecache.py @@ -10,17 +10,34 @@ # The cache. Maps filenames to either a thunk which will provide source code, # or a tuple (size, mtime, lines, fullname) once loaded. +# +# By construction, the filenames being stored are truthy. cache = {} +# The filenames for which we failed to get a result and the reason. +# +# The value being stored is a tuple (size, mtime_ns, fullname), +# possibly size=None and st_mtime_ns=None if they are unavailable. +# +# By convention, falsey filenames are not cached and treated as failures. +failures = {} + + def clearcache(): """Clear the cache entirely.""" cache.clear() + failures.clear() def getline(filename, lineno, module_globals=None): """Get a line for a Python source file from the cache. - Update the cache if it doesn't contain an entry for this file already.""" + Update the cache if it doesn't contain an entry for this file already. + Previous failures are cached and must be invalidated via checkcache(). + """ + + if filename in failures: + return '' lines = getlines(filename, module_globals) if 1 <= lineno <= len(lines): @@ -30,13 +47,19 @@ def getline(filename, lineno, module_globals=None): def getlines(filename, module_globals=None): """Get the lines for a Python source file from the cache. - Update the cache if it doesn't contain an entry for this file already.""" + Update the cache if it doesn't contain an entry for this file already. + Previous failures are cached and must be invalidated via checkcache(). + """ if filename in cache: + assert filename not in failures entry = cache[filename] if len(entry) != 1: return cache[filename][2] + if filename in failures: + return [] + try: return updatecache(filename, module_globals) except MemoryError: @@ -45,13 +68,19 @@ def getlines(filename, module_globals=None): def checkcache(filename=None): - """Discard cache entries that are out of date. - (This is not checked upon each call!)""" + """Discard cache entries that are out of date or now available for reading. + (This is not checked upon each call!). + """ if filename is None: filenames = list(cache.keys()) + failed_filenames = list(failures.keys()) elif filename in cache: filenames = [filename] + failed_filenames = [] + elif filename in failures: + filenames = [] + failed_filenames = [filename] else: return @@ -71,16 +100,43 @@ def checkcache(filename=None): try: stat = os.stat(fullname) except OSError: + # a cached entry is now a failure + assert filename not in failed_filenames + failures[filename] = (None, None, fullname) cache.pop(filename, None) continue if size != stat.st_size or mtime != stat.st_mtime: cache.pop(filename, None) + for filename in failed_filenames: + size, mtime_ns, fullname = failures[filename] + try: + # This import can fail if the interpreter is shutting down + import os + except ImportError: + return + try: + stat = os.stat(fullname) + except OSError: + if size is not None and mtime_ns is not None: + # Previous failure was a decoding error, + # this failure is due to os.stat() error. + failures[filename] = (None, None, fullname) + continue # still unreadable + + if size is None or mtime_ns is None: + # we may now be able to read the file + failures.pop(filename, None) + elif size != stat.st_size or mtime_ns != stat.st_mtime_ns: + # the file might have been updated + failures.pop(filename, None) + def updatecache(filename, module_globals=None): """Update a cache entry and return its list of lines. - If something's wrong, print a message, discard the cache entry, - and return an empty list.""" + + If something's wrong, possibly print a message, discard the cache entry, + add the file name to the known failures, and return an empty list.""" # These imports are not at top level because linecache is in the critical # path of the interpreter startup and importing os and sys take a lot of time @@ -113,6 +169,8 @@ def updatecache(filename, module_globals=None): # No luck, the PEP302 loader cannot find the source # for this module. return [] + + failures.pop(filename, None) cache[filename] = ( len(data), None, @@ -124,6 +182,8 @@ def updatecache(filename, module_globals=None): # Try looking through the module search path, which is only useful # when handling a relative filename. if os.path.isabs(filename): + # os.stat() failed, so we won't read it + failures[filename] = (None, None, fullname) return [] for dirname in sys.path: @@ -138,11 +198,37 @@ def updatecache(filename, module_globals=None): except OSError: pass else: + failures[filename] = (None, None, fullname) return [] + else: + if filename in failures: + size, mtime_ns, _ = failures[filename] + if size is None or mtime_ns is None: + # we may now be able to read the file + failures.pop(filename, None) + if size != stat.st_size or mtime_ns != stat.st_mtime_ns: + # the file might have been updated + failures.pop(filename, None) + del size, mtime_ns # to avoid using them + + if filename in failures: + return [] + try: with tokenize.open(fullname) as fp: lines = fp.readlines() - except (OSError, UnicodeDecodeError, SyntaxError): + except OSError: + # The file might have been deleted and thus, we need to + # be sure that the next time checkcache() or updatecache() + # is called, we do not trust the old os.stat() values. + failures[filename] = (None, None, fullname) + return [] + except (UnicodeDecodeError, SyntaxError): + # The file content is incorrect but at least we could + # read it. The next time checkcache() or updatecache() + # is called, we can forget reading the file if nothing + # was modified. + failures[filename] = (stat.st_size, stat.st_mtime_ns, fullname) return [] if not lines: lines = ['\n'] @@ -150,6 +236,7 @@ def updatecache(filename, module_globals=None): lines[-1] += '\n' size, mtime = stat.st_size, stat.st_mtime cache[filename] = size, mtime, lines, fullname + failures.pop(filename, None) return lines @@ -186,6 +273,10 @@ def lazycache(filename, module_globals): def get_lines(name=name, *args, **kwargs): return get_source(name, *args, **kwargs) cache[filename] = (get_lines,) + # It might happen that a file is marked as a failure + # before lazycache() is being called but should not + # be a failure after (but before calling getlines()). + failures.pop(filename, None) return True return False From af5ccfcc5a05952ca033f1f70e11f968ac4e1ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 30 Jun 2024 18:22:02 +0200 Subject: [PATCH 2/5] add tests --- Lib/test/test_linecache.py | 188 +++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/Lib/test/test_linecache.py b/Lib/test/test_linecache.py index 8ac521d72ef13e..dc4708bdcb5af2 100644 --- a/Lib/test/test_linecache.py +++ b/Lib/test/test_linecache.py @@ -2,6 +2,7 @@ import linecache import unittest +import unittest.mock import os.path import tempfile import tokenize @@ -47,6 +48,9 @@ def setUp(self): self.file_name = fp.name fp.write(self.file_byte_string) self.addCleanup(os_helper.unlink, self.file_name) + # ensure that the cache is empty between tests + linecache.cache.clear() + linecache.failures.clear() class GetLineTestsGoodData(TempFile): @@ -75,9 +79,61 @@ class GetLineTestsBadData(TempFile): def test_getline(self): self.assertEqual(linecache.getline(self.file_name, 1), '') + self.assertDictEqual(linecache.cache, {}) + + size, mtime, fullname = linecache.failures[self.file_name] + self.assertIsInstance(size, int) + self.assertIsInstance(mtime, int) def test_getlines(self): self.assertEqual(linecache.getlines(self.file_name), []) + self.assertDictEqual(linecache.cache, {}) + + size, mtime, fullname = linecache.failures[self.file_name] + self.assertIsInstance(size, int) + self.assertIsInstance(mtime, int) + + def test_getline_called_once(self): + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + self.assertEqual(linecache.getline(self.file_name, 1), '') + tok.assert_called_once_with(self.file_name) + self.assertDictEqual(linecache.cache, {}) + + size, mtime, fullname = linecache.failures[self.file_name] + self.assertIsInstance(size, int) + self.assertIsInstance(mtime, int) + + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + self.assertEqual(linecache.getline(self.file_name, 1), '') + tok.assert_not_called() + self.assertEqual(linecache.getline(self.file_name, 1), '') + tok.assert_not_called() + self.assertDictEqual(linecache.cache, {}) + + failure = linecache.failures[self.file_name] + self.assertEqual(failure[0], size) + self.assertAlmostEqual(failure[1], mtime) + + def test_getlines_called_once(self): + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + self.assertEqual(linecache.getlines(self.file_name), []) + tok.assert_called_once_with(self.file_name) + self.assertDictEqual(linecache.cache, {}) + + size, mtime, fullname = linecache.failures[self.file_name] + self.assertIsInstance(size, int) + self.assertIsInstance(mtime, int) + + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + self.assertEqual(linecache.getlines(self.file_name), []) + tok.assert_not_called() + self.assertEqual(linecache.getlines(self.file_name), []) + tok.assert_not_called() + self.assertDictEqual(linecache.cache, {}) + + failure = linecache.failures[self.file_name] + self.assertEqual(failure[0], size) + self.assertAlmostEqual(failure[1], mtime) class EmptyFile(GetLineTestsGoodData, unittest.TestCase): @@ -163,6 +219,22 @@ def test_clearcache(self): cached_empty = [fn for fn in cached if fn in linecache.cache] self.assertEqual(cached_empty, []) + def test_clearcache_failures(self): + # assert that the cache is cleared at the beginning of the test + linecache.clearcache() + self.assertListEqual(list(linecache.cache), []) + self.assertListEqual(list(linecache.failures), []) + + # try to cache a failure + linecache.getline(NONEXISTENT_FILENAME, 1) + self.assertListEqual(list(linecache.cache), []) + self.assertListEqual(list(linecache.failures), [NONEXISTENT_FILENAME]) + + # check that the cache is cleared + linecache.clearcache() + self.assertListEqual(list(linecache.cache), []) + self.assertListEqual(list(linecache.failures), []) + def test_checkcache(self): getline = linecache.getline # Create a source file and cache its contents @@ -196,6 +268,42 @@ def test_checkcache(self): self.assertEqual(line, getline(source_name, index + 1)) source_list.append(line) + def test_checkcache_removed(self): + linecache.clearcache() + # case: check that a cached file is never a failure even when removed + filename = os_helper.TESTFN + '.py' + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'w', encoding='utf-8') as source: + source.write(SOURCE_1) + # get the cached data + self.assertNotIn(filename, linecache.cache) + self.assertNotIn(filename, linecache.failures) + data = linecache.getlines(filename) + self.assertIn(filename, linecache.cache) + self.assertNotIn(filename, linecache.failures) + # remove the file + os_helper.unlink(filename) + # check that the data is still cached + data = linecache.getlines(filename) + self.assertIn(filename, linecache.cache) + self.assertNotIn(filename, linecache.failures) + # invalidate previous failures + linecache.checkcache(filename) + # no more cached + self.assertNotIn(filename, linecache.cache) + # the file is not stat()'able! + self.assertIn(filename, linecache.failures) + self.assertIsNone(linecache.failures[filename][0]) + self.assertIsNone(linecache.failures[filename][1]) + with unittest.mock.patch('os.stat') as statfn: + lines = linecache.getlines(filename) + statfn.assert_not_called() + self.assertListEqual(lines, []) + self.assertNotIn(filename, linecache.cache) + self.assertIn(filename, linecache.failures) + self.assertIsNone(linecache.failures[filename][0]) + self.assertIsNone(linecache.failures[filename][1]) + def test_lazycache_no_globals(self): lines = linecache.getlines(FILENAME) linecache.clearcache() @@ -323,5 +431,85 @@ def test_checkcache_with_no_parameter(self): self.assertIn(self.unchanged_file, linecache.cache) +class LineCacheUpdateTests(unittest.TestCase): + + def setUp(self): + linecache.clearcache() + self.filename = os_helper.TESTFN + '.py' + self.addCleanup(os_helper.unlink, self.filename) + + def write_good_data(self): + with open(self.filename, 'w', encoding='utf-8') as source: + source.write(SOURCE_1) + with open(self.filename, encoding='utf-8') as source: + return source.readlines() + + def write_bad_data(self): + with open(self.filename, 'wb') as source: + source.write(b'\n\x80') + + def test_updatecache_with_missing_file(self): + # remove the filename + os_helper.unlink(self.filename) + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + for _ in range(3): + linecache.updatecache(self.filename) + tok.assert_not_called() + self.assertDictEqual(linecache.cache, {}) + self.assertIn(self.filename, linecache.failures) + failure = linecache.failures[self.filename] + self.assertIsNone(failure[0]) + self.assertIsNone(failure[1]) + + def test_updatecache_after_fix_missing(self): + # remove the filename + os_helper.unlink(self.filename) + linecache.updatecache(self.filename) + self.assertDictEqual(linecache.cache, {}) + self.assertIn(self.filename, linecache.failures) + failure = linecache.failures[self.filename] + self.assertIsNone(failure[0]) + self.assertIsNone(failure[1]) + # patch the failure + data = self.write_good_data() + linecache.updatecache(self.filename) + self.assertDictEqual(linecache.failures, {}) + self.assertIn(self.filename, linecache.cache) + self.assertListEqual(linecache.getlines(self.filename), data) + + def test_updatecache_after_fix_decoding(self): + self.write_bad_data() + # mark it as a failure + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + linecache.updatecache(self.filename) + tok.assert_called_once_with(self.filename) + + self.assertDictEqual(linecache.cache, {}) + self.assertIn(self.filename, linecache.failures) + failure = linecache.failures[self.filename] + # failure but still with known stats + self.assertIsInstance(failure[0], int) + self.assertIsInstance(failure[1], int) + size, mtime, fullname = failure + + with unittest.mock.patch('tokenize.open', wraps=tokenize.open) as tok: + for _ in range(3): + linecache.updatecache(self.filename) + tok.assert_not_called() + self.assertDictEqual(linecache.cache, {}) + self.assertIn(self.filename, linecache.failures) + failure = linecache.failures[self.filename] + # failure but still with same stats + self.assertEqual(failure[0], size) + self.assertAlmostEqual(failure[1], mtime) + + # patch the unicode content + data = self.write_good_data() + linecache.updatecache(self.filename) + self.assertDictEqual(linecache.failures, {}) + self.assertIn(self.filename, linecache.cache) + self.assertListEqual(linecache.getlines(self.filename), data) + + if __name__ == "__main__": unittest.main() From 388b2ab4a97285dff5d3631e4866da89e16f0846 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:21:18 +0200 Subject: [PATCH 3/5] blurb --- .../next/Library/2024-07-01-13-21-14.gh-issue-94436.WXRv3Q.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-07-01-13-21-14.gh-issue-94436.WXRv3Q.rst diff --git a/Misc/NEWS.d/next/Library/2024-07-01-13-21-14.gh-issue-94436.WXRv3Q.rst b/Misc/NEWS.d/next/Library/2024-07-01-13-21-14.gh-issue-94436.WXRv3Q.rst new file mode 100644 index 00000000000000..2a34ea6bdf43bb --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-01-13-21-14.gh-issue-94436.WXRv3Q.rst @@ -0,0 +1 @@ +TODO. From 68b1d7dff784894153016850f86e55bb51b85597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:40:05 +0200 Subject: [PATCH 4/5] fix tests --- Lib/linecache.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/Lib/linecache.py b/Lib/linecache.py index 2c9b3987483010..9fc52caac31801 100644 --- a/Lib/linecache.py +++ b/Lib/linecache.py @@ -37,6 +37,12 @@ def getline(filename, lineno, module_globals=None): """ if filename in failures: + # We explicitly validate the input even for failures + # because for success, they would raise a TypeError. + if not isinstance(lineno, int): + raise TypeError(f"'lineno' must be an int, " + f"got {type(module_globals)}") + _validate_module_globals_type(module_globals) return '' lines = getlines(filename, module_globals) @@ -51,6 +57,8 @@ def getlines(filename, module_globals=None): Previous failures are cached and must be invalidated via checkcache(). """ + _validate_module_globals_type(module_globals) + if filename in cache: assert filename not in failures entry = cache[filename] @@ -136,7 +144,8 @@ def updatecache(filename, module_globals=None): """Update a cache entry and return its list of lines. If something's wrong, possibly print a message, discard the cache entry, - add the file name to the known failures, and return an empty list.""" + add the file name to the known failures, and return an empty list. + """ # These imports are not at top level because linecache is in the critical # path of the interpreter startup and importing os and sys take a lot of time @@ -145,6 +154,8 @@ def updatecache(filename, module_globals=None): import sys import tokenize + _validate_module_globals_type(module_globals) + if filename in cache: if len(cache[filename]) != 1: cache.pop(filename, None) @@ -287,3 +298,14 @@ def _register_code(code, string, name): None, [line + '\n' for line in string.splitlines()], name) + + +def _validate_module_globals_type(module_globals): + if module_globals is not None: + # Validation is done here and not in linecache + # since the latter may not necessarily use it. + from collections.abc import Mapping + + if not isinstance(module_globals, Mapping): + raise TypeError(f"'module_globals' must be a Mapping, " + f"got {type(module_globals)}") From 5a27394a73f0a3ee1b583ba8858ee0849d702c23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:44:33 +0200 Subject: [PATCH 5/5] fix typo --- Lib/linecache.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/linecache.py b/Lib/linecache.py index 9fc52caac31801..4a65c76c0329e0 100644 --- a/Lib/linecache.py +++ b/Lib/linecache.py @@ -37,11 +37,10 @@ def getline(filename, lineno, module_globals=None): """ if filename in failures: - # We explicitly validate the input even for failures - # because for success, they would raise a TypeError. + # We explicitly validate the arguments, because + # for cached entries, they would raise a TypeError. if not isinstance(lineno, int): - raise TypeError(f"'lineno' must be an int, " - f"got {type(module_globals)}") + raise TypeError(f"'lineno' must be an int, got {type(lineno)}") _validate_module_globals_type(module_globals) return ''