From 11b96023e0283c0a0017066dab17430117118ad6 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 15 Nov 2024 01:47:41 +0000 Subject: [PATCH 1/4] GH-85168: Use filesystem encoding when converting to/from `file` URIs Adjust `urllib.request.url2pathname()` and `pathname2url()` to use the filesystem encoding when quoting and unquoting file URIs, rather than forcing use of UTF-8. No changes are needed in the `nturl2path` module because Windows always uses UTF-8, per PEP 529. --- Lib/test/test_urllib.py | 17 +++++++++++++---- Lib/test/test_urllib2.py | 4 ---- Lib/urllib/request.py | 6 +++--- ...024-11-15-01-50-36.gh-issue-85168.bP8VIN.rst | 4 ++++ 4 files changed, 20 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-15-01-50-36.gh-issue-85168.bP8VIN.rst diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 2c53ce3f99e675..91ab012ff041f2 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -709,10 +709,6 @@ def tearDown(self): def constructLocalFileUrl(self, filePath): filePath = os.path.abspath(filePath) - try: - filePath.encode("utf-8") - except UnicodeEncodeError: - raise unittest.SkipTest("filePath is not encodable to utf8") return "file://%s" % urllib.request.pathname2url(filePath) def createNewTempFile(self, data=b""): @@ -1561,6 +1557,13 @@ def test_pathname2url_posix(self): self.assertEqual(fn('/'), '/') self.assertEqual(fn('/a/b.c'), '/a/b.c') self.assertEqual(fn('/a/b%#c'), '/a/b%25%23c') + try: + expect = os.fsencode('\xe9') + except UnicodeEncodeError: + pass + else: + expect = urllib.parse.quote_from_bytes(expect) + self.assertEqual(fn('\xe9'), expect) @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') @@ -1611,6 +1614,12 @@ def test_url2pathname_posix(self): self.assertEqual(fn('///foo/bar'), '/foo/bar') self.assertEqual(fn('////foo/bar'), '//foo/bar') self.assertEqual(fn('//localhost/foo/bar'), '//localhost/foo/bar') + try: + expect = os.fsdecode(b'\xe9') + except UnicodeDecodeError: + pass + else: + self.assertEqual(fn('%e9'), expect) class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in the urllib.""" diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index b90ccc2f125b93..99ad11cf0552eb 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -718,10 +718,6 @@ def test_processors(self): def sanepathname2url(path): - try: - path.encode("utf-8") - except UnicodeEncodeError: - raise unittest.SkipTest("path is not encodable to utf8") urlpath = urllib.request.pathname2url(path) if os.name == "nt" and urlpath.startswith("///"): urlpath = urlpath[2:] diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 18a837dd57ed59..73892e363ab611 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -103,7 +103,7 @@ urlparse, urlsplit, urljoin, unwrap, quote, unquote, _splittype, _splithost, _splitport, _splituser, _splitpasswd, _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes, - unquote_to_bytes, urlunparse) + quote_from_bytes, unquote_to_bytes, urlunparse) from urllib.response import addinfourl, addclosehook # check for SSL @@ -1660,12 +1660,12 @@ def url2pathname(pathname): # URL has an empty authority section, so the path begins on the # third character. pathname = pathname[2:] - return unquote(pathname) + return os.fsdecode(unquote_to_bytes(pathname)) def pathname2url(pathname): """OS-specific conversion from a file system path to a relative URL of the 'file' scheme; not recommended for general use.""" - return quote(pathname) + return quote_from_bytes(os.fsencode(pathname)) ftpcache = {} diff --git a/Misc/NEWS.d/next/Library/2024-11-15-01-50-36.gh-issue-85168.bP8VIN.rst b/Misc/NEWS.d/next/Library/2024-11-15-01-50-36.gh-issue-85168.bP8VIN.rst new file mode 100644 index 00000000000000..abceda8f6fd707 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-15-01-50-36.gh-issue-85168.bP8VIN.rst @@ -0,0 +1,4 @@ +Fix issue where :func:`urllib.request.url2pathname` and +:func:`~urllib.request.pathname2url` always used UTF-8 when quoting and +unquoting file URIs. They now use the :term:`filesystem encoding and error +handler`. From a6f57d333994ee4066f0e7ba00bc0d8d9c5f090a Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 15 Nov 2024 18:52:02 +0000 Subject: [PATCH 2/4] Address review feedback --- Lib/test/test_urllib.py | 27 ++++++++++++++------------- Lib/urllib/request.py | 8 ++++++-- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 91ab012ff041f2..1b75f33a7c6979 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1557,13 +1557,13 @@ def test_pathname2url_posix(self): self.assertEqual(fn('/'), '/') self.assertEqual(fn('/a/b.c'), '/a/b.c') self.assertEqual(fn('/a/b%#c'), '/a/b%25%23c') - try: - expect = os.fsencode('\xe9') - except UnicodeEncodeError: - pass - else: - expect = urllib.parse.quote_from_bytes(expect) - self.assertEqual(fn('\xe9'), expect) + + @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') + def test_pathname2url_nonascii(self): + encoding = sys.getfilesystemencoding() + errors = sys.getfilesystemencodeerrors() + url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors) + self.assertEqual(urllib.request.pathname2url(os_helper.FS_NONASCII), url) @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') @@ -1614,12 +1614,13 @@ def test_url2pathname_posix(self): self.assertEqual(fn('///foo/bar'), '/foo/bar') self.assertEqual(fn('////foo/bar'), '//foo/bar') self.assertEqual(fn('//localhost/foo/bar'), '//localhost/foo/bar') - try: - expect = os.fsdecode(b'\xe9') - except UnicodeDecodeError: - pass - else: - self.assertEqual(fn('%e9'), expect) + + @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') + def test_url2pathname_nonascii(self): + encoding = sys.getfilesystemencoding() + errors = sys.getfilesystemencodeerrors() + url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors) + self.assertEqual(urllib.request.url2pathname(url), os_helper.FS_NONASCII) class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in the urllib.""" diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 73892e363ab611..8482809f1fada1 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1660,12 +1660,16 @@ def url2pathname(pathname): # URL has an empty authority section, so the path begins on the # third character. pathname = pathname[2:] - return os.fsdecode(unquote_to_bytes(pathname)) + encoding = sys.getfilesystemencoding() + errors = sys.getfilesystemencodeerrors() + return unquote(pathname, encoding=encoding, errors=errors) def pathname2url(pathname): """OS-specific conversion from a file system path to a relative URL of the 'file' scheme; not recommended for general use.""" - return quote_from_bytes(os.fsencode(pathname)) + encoding = sys.getfilesystemencoding() + errors = sys.getfilesystemencodeerrors() + return quote(pathname, encoding=encoding, errors=errors) ftpcache = {} From f123044ecdfabac7a1e31ce0322bebf725802e45 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 15 Nov 2024 22:25:05 +0000 Subject: [PATCH 3/4] Undo unnecessary import --- Lib/urllib/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 8482809f1fada1..c02c36d4899401 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -103,7 +103,7 @@ urlparse, urlsplit, urljoin, unwrap, quote, unquote, _splittype, _splithost, _splitport, _splituser, _splitpasswd, _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes, - quote_from_bytes, unquote_to_bytes, urlunparse) + unquote_to_bytes, urlunparse) from urllib.response import addinfourl, addclosehook # check for SSL From b03b10da3acfebf0bb423cd8681e6ecfb4ae5eee Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 16 Nov 2024 17:27:30 +0000 Subject: [PATCH 4/4] Add test for unquoted non-ASCII character in URL. --- Lib/test/test_urllib.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 1b75f33a7c6979..ab18e80663e3bc 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1619,7 +1619,9 @@ def test_url2pathname_posix(self): def test_url2pathname_nonascii(self): encoding = sys.getfilesystemencoding() errors = sys.getfilesystemencodeerrors() - url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors) + url = os_helper.FS_NONASCII + self.assertEqual(urllib.request.url2pathname(url), os_helper.FS_NONASCII) + url = urllib.parse.quote(url, encoding=encoding, errors=errors) self.assertEqual(urllib.request.url2pathname(url), os_helper.FS_NONASCII) class Utility_Tests(unittest.TestCase):