Remove '#' only from the safe characters used when quoting the URL path

Gallaecio · Gallaecio · commit bad299a9edcd · 2019-10-18T13:39:00.000+02:00
diff --git a/w3lib/url.py b/w3lib/url.py
@@ -33,9 +33,7 @@ def _quote_byte(error):
 EXTRA_SAFE_CHARS = b'|'  # see https://github.com/scrapy/w3lib/pull/25
 
 _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
-
-# see https://github.com/scrapy/w3lib/issues/91
-_safe_chars = _safe_chars.replace(b'#', b'')
+_path_safe_chars = _safe_chars.replace(b'#', b'')
 
 _ascii_tab_newline_re = re.compile(r'[\t\n\r]')  # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
 
@@ -417,7 +415,7 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
         to_native_str(netloc),
 
         # default encoding for path component SHOULD be UTF-8
-        quote(to_bytes(parts.path, path_encoding), _safe_chars),
+        quote(to_bytes(parts.path, path_encoding), _path_safe_chars),
         quote(to_bytes(parts.params, path_encoding), _safe_chars),
 
         # encoding of query and fragment follows page encoding
@@ -505,7 +503,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
     #    and percent-encode path again (this normalizes to upper-case %XX)
     uqp = _unquotepath(path)
-    path = quote(uqp, _safe_chars) or '/'
+    path = quote(uqp, _path_safe_chars) or '/'
 
     fragment = '' if not keep_fragments else fragment