11import codecs
22import datetime
3- from decimal import Decimal
43import locale
4+ from decimal import Decimal
5+ try :
6+ from types import NoneType
7+ except ImportError :
8+ NoneType = type (None )
59from urllib .parse import quote
610
711from .functional import Promise
812
class DjangoUnicodeDecodeError(UnicodeDecodeError):
    """UnicodeDecodeError whose message also shows the offending input."""

    def __str__(self):
        """Return the standard decode-error text followed by the input.

        ``self.object`` is the standard ``UnicodeDecodeError`` attribute
        holding the object the codec was attempting to decode.
        """
        return "%s. You passed in %r (%s)" % (
            super().__str__(),
            self.object,
            type(self.object),
        )
21+
1822
def smart_str(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Return a string representing 's'. Treat bytestrings using the 'encoding'
    codec.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    if isinstance(s, Promise):
        # The input is the result of a gettext_lazy() call.
        return s
    return force_str(s, encoding, strings_only, errors)
34+
35+
36+ _PROTECTED_TYPES = (
37+ NoneType ,
38+ int ,
39+ float ,
40+ Decimal ,
41+ datetime .datetime ,
42+ datetime .date ,
43+ datetime .time ,
44+ )
45+
3046
3147def is_protected_type (obj ):
3248 """Determine if the object instance is of a protected type.
3349
3450 Objects of protected types are preserved as-is when passed to
35- force_text (strings_only=True).
51+ force_str (strings_only=True).
3652 """
37- return isinstance (obj , (int , ) + (type (None ), float , Decimal ,
38- datetime .datetime , datetime .date , datetime .time ))
53+ return isinstance (obj , _PROTECTED_TYPES )
3954
def force_str(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Similar to smart_str(), except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.

    Raise DjangoUnicodeDecodeError if 's' is a bytestring that cannot be
    decoded with 'encoding' under the given 'errors' policy.
    """
    # Handle the common case first for performance reasons.
    if issubclass(type(s), str):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if isinstance(s, bytes):
            s = str(s, encoding, errors)
        else:
            s = str(s)
    except UnicodeDecodeError as e:
        # Re-raise as the Django subclass; "from None" hides the original
        # exception from the traceback chain.
        raise DjangoUnicodeDecodeError(*e.args) from None
    return s
9176
92- def smart_bytes (s , encoding = 'utf-8' , strings_only = False , errors = 'strict' ):
77+
78+ def smart_bytes (s , encoding = "utf-8" , strings_only = False , errors = "strict" ):
9379 """
94- Returns a bytestring version of 's', encoded as specified in 'encoding'.
80+ Return a bytestring version of 's', encoded as specified in 'encoding'.
9581
9682 If strings_only is True, don't convert (some) non-string-like objects.
9783 """
@@ -101,101 +87,179 @@ def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
10187 return force_bytes (s , encoding , strings_only , errors )
10288
10389
def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
    """
    Similar to smart_bytes, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first for performance reasons.
    if isinstance(s, bytes):
        if encoding == "utf-8":
            return s
        else:
            # Bytes input is taken to be UTF-8 and transcoded to 'encoding'.
            return s.decode("utf-8", errors).encode(encoding, errors)
    if strings_only and is_protected_type(s):
        return s
    if isinstance(s, memoryview):
        return bytes(s)
    return str(s).encode(encoding, errors)
147109
def iri_to_uri(iri):
    """
    Convert an Internationalized Resource Identifier (IRI) portion to a URI
    portion that is suitable for inclusion in a URL.

    This is the algorithm from RFC 3987 Section 3.1, slightly simplified since
    the input is assumed to be a string rather than an arbitrary byte stream.

    Take an IRI (string or UTF-8 bytes, e.g. '/I ♥ Django/' or
    b'/I \\xe2\\x99\\xa5 Django/') and return a string containing the encoded
    result with ASCII chars only (e.g. '/I%20%E2%99%A5%20Django/').
    """
    # The list of safe characters here is constructed from the "reserved" and
    # "unreserved" characters specified in RFC 3986 Sections 2.2 and 2.3:
    #     reserved    = gen-delims / sub-delims
    #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
    #                   / "*" / "+" / "," / ";" / "="
    #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
    # Of the unreserved characters, urllib.parse.quote() already considers all
    # but the ~ safe.
    # The % character is also added to the list of safe characters here, as the
    # end of RFC 3987 Section 3.1 specifically mentions that % must not be
    # converted.
    if iri is None:
        return iri
    elif isinstance(iri, Promise):
        iri = str(iri)
    return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
174139
# List of byte values that uri_to_iri() decodes from percent encoding.
# First, the unreserved characters from RFC 3986:
_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
_hextobyte = {
    (fmt % char).encode(): bytes((char,))
    for ascii_range in _ascii_ranges
    for char in ascii_range
    for fmt in ["%02x", "%02X"]
}
# And then everything above 128, because bytes ≥ 128 are part of multibyte
# Unicode characters.
_hexdig = "0123456789ABCDEFabcdef"
_hextobyte.update(
    {(a + b).encode(): bytes.fromhex(a + b) for a in _hexdig[8:] for b in _hexdig}
)


def uri_to_iri(uri):
    """
    Convert a Uniform Resource Identifier (URI) into an Internationalized
    Resource Identifier (IRI).

    This is the algorithm from RFC 3987 Section 3.2, excluding step 4.

    Take a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and return
    a string containing the encoded result (e.g. '/I%20♥%20Django/').
    """
    if uri is None:
        return uri
    uri = force_bytes(uri)
    # Fast selective unquote: First, split on '%' and then starting with the
    # second block, decode the first 2 bytes if they represent a hex code to
    # decode. The rest of the block is the part after '%AB', not containing
    # any '%'. Add that to the output without further processing.
    bits = uri.split(b"%")
    if len(bits) == 1:
        iri = uri
    else:
        parts = [bits[0]]
        append = parts.append
        hextobyte = _hextobyte
        for item in bits[1:]:
            hex_code = item[:2]
            if hex_code in hextobyte:
                append(hextobyte[hex_code])
                append(item[2:])
            else:
                # Not a sequence we decode: put the '%' back untouched.
                append(b"%")
                append(item)
        iri = b"".join(parts)
    return repercent_broken_unicode(iri).decode()
192+
193+
def escape_uri_path(path):
    """
    Escape the unsafe characters from the path portion of a Uniform Resource
    Identifier (URI).
    """
    # These are the "reserved" and "unreserved" characters specified in RFC
    # 3986 Sections 2.2 and 2.3:
    #   reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
    #   unreserved  = alphanum | mark
    #   mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    # The list of safe characters here is constructed subtracting ";", "=",
    # and "?" according to RFC 3986 Section 3.3.
    # The reason for not subtracting and escaping "/" is that we are escaping
    # the entire path, not a path segment.
    return quote(path, safe="/:@&+$,-_.!~*'()")
209+
210+
def punycode(domain):
    """Return the Punycode of the given domain if it's non-ASCII."""
    return domain.encode("idna").decode("ascii")
214+
215+
def repercent_broken_unicode(path):
    """
    As per RFC 3987 Section 3.2, step three of converting a URI into an IRI,
    repercent-encode any octet produced that is not part of a strictly legal
    UTF-8 octet sequence.

    Take and return bytes; the returned value decodes cleanly as UTF-8.
    """
    changed_parts = []
    while True:
        try:
            path.decode()
        except UnicodeDecodeError as e:
            # CVE-2019-14235: A recursion shouldn't be used since the exception
            # handling uses massive amounts of memory
            repercent = quote(path[e.start : e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
            # Keep the valid prefix plus the re-encoded bad octets, then carry
            # on scanning the remainder.
            changed_parts.append(path[: e.start] + repercent.encode())
            path = path[e.end :]
        else:
            return b"".join(changed_parts) + path
234+
235+
def filepath_to_uri(path):
    """Convert a file system path to a URI portion that is suitable for
    inclusion in a URL.

    Encode certain chars that would normally be recognized as special chars
    for URIs. Do not encode the ' character, as it is a valid character
    within URIs. See the encodeURIComponent() JavaScript function for details.
    """
    if path is None:
        return path
    # I know about `os.sep` and `os.altsep` but I want to leave
    # some flexibility for hardcoding separators.
    return quote(str(path).replace("\\", "/"), safe="/~!*()'")
193249
def get_system_encoding():
    """
    The encoding for the character type functions. Fallback to 'ascii' if the
    encoding is unsupported by Python or could not be determined. See tickets
    #10335 and #5846.
    """
    try:
        encoding = locale.getlocale()[1] or "ascii"
        # Raises LookupError when Python has no codec by that name.
        codecs.lookup(encoding)
    except Exception:
        encoding = "ascii"
    return encoding


DEFAULT_LOCALE_ENCODING = get_system_encoding()