Skip to content

Commit 60c177a

Browse files
authored
Merge pull request #44 from uda/update-django-5-2
2 parents c1e47f4 + d2ba18f commit 60c177a

File tree

8 files changed

+860
-809
lines changed

8 files changed

+860
-809
lines changed
Lines changed: 179 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1,97 +1,83 @@
11
import codecs
22
import datetime
3-
from decimal import Decimal
43
import locale
4+
from decimal import Decimal
5+
try:
6+
from types import NoneType
7+
except ImportError:
8+
NoneType = type(None)
59
from urllib.parse import quote
610

711
from .functional import Promise
812

9-
class DjangoUnicodeDecodeError(UnicodeDecodeError):
10-
def __init__(self, obj, *args):
11-
self.obj = obj
12-
UnicodeDecodeError.__init__(self, *args)
1313

14+
class DjangoUnicodeDecodeError(UnicodeDecodeError):
1415
def __str__(self):
15-
original = UnicodeDecodeError.__str__(self)
16-
return '{}. You passed in {!r} ({})'.format(original, self.obj,
17-
type(self.obj))
16+
return "%s. You passed in %r (%s)" % (
17+
super().__str__(),
18+
self.object,
19+
type(self.object),
20+
)
21+
1822

19-
def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
23+
def smart_str(s, encoding="utf-8", strings_only=False, errors="strict"):
2024
"""
21-
Returns a text object representing 's' -- unicode on Python 2 and str on
22-
Python 3. Treats bytestrings using the 'encoding' codec.
25+
Return a string representing 's'. Treat bytestrings using the 'encoding'
26+
codec.
2327
2428
If strings_only is True, don't convert (some) non-string-like objects.
2529
"""
2630
if isinstance(s, Promise):
2731
# The input is the result of a gettext_lazy() call.
2832
return s
29-
return force_text(s, encoding, strings_only, errors)
33+
return force_str(s, encoding, strings_only, errors)
34+
35+
36+
_PROTECTED_TYPES = (
37+
NoneType,
38+
int,
39+
float,
40+
Decimal,
41+
datetime.datetime,
42+
datetime.date,
43+
datetime.time,
44+
)
45+
3046

3147
def is_protected_type(obj):
3248
"""Determine if the object instance is of a protected type.
3349
3450
Objects of protected types are preserved as-is when passed to
35-
force_text(strings_only=True).
51+
force_str(strings_only=True).
3652
"""
37-
return isinstance(obj, (int, ) + (type(None), float, Decimal,
38-
datetime.datetime, datetime.date, datetime.time))
53+
return isinstance(obj, _PROTECTED_TYPES)
3954

40-
def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
55+
56+
def force_str(s, encoding="utf-8", strings_only=False, errors="strict"):
4157
"""
42-
Similar to smart_text, except that lazy instances are resolved to
58+
Similar to smart_str(), except that lazy instances are resolved to
4359
strings, rather than kept as lazy objects.
4460
4561
If strings_only is True, don't convert (some) non-string-like objects.
4662
"""
47-
# Handle the common case first, saves 30-40% when s is an instance
48-
# of str. This function gets called often in that setting.
49-
if isinstance(s, str):
63+
# Handle the common case first for performance reasons.
64+
if issubclass(type(s), str):
5065
return s
5166
if strings_only and is_protected_type(s):
5267
return s
5368
try:
54-
if not isinstance(s, str):
55-
if hasattr(s, '__unicode__'):
56-
s = s.__unicode__()
57-
else:
58-
try:
59-
if isinstance(s, bytes):
60-
s = str(s, encoding, errors)
61-
else:
62-
s = str(s)
63-
except UnicodeEncodeError:
64-
if not isinstance(s, Exception):
65-
raise
66-
# If we get to here, the caller has passed in an Exception
67-
# subclass populated with non-ASCII data without special
68-
# handling to display as a string. We need to handle this
69-
# without raising a further exception. We do an
70-
# approximation to what the Exception's standard str()
71-
# output should be.
72-
s = ' '.join([force_text(arg, encoding, strings_only,
73-
errors) for arg in s])
69+
if isinstance(s, bytes):
70+
s = str(s, encoding, errors)
7471
else:
75-
# Note: We use .decode() here, instead of str(s, encoding,
76-
# errors), so that if s is a SafeBytes, it ends up being a
77-
# SafeText at the end.
78-
s = s.decode(encoding, errors)
72+
s = str(s)
7973
except UnicodeDecodeError as e:
80-
if not isinstance(s, Exception):
81-
raise DjangoUnicodeDecodeError(s, *e.args)
82-
else:
83-
# If we get to here, the caller has passed in an Exception
84-
# subclass populated with non-ASCII bytestring data without a
85-
# working unicode method. Try to handle this without raising a
86-
# further exception by individually forcing the exception args
87-
# to unicode.
88-
s = ' '.join([force_text(arg, encoding, strings_only,
89-
errors) for arg in s])
74+
raise DjangoUnicodeDecodeError(*e.args) from None
9075
return s
9176

92-
def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
77+
78+
def smart_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
9379
"""
94-
Returns a bytestring version of 's', encoded as specified in 'encoding'.
80+
Return a bytestring version of 's', encoded as specified in 'encoding'.
9581
9682
If strings_only is True, don't convert (some) non-string-like objects.
9783
"""
@@ -101,101 +87,179 @@ def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
10187
return force_bytes(s, encoding, strings_only, errors)
10288

10389

104-
def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
90+
def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
10591
"""
10692
Similar to smart_bytes, except that lazy instances are resolved to
10793
strings, rather than kept as lazy objects.
10894
10995
If strings_only is True, don't convert (some) non-string-like objects.
11096
"""
97+
# Handle the common case first for performance reasons.
11198
if isinstance(s, bytes):
112-
if encoding == 'utf-8':
99+
if encoding == "utf-8":
113100
return s
114101
else:
115-
return s.decode('utf-8', errors).encode(encoding, errors)
116-
if strings_only and (s is None or isinstance(s, int)):
102+
return s.decode("utf-8", errors).encode(encoding, errors)
103+
if strings_only and is_protected_type(s):
117104
return s
118-
if isinstance(s, Promise):
119-
return str.encode(encoding, errors)
120-
if not isinstance(s, str):
121-
try:
122-
return str(s).encode(encoding)
123-
except UnicodeEncodeError:
124-
if isinstance(s, Exception):
125-
# An Exception subclass containing non-ASCII data that doesn't
126-
# know how to print itself properly. We shouldn't raise a
127-
# further exception.
128-
return ' '.join([smart_bytes(arg, encoding, strings_only,
129-
errors) for arg in s])
130-
return str(s).encode(encoding, errors)
131-
else:
132-
return s.encode(encoding, errors)
133-
134-
135-
smart_str = smart_text
136-
force_str = force_text
137-
138-
smart_str.__doc__ = """\
139-
Apply smart_text in Python 3 and smart_bytes in Python 2.
140-
141-
This is suitable for writing to sys.stdout (for instance).
142-
"""
105+
if isinstance(s, memoryview):
106+
return bytes(s)
107+
return str(s).encode(encoding, errors)
143108

144-
force_str.__doc__ = """\
145-
Apply force_text in Python 3 and force_bytes in Python 2.
146-
"""
147109

148110
def iri_to_uri(iri):
149111
"""
150112
Convert an Internationalized Resource Identifier (IRI) portion to a URI
151113
portion that is suitable for inclusion in a URL.
152114
153-
This is the algorithm from section 3.1 of RFC 3987. However, since we are
154-
assuming input is either UTF-8 or unicode already, we can simplify things a
155-
little from the full method.
115+
This is the algorithm from RFC 3987 Section 3.1, slightly simplified since
116+
the input is assumed to be a string rather than an arbitrary byte stream.
156117
157-
Returns an ASCII string containing the encoded result.
118+
Take an IRI (string or UTF-8 bytes, e.g. '/I ♥ Django/' or
119+
b'/I \xe2\x99\xa5 Django/') and return a string containing the encoded
120+
result with ASCII chars only (e.g. '/I%20%E2%99%A5%20Django/').
158121
"""
159122
# The list of safe characters here is constructed from the "reserved" and
160-
# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
123+
# "unreserved" characters specified in RFC 3986 Sections 2.2 and 2.3:
161124
# reserved = gen-delims / sub-delims
162125
# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
163126
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
164127
# / "*" / "+" / "," / ";" / "="
165128
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
166-
# Of the unreserved characters, urllib.quote already considers all but
167-
# the ~ safe.
129+
# Of the unreserved characters, urllib.parse.quote() already considers all
130+
# but the ~ safe.
168131
# The % character is also added to the list of safe characters here, as the
169-
# end of section 3.1 of RFC 3987 specifically mentions that % must not be
132+
# end of RFC 3987 Section 3.1 specifically mentions that % must not be
170133
# converted.
171134
if iri is None:
172135
return iri
173-
return quote(smart_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
136+
elif isinstance(iri, Promise):
137+
iri = str(iri)
138+
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
174139

175-
def filepath_to_uri(path):
176-
"""Convert an file system path to a URI portion that is suitable for
177-
inclusion in a URL.
178140

179-
We are assuming input is either UTF-8 or unicode already.
141+
# List of byte values that uri_to_iri() decodes from percent encoding.
142+
# First, the unreserved characters from RFC 3986:
143+
_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
144+
_hextobyte = {
145+
(fmt % char).encode(): bytes((char,))
146+
for ascii_range in _ascii_ranges
147+
for char in ascii_range
148+
for fmt in ["%02x", "%02X"]
149+
}
150+
# And then everything from 128 upward, because bytes ≥ 128 are part of multibyte
151+
# Unicode characters.
152+
_hexdig = "0123456789ABCDEFabcdef"
153+
_hextobyte.update(
154+
{(a + b).encode(): bytes.fromhex(a + b) for a in _hexdig[8:] for b in _hexdig}
155+
)
156+
180157

181-
This method will encode certain chars that would normally be recognized as
182-
special chars for URIs. Note that this method does not encode the '
183-
character, as it is a valid character within URIs. See
184-
encodeURIComponent() JavaScript function for more details.
158+
def uri_to_iri(uri):
159+
"""
160+
Convert a Uniform Resource Identifier (URI) into an Internationalized
161+
Resource Identifier (IRI).
162+
163+
This is the algorithm from RFC 3987 Section 3.2, excluding step 4.
185164
186-
Returns an ASCII string containing the encoded result.
165+
Take a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and return
166+
a string containing the decoded result (e.g. '/I%20♥%20Django/').
167+
"""
168+
if uri is None:
169+
return uri
170+
uri = force_bytes(uri)
171+
# Fast selective unquote: First, split on '%' and then starting with the
172+
# second block, decode the first 2 bytes if they represent a hex code to
173+
# decode. The rest of the block is the part after '%AB', not containing
174+
# any '%'. Add that to the output without further processing.
175+
bits = uri.split(b"%")
176+
if len(bits) == 1:
177+
iri = uri
178+
else:
179+
parts = [bits[0]]
180+
append = parts.append
181+
hextobyte = _hextobyte
182+
for item in bits[1:]:
183+
hex = item[:2]
184+
if hex in hextobyte:
185+
append(hextobyte[item[:2]])
186+
append(item[2:])
187+
else:
188+
append(b"%")
189+
append(item)
190+
iri = b"".join(parts)
191+
return repercent_broken_unicode(iri).decode()
192+
193+
194+
def escape_uri_path(path):
195+
"""
196+
Escape the unsafe characters from the path portion of a Uniform Resource
197+
Identifier (URI).
198+
"""
199+
# These are the "reserved" and "unreserved" characters specified in RFC
200+
# 2396 Sections 2.2 and 2.3 (since obsoleted by RFC 3986):
201+
# reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
202+
# unreserved = alphanum | mark
203+
# mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
204+
# The list of safe characters here is constructed subtracting ";", "=",
205+
# and "?" according to RFC 3986 Section 3.3.
206+
# The reason for not subtracting and escaping "/" is that we are escaping
207+
# the entire path, not a path segment.
208+
return quote(path, safe="/:@&+$,-_.!~*'()")
209+
210+
211+
def punycode(domain):
212+
"""Return the Punycode of the given domain if it's non-ASCII."""
213+
return domain.encode("idna").decode("ascii")
214+
215+
216+
def repercent_broken_unicode(path):
217+
"""
218+
As per RFC 3987 Section 3.2, step three of converting a URI into an IRI,
219+
repercent-encode any octet produced that is not part of a strictly legal
220+
UTF-8 octet sequence.
221+
"""
222+
changed_parts = []
223+
while True:
224+
try:
225+
path.decode()
226+
except UnicodeDecodeError as e:
227+
# CVE-2019-14235: Recursion must not be used here, since the exception
228+
# handling uses massive amounts of memory
229+
repercent = quote(path[e.start : e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
230+
changed_parts.append(path[: e.start] + repercent.encode())
231+
path = path[e.end :]
232+
else:
233+
return b"".join(changed_parts) + path
234+
235+
236+
def filepath_to_uri(path):
237+
"""Convert a file system path to a URI portion that is suitable for
238+
inclusion in a URL.
239+
240+
Encode certain chars that would normally be recognized as special chars
241+
for URIs. Do not encode the ' character, as it is a valid character
242+
within URIs. See the encodeURIComponent() JavaScript function for details.
187243
"""
188244
if path is None:
189245
return path
190246
# I know about `os.sep` and `os.altsep` but I want to leave
191247
# some flexibility for hardcoding separators.
192-
return quote(smart_bytes(path.replace("\\", "/")), safe=b"/~!*()'")
248+
return quote(str(path).replace("\\", "/"), safe="/~!*()'")
193249

194-
# The encoding of the default system locale but falls back to the
195-
# given fallback encoding if the encoding is unsupported by python or could
196-
# not be determined. See tickets #10335 and #5846
197-
try:
198-
DEFAULT_LOCALE_ENCODING = locale.getlocale()[1] or 'ascii'
199-
codecs.lookup(DEFAULT_LOCALE_ENCODING)
200-
except:
201-
DEFAULT_LOCALE_ENCODING = 'ascii'
250+
251+
def get_system_encoding():
252+
"""
253+
The encoding for the character type functions. Fall back to 'ascii' if the
254+
encoding is unsupported by Python or could not be determined. See tickets
255+
#10335 and #5846.
256+
"""
257+
try:
258+
encoding = locale.getlocale()[1] or "ascii"
259+
codecs.lookup(encoding)
260+
except Exception:
261+
encoding = "ascii"
262+
return encoding
263+
264+
265+
DEFAULT_LOCALE_ENCODING = get_system_encoding()

0 commit comments

Comments
 (0)