getpelican
diff --git a/‎feedgenerator/django/utils/encoding.py‎
Lines changed: 179 additions & 115 deletions b/‎feedgenerator/django/utils/encoding.py‎
Lines changed: 179 additions & 115 deletions
@@ -1,97 +1,83 @@
 import codecs
 import datetime
-from decimal import Decimal
 import locale
+from decimal import Decimal
+try:
+    from types import NoneType
+except ImportError:
+    NoneType = type(None)
 from urllib.parse import quote
 
 from .functional import Promise
 
-class DjangoUnicodeDecodeError(UnicodeDecodeError):
-    def __init__(self, obj, *args):
-        self.obj = obj
-        UnicodeDecodeError.__init__(self, *args)
 
+class DjangoUnicodeDecodeError(UnicodeDecodeError):
     def __str__(self):
-        original = UnicodeDecodeError.__str__(self)
-        return '{}. You passed in {!r} ({})'.format(original, self.obj,
-                type(self.obj))
+        return "%s. You passed in %r (%s)" % (
+            super().__str__(),
+            self.object,
+            type(self.object),
+        )
+
 
-def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
+def smart_str(s, encoding="utf-8", strings_only=False, errors="strict"):
     """
-    Returns a text object representing 's' -- unicode on Python 2 and str on
-    Python 3. Treats bytestrings using the 'encoding' codec.
+    Return a string representing 's'. Treat bytestrings using the 'encoding'
+    codec.
 
     If strings_only is True, don't convert (some) non-string-like objects.
     """
     if isinstance(s, Promise):
         # The input is the result of a gettext_lazy() call.
         return s
-    return force_text(s, encoding, strings_only, errors)
+    return force_str(s, encoding, strings_only, errors)
+
+
+_PROTECTED_TYPES = (
+    NoneType,
+    int,
+    float,
+    Decimal,
+    datetime.datetime,
+    datetime.date,
+    datetime.time,
+)
+
 
 def is_protected_type(obj):
     """Determine if the object instance is of a protected type.
 
     Objects of protected types are preserved as-is when passed to
-    force_text(strings_only=True).
+    force_str(strings_only=True).
     """
-    return isinstance(obj, (int, ) + (type(None), float, Decimal,
-        datetime.datetime, datetime.date, datetime.time))
+    return isinstance(obj, _PROTECTED_TYPES)
 
-def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
+
+def force_str(s, encoding="utf-8", strings_only=False, errors="strict"):
     """
-    Similar to smart_text, except that lazy instances are resolved to
+    Similar to smart_str(), except that lazy instances are resolved to
     strings, rather than kept as lazy objects.
 
     If strings_only is True, don't convert (some) non-string-like objects.
     """
-    # Handle the common case first, saves 30-40% when s is an instance
-    # of str. This function gets called often in that setting.
-    if isinstance(s, str):
+    # Handle the common case first for performance reasons.
+    if issubclass(type(s), str):
         return s
     if strings_only and is_protected_type(s):
         return s
     try:
-        if not isinstance(s, str):
-            if hasattr(s, '__unicode__'):
-                s = s.__unicode__()
-            else:
-                try:
-                    if isinstance(s, bytes):
-                        s = str(s, encoding, errors)
-                    else:
-                        s = str(s)
-                except UnicodeEncodeError:
-                    if not isinstance(s, Exception):
-                        raise
-                    # If we get to here, the caller has passed in an Exception
-                    # subclass populated with non-ASCII data without special
-                    # handling to display as a string. We need to handle this
-                    # without raising a further exception. We do an
-                    # approximation to what the Exception's standard str()
-                    # output should be.
-                    s = ' '.join([force_text(arg, encoding, strings_only,
-                            errors) for arg in s])
+        if isinstance(s, bytes):
+            s = str(s, encoding, errors)
         else:
-            # Note: We use .decode() here, instead of str(s, encoding,
-            # errors), so that if s is a SafeBytes, it ends up being a
-            # SafeText at the end.
-            s = s.decode(encoding, errors)
+            s = str(s)
     except UnicodeDecodeError as e:
-        if not isinstance(s, Exception):
-            raise DjangoUnicodeDecodeError(s, *e.args)
-        else:
-            # If we get to here, the caller has passed in an Exception
-            # subclass populated with non-ASCII bytestring data without a
-            # working unicode method. Try to handle this without raising a
-            # further exception by individually forcing the exception args
-            # to unicode.
-            s = ' '.join([force_text(arg, encoding, strings_only,
-                    errors) for arg in s])
+        raise DjangoUnicodeDecodeError(*e.args) from None
     return s
 
-def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
+
+def smart_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
     """
-    Returns a bytestring version of 's', encoded as specified in 'encoding'.
+    Return a bytestring version of 's', encoded as specified in 'encoding'.
 
     If strings_only is True, don't convert (some) non-string-like objects.
     """
@@ -101,101 +87,179 @@ def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
     return force_bytes(s, encoding, strings_only, errors)
 
 
-def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
+def force_bytes(s, encoding="utf-8", strings_only=False, errors="strict"):
     """
     Similar to smart_bytes, except that lazy instances are resolved to
     strings, rather than kept as lazy objects.
 
     If strings_only is True, don't convert (some) non-string-like objects.
     """
+    # Handle the common case first for performance reasons.
     if isinstance(s, bytes):
-        if encoding == 'utf-8':
+        if encoding == "utf-8":
             return s
         else:
-            return s.decode('utf-8', errors).encode(encoding, errors)
-    if strings_only and (s is None or isinstance(s, int)):
+            return s.decode("utf-8", errors).encode(encoding, errors)
+    if strings_only and is_protected_type(s):
         return s
-    if isinstance(s, Promise):
-        return str.encode(encoding, errors)
-    if not isinstance(s, str):
-        try:
-            return str(s).encode(encoding)
-        except UnicodeEncodeError:
-            if isinstance(s, Exception):
-                # An Exception subclass containing non-ASCII data that doesn't
-                # know how to print itself properly. We shouldn't raise a
-                # further exception.
-                return ' '.join([smart_bytes(arg, encoding, strings_only,
-                        errors) for arg in s])
-            return str(s).encode(encoding, errors)
-    else:
-        return s.encode(encoding, errors)
-
-
-smart_str = smart_text
-force_str = force_text
-
-smart_str.__doc__ = """\
-Apply smart_text in Python 3 and smart_bytes in Python 2.
-
-This is suitable for writing to sys.stdout (for instance).
-"""
+    if isinstance(s, memoryview):
+        return bytes(s)
+    return str(s).encode(encoding, errors)
 
-force_str.__doc__ = """\
-Apply force_text in Python 3 and force_bytes in Python 2.
-"""
 
 def iri_to_uri(iri):
     """
     Convert an Internationalized Resource Identifier (IRI) portion to a URI
     portion that is suitable for inclusion in a URL.
 
-    This is the algorithm from section 3.1 of RFC 3987.  However, since we are
-    assuming input is either UTF-8 or unicode already, we can simplify things a
-    little from the full method.
+    This is the algorithm from RFC 3987 Section 3.1, slightly simplified since
+    the input is assumed to be a string rather than an arbitrary byte stream.
 
-    Returns an ASCII string containing the encoded result.
+    Take an IRI (string or UTF-8 bytes, e.g. '/I ♥ Django/' or
+    b'/I \xe2\x99\xa5 Django/') and return a string containing the encoded
+    result with ASCII chars only (e.g. '/I%20%E2%99%A5%20Django/').
     """
     # The list of safe characters here is constructed from the "reserved" and
-    # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
+    # "unreserved" characters specified in RFC 3986 Sections 2.2 and 2.3:
     #     reserved    = gen-delims / sub-delims
     #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
     #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
     #                   / "*" / "+" / "," / ";" / "="
     #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
-    # Of the unreserved characters, urllib.quote already considers all but
-    # the ~ safe.
+    # Of the unreserved characters, urllib.parse.quote() already considers all
+    # but the ~ safe.
     # The % character is also added to the list of safe characters here, as the
-    # end of section 3.1 of RFC 3987 specifically mentions that % must not be
+    # end of RFC 3987 Section 3.1 specifically mentions that % must not be
     # converted.
     if iri is None:
         return iri
-    return quote(smart_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
+    elif isinstance(iri, Promise):
+        iri = str(iri)
+    return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
 
-def filepath_to_uri(path):
-    """Convert an file system path to a URI portion that is suitable for
-    inclusion in a URL.
 
-    We are assuming input is either UTF-8 or unicode already.
+# List of byte values that uri_to_iri() decodes from percent encoding.
+# First, the unreserved characters from RFC 3986:
+_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
+_hextobyte = {
+    (fmt % char).encode(): bytes((char,))
+    for ascii_range in _ascii_ranges
+    for char in ascii_range
+    for fmt in ["%02x", "%02X"]
+}
+# And then everything above 128, because bytes ≥ 128 are part of multibyte
+# Unicode characters.
+_hexdig = "0123456789ABCDEFabcdef"
+_hextobyte.update(
+    {(a + b).encode(): bytes.fromhex(a + b) for a in _hexdig[8:] for b in _hexdig}
+)
+
 
-    This method will encode certain chars that would normally be recognized as
-    special chars for URIs.  Note that this method does not encode the '
-    character, as it is a valid character within URIs.  See
-    encodeURIComponent() JavaScript function for more details.
+def uri_to_iri(uri):
+    """
+    Convert a Uniform Resource Identifier(URI) into an Internationalized
+    Resource Identifier(IRI).
+
+    This is the algorithm from RFC 3987 Section 3.2, excluding step 4.
 
-    Returns an ASCII string containing the encoded result.
+    Take an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and return
+    a string containing the encoded result (e.g. '/I%20♥%20Django/').
+    """
+    if uri is None:
+        return uri
+    uri = force_bytes(uri)
+    # Fast selective unquote: First, split on '%' and then starting with the
+    # second block, decode the first 2 bytes if they represent a hex code to
+    # decode. The rest of the block is the part after '%AB', not containing
+    # any '%'. Add that to the output without further processing.
+    bits = uri.split(b"%")
+    if len(bits) == 1:
+        iri = uri
+    else:
+        parts = [bits[0]]
+        append = parts.append
+        hextobyte = _hextobyte
+        for item in bits[1:]:
+            hex = item[:2]
+            if hex in hextobyte:
+                append(hextobyte[item[:2]])
+                append(item[2:])
+            else:
+                append(b"%")
+                append(item)
+        iri = b"".join(parts)
+    return repercent_broken_unicode(iri).decode()
+
+
+def escape_uri_path(path):
+    """
+    Escape the unsafe characters from the path portion of a Uniform Resource
+    Identifier (URI).
+    """
+    # These are the "reserved" and "unreserved" characters specified in RFC
+    # 3986 Sections 2.2 and 2.3:
+    #   reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
+    #   unreserved  = alphanum | mark
+    #   mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
+    # The list of safe characters here is constructed subtracting ";", "=",
+    # and "?" according to RFC 3986 Section 3.3.
+    # The reason for not subtracting and escaping "/" is that we are escaping
+    # the entire path, not a path segment.
+    return quote(path, safe="/:@&+$,-_.!~*'()")
+
+
+def punycode(domain):
+    """Return the Punycode of the given domain if it's non-ASCII."""
+    return domain.encode("idna").decode("ascii")
+
+
+def repercent_broken_unicode(path):
+    """
+    As per RFC 3987 Section 3.2, step three of converting a URI into an IRI,
+    repercent-encode any octet produced that is not part of a strictly legal
+    UTF-8 octet sequence.
+    """
+    changed_parts = []
+    while True:
+        try:
+            path.decode()
+        except UnicodeDecodeError as e:
+            # CVE-2019-14235: A recursion shouldn't be used since the exception
+            # handling uses massive amounts of memory
+            repercent = quote(path[e.start : e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
+            changed_parts.append(path[: e.start] + repercent.encode())
+            path = path[e.end :]
+        else:
+            return b"".join(changed_parts) + path
+
+
+def filepath_to_uri(path):
+    """Convert a file system path to a URI portion that is suitable for
+    inclusion in a URL.
+
+    Encode certain chars that would normally be recognized as special chars
+    for URIs. Do not encode the ' character, as it is a valid character
+    within URIs. See the encodeURIComponent() JavaScript function for details.
     """
     if path is None:
         return path
     # I know about `os.sep` and `os.altsep` but I want to leave
     # some flexibility for hardcoding separators.
-    return quote(smart_bytes(path.replace("\\", "/")), safe=b"/~!*()'")
+    return quote(str(path).replace("\\", "/"), safe="/~!*()'")
 
-# The encoding of the default system locale but falls back to the
-# given fallback encoding if the encoding is unsupported by python or could
-# not be determined.  See tickets #10335 and #5846
-try:
-    DEFAULT_LOCALE_ENCODING = locale.getlocale()[1] or 'ascii'
-    codecs.lookup(DEFAULT_LOCALE_ENCODING)
-except:
-    DEFAULT_LOCALE_ENCODING = 'ascii'
+
+def get_system_encoding():
+    """
+    The encoding for the character type functions. Fallback to 'ascii' if the
+    #encoding is unsupported by Python or could not be determined. See tickets
+    #10335 and #5846.
+    """
+    try:
+        encoding = locale.getlocale()[1] or "ascii"
+        codecs.lookup(encoding)
+    except Exception:
+        encoding = "ascii"
+    return encoding
+
+
+DEFAULT_LOCALE_ENCODING = get_system_encoding()