Skip to content

Commit 10bc3ab

Browse files
committed
Vendorize webencodings, it is a very small library, it hasn't been updated in more than 7 years, we tweak it slightly in this codebase to enhance its compatibility with Python 3.8+
1 parent 76b7eb0 commit 10bc3ab

File tree

11 files changed

+1135
-6
lines changed

11 files changed

+1135
-6
lines changed

html5lib/_inputstream.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import re
77
from io import BytesIO, StringIO
88

9-
import webencodings
9+
from .contrib import webencodings
1010

1111
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
1212
from .constants import _ReparseException

html5lib/contrib/__init__.py

Whitespace-only changes.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Copyright (c) 2012 by Simon Sapin.
2+
3+
Some rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions are
7+
met:
8+
9+
* Redistributions of source code must retain the above copyright
10+
notice, this list of conditions and the following disclaimer.
11+
12+
* Redistributions in binary form must reproduce the above
13+
copyright notice, this list of conditions and the following
14+
disclaimer in the documentation and/or other materials provided
15+
with the distribution.
16+
17+
* The names of the contributors may not be used to endorse or
18+
promote products derived from this software without specific
19+
prior written permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
# coding: utf-8
2+
"""
3+
4+
webencodings
5+
~~~~~~~~~~~~
6+
7+
This is a Python implementation of the `WHATWG Encoding standard
8+
<http://encoding.spec.whatwg.org/>`_. See README for details.
9+
10+
:copyright: Copyright 2012 by Simon Sapin
11+
:license: BSD, see LICENSE for details.
12+
13+
"""
14+
15+
import codecs
16+
17+
from .labels import LABELS
18+
19+
20+
VERSION = '0.6-dev'


# Names from the Encoding standard that Python's codec registry does not
# accept as aliases, mapped to the codec name to ask Python for instead.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects already built by lookup(), keyed by canonical name.
CACHE = {}
31+
32+
33+
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affects non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Lone surrogate code points are passed through unchanged instead of
    raising :exc:`UnicodeEncodeError`.

    """
    # bytes.lower() only changes the ASCII letters, and every byte of a
    # multi-byte UTF-8 sequence is >= 0x80, so non-ASCII text is untouched.
    # This turns out to be faster than unicode.translate().
    # 'surrogatepass' lets lone surrogates survive the round trip; with
    # the default 'strict' handler they would make encode() raise.
    utf8_bytes = string.encode('utf8', 'surrogatepass')
    return utf8_bytes.lower().decode('utf8', 'surrogatepass')
57+
58+
59+
def lookup(label):
    """
    Find the encoding that a label refers to.

    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only ASCII whitespace is stripped: U+0009, U+000A, U+000C, U+000D
    # and U+0020. A bare strip() would also remove other Unicode spaces.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    canonical = LABELS.get(normalized)
    if canonical is None:
        return None

    cached = CACHE.get(canonical)
    if cached is not None:
        return cached

    if canonical == 'x-user-defined':
        # Implemented by a sibling module rather than the stdlib registry.
        from .x_user_defined import codec_info
    else:
        # Any canonical name that gets this far should resolve to a codec
        # once the Python-specific renames have been applied.
        codec_info = codecs.lookup(PYTHON_NAMES.get(canonical, canonical))

    created = Encoding(canonical, codec_info)
    CACHE[canonical] = created
    return created
87+
88+
89+
def _get_encoding(encoding_or_label):
90+
"""
91+
Accept either an encoding object or label.
92+
93+
:param encoding: An :class:`Encoding` object or a label string.
94+
:returns: An :class:`Encoding` object.
95+
:raises: :exc:`~exceptions.LookupError` for an unknown label.
96+
97+
"""
98+
if hasattr(encoding_or_label, 'codec_info'):
99+
return encoding_or_label
100+
101+
encoding = lookup(encoding_or_label)
102+
if encoding is None:
103+
raise LookupError('Unknown encoding label: %r' % encoding_or_label)
104+
return encoding
105+
106+
107+
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """

    def __init__(self, name, codec_info):
        # Both values are stored as given; nothing is validated here.
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        return '<Encoding %s>' % self.name
128+
129+
130+
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# The two UTF-16 byte orders, looked up once at import time; these are the
# objects _detect_bom() returns when it finds a UTF-16 byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
135+
136+
137+
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback first so that an invalid label fails early,
    # even when a BOM would have made the fallback unnecessary.
    fallback = _get_encoding(fallback_encoding)
    sniffed, payload = _detect_bom(input)
    # A BOM, when present, takes precedence over the caller's choice.
    chosen = sniffed or fallback
    text = chosen.codec_info.decode(payload, errors)[0]
    return text, chosen
157+
158+
159+
def _detect_bom(input):
160+
"""Return (bom_encoding, input), with any BOM removed from the input."""
161+
if input.startswith(b'\xFF\xFE'):
162+
return _UTF16LE, input[2:]
163+
if input.startswith(b'\xFE\xFF'):
164+
return _UTF16BE, input[2:]
165+
if input.startswith(b'\xEF\xBB\xBF'):
166+
return UTF8, input[3:]
167+
return None, input
168+
169+
170+
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec = _get_encoding(encoding).codec_info
    # The codec returns an (output, length consumed) pair; only the
    # output is of interest here.
    return codec.encode(input, errors)[0]
182+
183+
184+
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The generator's first item is the encoding in use; everything it
    # yields after that is decoded text, so it doubles as the output.
    detected = next(chunks)
    return chunks, detected
210+
211+
212+
def _iter_decode_generator(input, decoder):
213+
"""Return a generator that first yields the :obj:`Encoding`,
214+
then yields output chukns as Unicode strings.
215+
216+
"""
217+
decode = decoder.decode
218+
input = iter(input)
219+
for chunck in input:
220+
output = decode(chunck)
221+
if output:
222+
assert decoder.encoding is not None
223+
yield decoder.encoding
224+
yield output
225+
break
226+
else:
227+
# Input exhausted without determining the encoding
228+
output = decode(b'', final=True)
229+
assert decoder.encoding is not None
230+
yield decoder.encoding
231+
if output:
232+
yield output
233+
return
234+
235+
for chunck in input:
236+
output = decode(chunck)
237+
if output:
238+
yield output
239+
output = decode(b'', final=True)
240+
if output:
241+
yield output
242+
243+
244+
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Build the encoder here rather than inside the generator, so that an
    # invalid label fails at call time and not on first iteration.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)
258+
259+
260+
def _iter_encode_generator(input, encode):
261+
for chunck in input:
262+
output = encode(chunck)
263+
if output:
264+
yield output
265+
output = encode('', final=True)
266+
if output:
267+
yield output
268+
269+
270+
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """

    def __init__(self, fallback_encoding, errors='replace'):
        # Resolve the label now so that an invalid one fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still unknown whether a BOM follows.
        self._buffer = b''
        # Bound ``decode`` of the stdlib incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (I.e. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        if self._decoder is not None:
            # The encoding is already settled: delegate directly.
            return self._decoder(input, final)

        pending = self._buffer + input
        detected, payload = _detect_bom(pending)
        if detected is None:
            if not final and len(payload) < 3:
                # Fewer than three bytes so far (the longest BOM checked
                # for): keep them and wait for more input.
                self._buffer = payload
                return ''
            # No BOM: use the encoding the caller asked for.
            detected = self._fallback_encoding

        make_decoder = detected.codec_info.incrementaldecoder
        chosen = make_decoder(self._errors).decode
        self._decoder = chosen
        self.encoding = detected
        return chosen(payload, final)
319+
320+
321+
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """

    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is the bound method of the stdlib incremental encoder,
        # stored on the instance instead of being defined on this class.
        self.encode = codec_info.incrementalencoder(errors).encode

0 commit comments

Comments
 (0)