Skip to content

Commit 5d088f0

Browse files
committed
Fix charset detection when meta has many attrs
1 parent 1970e87 commit 5d088f0

File tree

1 file changed

+13
-2
lines changed

1 file changed

+13
-2
lines changed

w3lib/encoding.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,25 @@ def http_content_type_encoding(content_type):
2222

2323
# Regexp building blocks for sniffing the encoding declared inside a document
# body (HTML <meta> tags or the XML declaration).

# Matches ``name = value`` where the value may be wrapped in optional quotes.
# Filled in with (attribute name, value sub-pattern).
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''

# Lazily skips any number of unrelated attributes that precede the ones we
# care about inside a <meta> tag.
#
# NOTE: written in verbose syntax, so every pattern embedding it MUST be
# compiled with re.VERBOSE.  The flag is deliberately passed to re.compile()
# instead of being embedded as an inline ``(?x)`` here: this fragment is
# spliced into the *middle* of _BODY_ENCODING_PATTERN, and a global inline
# flag that is not at the very start of the expression is deprecated since
# Python 3.6 and raises re.error on Python 3.11+.
_SKIP_ATTRS = '''(?:\\s+
    [^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name
    (?:\\s*=\\s*
    (?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
        '[^']*' # attr in '
        |
        "[^"]*" # attr in "
        |
        [^'"\\s]+ # attr having no ' nor "
    ))?
)*?'''

# <meta http-equiv="Content-Type" ...>
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
# <meta ... content="<mime>; charset=<charset>">
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
# <meta charset="<charset2>">
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
# <?xml ... encoding="<xmlcharset>"?>
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
    _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
# re.VERBOSE is required because _SKIP_ATTRS uses verbose syntax (see above).
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I | re.VERBOSE)
3546

0 commit comments

Comments
 (0)