Skip to content

Commit cc6d7df

Browse files
committed
Merge pull request #42 from Digenis/detect_encoding_from_separate_charset_attr
[MRG+2] Detect encoding when specified as a separate attribute in <meta>
2 parents 8691272 + f7f48f8 commit cc6d7df

File tree

3 files changed

+20
-2
lines changed

3 files changed

+20
-2
lines changed

NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ Changes to safe_url_string:
1313

1414
Package is now properly marked as ``zip_safe``.
1515

16+
html_body_declared_encoding also detects encoding
17+
when charset is not the sole attribute in <meta>
18+
1619
1.13.0 (2015-11-05)
1720
-------------------
1821

tests/test_encoding.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@ class RequestEncodingTests(unittest.TestCase):
88
# Content-Type as meta http-equiv
99
b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
1010
b"""\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
11+
b"""<meta http-equiv="Content-Type" content="text/html" charset="utf-8">""",
12+
b"""<meta http-equiv=Content-Type content="text/html" charset='utf-8'>""",
13+
b"""<meta http-equiv="Content-Type" content\t=\n"text/html" charset\t="utf-8">""",
1114
b"""<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
1215
b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
1316
# html5 meta charset
1417
b"""<meta charset="utf-8">""",
18+
b"""<meta charset =\n"utf-8">""",
1519
# xml encoding
1620
b"""<?xml version="1.0" encoding="utf-8"?>""",
1721
]

w3lib/encoding.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,25 @@ def http_content_type_encoding(content_type):
2222

2323
# regexp for parsing HTTP meta tags
2424
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
25+
_SKIP_ATTRS = '''(?x)(?:\\s+
26+
[^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name
27+
(?:\\s*=\\s*
28+
(?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
29+
'[^']*' # attr in '
30+
|
31+
"[^"]*" # attr in "
32+
|
33+
[^'"\\s]+ # attr having no ' nor "
34+
))?
35+
)*?'''
2536
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
2637
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
2738
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
2839
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
2940

3041
# check for meta tags or an XML declaration, and stop the search if a body tag is encountered
31-
_BODY_ENCODING_PATTERN = r'<\s*(?:meta(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % \
32-
(_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
42+
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
43+
_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
3344
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
3445
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)
3546

0 commit comments

Comments
 (0)