# regexp for parsing HTTP meta tags
# Generic `name = value` attribute matcher; the value may be wrapped in
# optional single or double quotes.
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''

# Lazily skips zero or more attributes at the start of a <meta ...> tag so
# the charset-related attributes no longer have to come first.
# This fragment is written in verbose regex syntax (layout whitespace and
# `#` comments) and therefore only works inside a pattern compiled in verbose
# mode.  The flag is deliberately NOT embedded here: an inline global flag in
# the middle of a pattern is rejected by Python 3.11+, so the enclosing
# pattern carries "(?x)" at its very start instead.
_SKIP_ATTRS = r'''(?:\s+
    [^=<>/\s"'\x00-\x1f\x7f]+  # Attribute name
    (?:\s*=\s*
    (?:  # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
        '[^']*'   # attr in '
        |
        "[^"]*"   # attr in "
        |
        [^'"\s]+  # attr having no ' nor "
    ))?
)*?'''

# Attribute matchers built from _TEMPLATE.  They contain neither literal
# whitespace nor `#`, so verbose mode does not change their meaning.
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
# The leading "(?x)" switches the whole pattern to verbose mode, which the
# _SKIP_ATTRS fragment requires; keeping it inside the pattern string means
# the str and bytes variants below both pick it up without extra flags.
_BODY_ENCODING_PATTERN = r'(?x)<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
    _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)