This repository was archived by the owner on Apr 26, 2024. It is now read-only.
File tree Expand file tree Collapse file tree 3 files changed +18
-2
lines changed Expand file tree Collapse file tree 3 files changed +18
-2
lines changed Original file line number Diff line number Diff line change 1+ Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
Original file line number Diff line number Diff line change 5858
5959logger = logging .getLogger (__name__ )
6060
61- _charset_match = re .compile (br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?' , flags = re .I )
61+ _charset_match = re .compile (
62+ br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?' , flags = re .I
63+ )
6264_xml_encoding_match = re .compile (
63- br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9 -]+)"' , flags = re .I
65+ br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_ -]+)"' , flags = re .I
6466)
6567_content_type_match = re .compile (r'.*; *charset="?(.*?)"?(;|$)' , flags = re .I )
6668
Original file line number Diff line number Diff line change @@ -325,6 +325,19 @@ def test_meta_charset(self):
325325 )
326326 self .assertEqual (encoding , "ascii" )
327327
328+ def test_meta_charset_underscores (self ):
329+ """A character encoding containing an underscore is found via the meta tag."""
330+ encoding = get_html_media_encoding (
331+ b"""
332+ <html>
333+ <head><meta charset="Shift_JIS">
334+ </head>
335+ </html>
336+ """ ,
337+ "text/html" ,
338+ )
339+ self .assertEqual (encoding , "Shift_JIS" )
340+
328341 def test_xml_encoding (self ):
329342 """A character encoding is found via the XML declaration."""
330343 encoding = get_html_media_encoding (
You can’t perform that action at this time.
0 commit comments