1515
1616from synapse .rest .media .v1 .preview_url_resource import (
1717 decode_and_calc_og ,
18+ get_html_media_encoding ,
1819 summarize_paragraphs ,
1920)
2021
2627 lxml = None
2728
2829
29- class PreviewTestCase (unittest .TestCase ):
30+ class SummarizeTestCase (unittest .TestCase ):
3031 if not lxml :
3132 skip = "url preview feature requires lxml"
3233
@@ -144,12 +145,12 @@ def test_small_then_large_summarize(self):
144145 )
145146
146147
147- class PreviewUrlTestCase (unittest .TestCase ):
148+ class CalcOgTestCase (unittest .TestCase ):
148149 if not lxml :
149150 skip = "url preview feature requires lxml"
150151
151152 def test_simple (self ):
152- html = """
153+ html = b """
153154 <html>
154155 <head><title>Foo</title></head>
155156 <body>
@@ -163,7 +164,7 @@ def test_simple(self):
163164 self .assertEqual (og , {"og:title" : "Foo" , "og:description" : "Some text." })
164165
165166 def test_comment (self ):
166- html = """
167+ html = b """
167168 <html>
168169 <head><title>Foo</title></head>
169170 <body>
@@ -178,7 +179,7 @@ def test_comment(self):
178179 self .assertEqual (og , {"og:title" : "Foo" , "og:description" : "Some text." })
179180
180181 def test_comment2 (self ):
181- html = """
182+ html = b """
182183 <html>
183184 <head><title>Foo</title></head>
184185 <body>
@@ -202,7 +203,7 @@ def test_comment2(self):
202203 )
203204
204205 def test_script (self ):
205- html = """
206+ html = b """
206207 <html>
207208 <head><title>Foo</title></head>
208209 <body>
@@ -217,7 +218,7 @@ def test_script(self):
217218 self .assertEqual (og , {"og:title" : "Foo" , "og:description" : "Some text." })
218219
219220 def test_missing_title (self ):
220- html = """
221+ html = b """
221222 <html>
222223 <body>
223224 Some text.
@@ -230,7 +231,7 @@ def test_missing_title(self):
230231 self .assertEqual (og , {"og:title" : None , "og:description" : "Some text." })
231232
232233 def test_h1_as_title (self ):
233- html = """
234+ html = b """
234235 <html>
235236 <meta property="og:description" content="Some text."/>
236237 <body>
@@ -244,7 +245,7 @@ def test_h1_as_title(self):
244245 self .assertEqual (og , {"og:title" : "Title" , "og:description" : "Some text." })
245246
246247 def test_missing_title_and_broken_h1 (self ):
247- html = """
248+ html = b """
248249 <html>
249250 <body>
250251 <h1><a href="foo"/></h1>
@@ -258,13 +259,20 @@ def test_missing_title_and_broken_h1(self):
258259 self .assertEqual (og , {"og:title" : None , "og:description" : "Some text." })
259260
260261 def test_empty (self ):
261- html = ""
262+ """Test a body with no data in it."""
263+ html = b""
264+ og = decode_and_calc_og (html , "http://example.com/test.html" )
265+ self .assertEqual (og , {})
266+
267+ def test_no_tree (self ):
268+ """A valid body with no tree in it."""
269+ html = b"\x00 "
262270 og = decode_and_calc_og (html , "http://example.com/test.html" )
263271 self .assertEqual (og , {})
264272
265273 def test_invalid_encoding (self ):
266274 """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
267- html = """
275+ html = b """
268276 <html>
269277 <head><title>Foo</title></head>
270278 <body>
@@ -290,3 +298,76 @@ def test_invalid_encoding2(self):
290298 """
291299 og = decode_and_calc_og (html , "http://example.com/test.html" )
292300 self .assertEqual (og , {"og:title" : "ÿÿ Foo" , "og:description" : "Some text." })
301+
302+
class MediaEncodingTestCase(unittest.TestCase):
    """Tests for ``get_html_media_encoding``.

    Each test hands the function a raw (bytes) body plus a Content-Type
    value and checks the name of the character encoding that comes back.
    """

    def test_meta_charset(self):
        """A character encoding is found via the meta tag."""
        encoding = get_html_media_encoding(
            b"""
        <html>
        <head><meta charset="ascii">
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(encoding, "ascii")

        # A less well-formed version: stray whitespace inside the tag and an
        # unquoted attribute value.
        encoding = get_html_media_encoding(
            b"""
        <html>
        <head>< meta charset = ascii>
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(encoding, "ascii")

    def test_xml_encoding(self):
        """A character encoding is found via the XML declaration."""
        encoding = get_html_media_encoding(
            b"""
        <?xml version="1.0" encoding="ascii"?>
        <html>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(encoding, "ascii")

    def test_meta_xml_encoding(self):
        """Meta tags take precedence over XML encoding."""
        encoding = get_html_media_encoding(
            b"""
        <?xml version="1.0" encoding="ascii"?>
        <html>
        <head><meta charset="UTF-16">
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(encoding, "UTF-16")

    def test_content_type(self):
        """A character encoding is found via the Content-Type header."""
        # Test a few variations of the header: with and without quotes,
        # with and without a trailing semicolon, and with unbalanced quotes.
        headers = (
            'text/html; charset="ascii";',
            "text/html;charset=ascii;",
            'text/html; charset="ascii"',
            "text/html; charset=ascii",
            'text/html; charset="ascii;',
            'text/html; charset=ascii";',
        )
        for header in headers:
            # The body is empty, so the header is the only source of an
            # encoding here.
            encoding = get_html_media_encoding(b"", header)
            # Name the offending header on failure; otherwise every variant
            # produces the same, indistinguishable assertion error.
            self.assertEqual(encoding, "ascii", "header=%r" % (header,))

    def test_fallback(self):
        """A character encoding cannot be found in the body or header."""
        encoding = get_html_media_encoding(b"", "text/html")
        self.assertEqual(encoding, "utf-8")
0 commit comments