@@ -348,18 +348,16 @@ def test_convert_charrefs(self):
348348 collector = lambda : EventCollectorCharrefs ()
349349 self .assertTrue (collector ().convert_charrefs )
350350 charrefs = ['&quot;' , '&#34;' , '&#x22;' , '&quot' , '&#34' , '&#x22' ]
351- # check charrefs in the middle of the text/attributes
352- expected = [('starttag' , 'a' , [('href' , 'foo"zar' )]),
353- ('data' , 'a"z' ), ('endtag' , 'a' )]
351+ # check charrefs in the middle of the text
352+ expected = [('starttag' , 'a' , []), ('data' , 'a"z' ), ('endtag' , 'a' )]
354353 for charref in charrefs :
355- self ._run_check ('<a href="foo{0}zar" >a{0}z</a>' .format (charref ),
354+ self ._run_check ('<a>a{0}z</a>' .format (charref ),
356355 expected , collector = collector ())
357- # check charrefs at the beginning/end of the text/attributes
358- expected = [('data' , '"' ),
359- ('starttag' , 'a' , [('x' , '"' ), ('y' , '"X' ), ('z' , 'X"' )]),
356+ # check charrefs at the beginning/end of the text
357+ expected = [('data' , '"' ), ('starttag' , 'a' , []),
360358 ('data' , '"' ), ('endtag' , 'a' ), ('data' , '"' )]
361359 for charref in charrefs :
362- self ._run_check ('{0}<a x="{0}" y="{0}X" z="X{0}" >'
360+ self ._run_check ('{0}<a>'
363361 '{0}</a>{0}' .format (charref ),
364362 expected , collector = collector ())
365363 # check charrefs in <script>/<style> elements
@@ -382,6 +380,35 @@ def test_convert_charrefs(self):
382380 self ._run_check ('no charrefs here' , [('data' , 'no charrefs here' )],
383381 collector = collector ())
384382
383+ def test_convert_charrefs_in_attribute_values (self ):
384+ # default value for convert_charrefs is now True
385+ collector = lambda : EventCollectorCharrefs ()
386+ self .assertTrue (collector ().convert_charrefs )
387+
388+ # always unescape terminated entity refs, numeric and hex char refs:
389+ # - regardless whether they are at start, middle, end of attribute
390+ # - or followed by alphanumeric, non-alphanumeric, or equals char
391+ charrefs = ['&cent;' , '&#xa2;' , '&#xa2' , '&#162;' , '&#162' ]
392+ expected = [('starttag' , 'a' ,
393+ [('x' , '¢' ), ('x' , 'z¢' ), ('x' , '¢z' ),
394+ ('x' , 'z¢z' ), ('x' , '¢ z' ), ('x' , '¢=z' )]),
395+ ('endtag' , 'a' )]
396+ for charref in charrefs :
397+ self ._run_check ('<a x="{0}" x="z{0}" x="{0}z" '
398+ ' x="z{0}z" x="{0} z" x="{0}=z"></a>'
399+ .format (charref ), expected , collector = collector ())
400+
401+ # only unescape unterminated entity matches if they are not followed by
402+ # an alphanumeric or an equals sign
403+ charref = '&cent'
404+ expected = [('starttag' , 'a' ,
405+ [('x' , '¢' ), ('x' , 'z¢' ), ('x' , '&centz' ),
406+ ('x' , 'z&centz' ), ('x' , '¢ z' ), ('x' , '&cent=z' )]),
407+ ('endtag' , 'a' )]
408+ self ._run_check ('<a x="{0}" x="z{0}" x="{0}z" '
409+ ' x="z{0}z" x="{0} z" x="{0}=z"></a>'
410+ .format (charref ), expected , collector = collector ())
411+
385412 # the remaining tests were for the "tolerant" parser (which is now
386413 # the default), and check various kind of broken markup
387414 def test_tolerant_parsing (self ):
0 commit comments