3030commentclose = re .compile (r'--!?>' )
3131commentabruptclose = re .compile (r'-?>' )
3232# Note:
33- # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
34- # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
33+ # 1) if you change tagfind/attrfind remember to update locatetagend too;
34+ # 2) if you change tagfind/attrfind and/or locatetagend the parser will
3535# explode, so don't do it.
36- # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
37- # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
38- tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*' )
39- attrfind_tolerant = re .compile (
40- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
41- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*' )
36+ # see the HTML5 specs section "13.2.5.6 Tag open state",
37+ # "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
38+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
39+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
40+ # https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
41+ tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*' )
42+ attrfind_tolerant = re .compile (r"""
43+ (
44+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
45+ )
46+ (= # value indicator
47+ ('[^']*' # LITA-enclosed value
48+ |"[^"]*" # LIT-enclosed value
49+ |(?!['"])[^>\t\n\r\f ]* # bare value
50+ )
51+ )?
52+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
53+ """ , re .VERBOSE )
54+ locatetagend = re .compile (r"""
55+ [a-zA-Z][^\t\n\r\f />]* # tag name
56+ [\t\n\r\f /]* # optional whitespace before attribute name
57+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
58+ (?:= # value indicator
59+ (?:'[^']*' # LITA-enclosed value
60+ |"[^"]*" # LIT-enclosed value
61+ |(?!['"])[^>\t\n\r\f ]* # bare value
62+ )
63+ )?
64+ [\t\n\r\f /]* # possibly followed by a space
65+ )*
66+ >?
67+ """ , re .VERBOSE )
68+ # The following variables are not used, but are temporarily left for
69+ # backward compatibility.
4270locatestarttagend_tolerant = re .compile (r"""
4371 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4472 (?:[\s/]* # optional whitespace before attribute name
5583 \s* # trailing whitespace
5684""" , re .VERBOSE )
5785endendtag = re .compile ('>' )
58- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
59- # </ and the tag name, so maybe this should be fixed
6086endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
6187
6288
@@ -123,7 +149,8 @@ def get_starttag_text(self):
123149
124150 def set_cdata_mode (self , elem ):
125151 self .cdata_elem = elem .lower ()
126- self .interesting = re .compile (r'</\s*%s\s*>' % self .cdata_elem , re .I )
152+ self .interesting = re .compile (r'</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
153+ re .IGNORECASE | re .ASCII )
127154
128155 def clear_cdata_mode (self ):
129156 self .interesting = interesting_normal
@@ -148,7 +175,7 @@ def goahead(self, end):
148175 # & near the end and see if it's followed by a space or ;.
149176 amppos = rawdata .rfind ('&' , max (i , n - 34 ))
150177 if (amppos >= 0 and
151- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
178+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
152179 break # wait till we get all the text
153180 j = n
154181 else :
@@ -261,7 +288,7 @@ def goahead(self, end):
261288 else :
262289 assert 0 , "interesting.search() lied"
263290 # end while
264- if end and i < n and not self . cdata_elem :
291+ if end and i < n :
265292 if self .convert_charrefs and not self .cdata_elem :
266293 self .handle_data (unescape (rawdata [i :n ]))
267294 else :
@@ -307,7 +334,7 @@ def parse_comment(self, i, report=True):
307334 return match .end ()
308335
309336 # Internal -- parse bogus comment, return length or -1 if not terminated
310- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
337+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
311338 def parse_bogus_comment (self , i , report = 1 ):
312339 rawdata = self .rawdata
313340 assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -333,6 +360,8 @@ def parse_pi(self, i):
333360
334361 # Internal -- handle starttag, return end or -1 if not terminated
335362 def parse_starttag (self , i ):
363+ # See the HTML5 specs section "13.2.5.8 Tag name state"
364+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
336365 self .__starttag_text = None
337366 endpos = self .check_for_whole_start_tag (i )
338367 if endpos < 0 :
@@ -378,76 +407,42 @@ def parse_starttag(self, i):
378407 # or -1 if incomplete.
379408 def check_for_whole_start_tag (self , i ):
380409 rawdata = self .rawdata
381- m = locatestarttagend_tolerant .match (rawdata , i )
382- if m :
383- j = m .end ()
384- next = rawdata [j :j + 1 ]
385- if next == ">" :
386- return j + 1
387- if next == "/" :
388- if rawdata .startswith ("/>" , j ):
389- return j + 2
390- if rawdata .startswith ("/" , j ):
391- # buffer boundary
392- return - 1
393- # else bogus input
394- if j > i :
395- return j
396- else :
397- return i + 1
398- if next == "" :
399- # end of input
400- return - 1
401- if next in ("abcdefghijklmnopqrstuvwxyz=/"
402- "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ):
403- # end of input in or before attribute value, or we have the
404- # '/' from a '/>' ending
405- return - 1
406- if j > i :
407- return j
408- else :
409- return i + 1
410- raise AssertionError ("we should not get here!" )
410+ match = locatetagend .match (rawdata , i + 1 )
411+ assert match
412+ j = match .end ()
413+ if rawdata [j - 1 ] != ">" :
414+ return - 1
415+ return j
411416
412417 # Internal -- parse endtag, return end or -1 if incomplete
413418 def parse_endtag (self , i ):
419+ # See the HTML5 specs section "13.2.5.7 End tag open state"
420+ # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
414421 rawdata = self .rawdata
415422 assert rawdata [i :i + 2 ] == "</" , "unexpected call to parse_endtag"
416- match = endendtag .search (rawdata , i + 1 ) # >
417- if not match :
423+ if rawdata .find ('>' , i + 2 ) < 0 : # fast check
418424 return - 1
419- gtpos = match .end ()
420- match = endtagfind .match (rawdata , i ) # </ + tag + >
421- if not match :
422- if self .cdata_elem is not None :
423- self .handle_data (rawdata [i :gtpos ])
424- return gtpos
425- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
426- namematch = tagfind_tolerant .match (rawdata , i + 2 )
427- if not namematch :
428- # w3.org/TR/html5/tokenization.html#end-tag-open-state
429- if rawdata [i :i + 3 ] == '</>' :
430- return i + 3
431- else :
432- return self .parse_bogus_comment (i )
433- tagname = namematch .group (1 ).lower ()
434- # consume and ignore other stuff between the name and the >
435- # Note: this is not 100% correct, since we might have things like
436- # </tag attr=">">, but looking for > after the name should cover
437- # most of the cases and is much simpler
438- gtpos = rawdata .find ('>' , namematch .end ())
439- self .handle_endtag (tagname )
440- return gtpos + 1
425+ if not endtagopen .match (rawdata , i ): # </ + letter
426+ if rawdata [i + 2 :i + 3 ] == '>' : # </> is ignored
427+ # "missing-end-tag-name" parser error
428+ return i + 3
429+ else :
430+ return self .parse_bogus_comment (i )
441431
442- elem = match . group ( 1 ). lower () # script or style
443- if self . cdata_elem is not None :
444- if elem != self . cdata_elem :
445- self . handle_data ( rawdata [i : gtpos ])
446- return gtpos
432+ match = locatetagend . match ( rawdata , i + 2 )
433+ assert match
434+ j = match . end ()
435+ if rawdata [j - 1 ] != ">" :
436+ return - 1
447437
448- self .handle_endtag (elem )
438+ # find the name: "13.2.5.8 Tag name state"
439+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
440+ match = tagfind_tolerant .match (rawdata , i + 2 )
441+ assert match
442+ tag = match .group (1 ).lower ()
443+ self .handle_endtag (tag )
449444 self .clear_cdata_mode ()
450- return gtpos
445+ return j
451446
452447 # Overridable -- finish processing of start+end tag: <tag.../>
453448 def handle_startendtag (self , tag , attrs ):
0 commit comments