2929piclose = re .compile ('>' )
3030commentclose = re .compile (r'--\s*>' )
3131# Note:
32- # 1) if you change tagfind/attrfind remember to update locatestarttagend too;
33- # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
32+ # 1) if you change tagfind/attrfind remember to update locatetagend too;
33+ # 2) if you change tagfind/attrfind and/or locatetagend the parser will
3434# explode, so don't do it.
35- # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
36- # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37- tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*' )
38- attrfind_tolerant = re .compile (
39- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
40- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*' )
35+ # see the HTML5 specs section "13.2.5.6 Tag open state",
36+ # "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
37+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
38+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
39+ # https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
40+ tagfind_tolerant = re .compile (r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*' )
41+ attrfind_tolerant = re .compile (r"""
42+ (
43+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
44+ )
45+ (= # value indicator
46+ ('[^']*' # LITA-enclosed value
47+ |"[^"]*" # LIT-enclosed value
48+ |(?!['"])[^>\t\n\r\f ]* # bare value
49+ )
50+ )?
51+ (?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
52+ """ , re .VERBOSE )
53+ locatetagend = re .compile (r"""
54+ [a-zA-Z][^\t\n\r\f />]* # tag name
55+ [\t\n\r\f /]* # optional whitespace before attribute name
56+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
57+ (?:= # value indicator
58+ (?:'[^']*' # LITA-enclosed value
59+ |"[^"]*" # LIT-enclosed value
60+ |(?!['"])[^>\t\n\r\f ]* # bare value
61+ )
62+ )?
63+ [\t\n\r\f /]* # possibly followed by a space
64+ )*
65+ >?
66+ """ , re .VERBOSE )
67+ # The following variables are not used, but are temporarily left for
68+ # backward compatibility.
4169locatestarttagend_tolerant = re .compile (r"""
4270 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4371 (?:[\s/]* # optional whitespace before attribute name
5482 \s* # trailing whitespace
5583""" , re .VERBOSE )
5684endendtag = re .compile ('>' )
57- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
58- # </ and the tag name, so maybe this should be fixed
5985endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
6086
6187
@@ -122,7 +148,8 @@ def get_starttag_text(self):
122148
123149 def set_cdata_mode (self , elem ):
124150 self .cdata_elem = elem .lower ()
125- self .interesting = re .compile (r'</\s*%s\s*>' % self .cdata_elem , re .I )
151+ self .interesting = re .compile (r'</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
152+ re .IGNORECASE | re .ASCII )
126153
127154 def clear_cdata_mode (self ):
128155 self .interesting = interesting_normal
@@ -147,7 +174,7 @@ def goahead(self, end):
147174 # & near the end and see if it's followed by a space or ;.
148175 amppos = rawdata .rfind ('&' , max (i , n - 34 ))
149176 if (amppos >= 0 and
150- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
177+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
151178 break # wait till we get all the text
152179 j = n
153180 else :
@@ -260,7 +287,7 @@ def goahead(self, end):
260287 else :
261288 assert 0 , "interesting.search() lied"
262289 # end while
263- if end and i < n and not self . cdata_elem :
290+ if end and i < n :
264291 if self .convert_charrefs and not self .cdata_elem :
265292 self .handle_data (unescape (rawdata [i :n ]))
266293 else :
@@ -291,7 +318,7 @@ def parse_html_declaration(self, i):
291318 return self .parse_bogus_comment (i )
292319
293320 # Internal -- parse bogus comment, return length or -1 if not terminated
294- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
321+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
295322 def parse_bogus_comment (self , i , report = 1 ):
296323 rawdata = self .rawdata
297324 assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -317,6 +344,8 @@ def parse_pi(self, i):
317344
318345 # Internal -- handle starttag, return end or -1 if not terminated
319346 def parse_starttag (self , i ):
347+ # See the HTML5 specs section "13.2.5.8 Tag name state"
348+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
320349 self .__starttag_text = None
321350 endpos = self .check_for_whole_start_tag (i )
322351 if endpos < 0 :
@@ -362,76 +391,42 @@ def parse_starttag(self, i):
362391 # or -1 if incomplete.
363392 def check_for_whole_start_tag (self , i ):
364393 rawdata = self .rawdata
365- m = locatestarttagend_tolerant .match (rawdata , i )
366- if m :
367- j = m .end ()
368- next = rawdata [j :j + 1 ]
369- if next == ">" :
370- return j + 1
371- if next == "/" :
372- if rawdata .startswith ("/>" , j ):
373- return j + 2
374- if rawdata .startswith ("/" , j ):
375- # buffer boundary
376- return - 1
377- # else bogus input
378- if j > i :
379- return j
380- else :
381- return i + 1
382- if next == "" :
383- # end of input
384- return - 1
385- if next in ("abcdefghijklmnopqrstuvwxyz=/"
386- "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ):
387- # end of input in or before attribute value, or we have the
388- # '/' from a '/>' ending
389- return - 1
390- if j > i :
391- return j
392- else :
393- return i + 1
394- raise AssertionError ("we should not get here!" )
394+ match = locatetagend .match (rawdata , i + 1 )
395+ assert match
396+ j = match .end ()
397+ if rawdata [j - 1 ] != ">" :
398+ return - 1
399+ return j
395400
396401 # Internal -- parse endtag, return end or -1 if incomplete
397402 def parse_endtag (self , i ):
403+ # See the HTML5 specs section "13.2.5.7 End tag open state"
404+ # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
398405 rawdata = self .rawdata
399406 assert rawdata [i :i + 2 ] == "</" , "unexpected call to parse_endtag"
400- match = endendtag .search (rawdata , i + 1 ) # >
401- if not match :
407+ if rawdata .find ('>' , i + 2 ) < 0 : # fast check
402408 return - 1
403- gtpos = match .end ()
404- match = endtagfind .match (rawdata , i ) # </ + tag + >
405- if not match :
406- if self .cdata_elem is not None :
407- self .handle_data (rawdata [i :gtpos ])
408- return gtpos
409- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
410- namematch = tagfind_tolerant .match (rawdata , i + 2 )
411- if not namematch :
412- # w3.org/TR/html5/tokenization.html#end-tag-open-state
413- if rawdata [i :i + 3 ] == '</>' :
414- return i + 3
415- else :
416- return self .parse_bogus_comment (i )
417- tagname = namematch .group (1 ).lower ()
418- # consume and ignore other stuff between the name and the >
419- # Note: this is not 100% correct, since we might have things like
420- # </tag attr=">">, but looking for > after the name should cover
421- # most of the cases and is much simpler
422- gtpos = rawdata .find ('>' , namematch .end ())
423- self .handle_endtag (tagname )
424- return gtpos + 1
409+ if not endtagopen .match (rawdata , i ): # </ + letter
410+ if rawdata [i + 2 :i + 3 ] == '>' : # </> is ignored
411+ # "missing-end-tag-name" parser error
412+ return i + 3
413+ else :
414+ return self .parse_bogus_comment (i )
425415
426- elem = match . group ( 1 ). lower () # script or style
427- if self . cdata_elem is not None :
428- if elem != self . cdata_elem :
429- self . handle_data ( rawdata [i : gtpos ])
430- return gtpos
416+ match = locatetagend . match ( rawdata , i + 2 )
417+ assert match
418+ j = match . end ()
419+ if rawdata [j - 1 ] != ">" :
420+ return - 1
431421
432- self .handle_endtag (elem )
422+ # find the name: "13.2.5.8 Tag name state"
423+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
424+ match = tagfind_tolerant .match (rawdata , i + 2 )
425+ assert match
426+ tag = match .group (1 ).lower ()
427+ self .handle_endtag (tag )
433428 self .clear_cdata_mode ()
434- return gtpos
429+ return j
435430
436431 # Overridable -- finish processing of start+end tag: <tag.../>
437432 def handle_startendtag (self , tag , attrs ):
0 commit comments