@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
110110 """
111111
112112 CDATA_CONTENT_ELEMENTS = ("script" , "style" )
113+ RCDATA_CONTENT_ELEMENTS = ("textarea" , "title" )
113114
114115 def __init__ (self , * , convert_charrefs = True ):
115116 """Initialize and reset this instance.
@@ -126,6 +127,8 @@ def reset(self):
126127 self .lasttag = '???'
127128 self .interesting = interesting_normal
128129 self .cdata_elem = None
130+ self ._support_cdata = True
131+ self ._escapable = True
129132 _markupbase .ParserBase .reset (self )
130133
131134 def feed (self , data ):
@@ -147,14 +150,33 @@ def get_starttag_text(self):
147150 """Return full source of start tag: '<...>'."""
148151 return self .__starttag_text
149152
150- def set_cdata_mode (self , elem ):
153+ def set_cdata_mode (self , elem , * , escapable = False ):
151154 self .cdata_elem = elem .lower ()
152- self .interesting = re .compile (r'</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
153- re .IGNORECASE | re .ASCII )
155+ self ._escapable = escapable
156+ if escapable and not self .convert_charrefs :
157+ self .interesting = re .compile (r'&|</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
158+ re .IGNORECASE | re .ASCII )
159+ else :
160+ self .interesting = re .compile (r'</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
161+ re .IGNORECASE | re .ASCII )
154162
155163 def clear_cdata_mode (self ):
156164 self .interesting = interesting_normal
157165 self .cdata_elem = None
166+ self ._escapable = True
167+
168+ def _set_support_cdata (self , flag = True ):
169+ """Enable or disable support of the CDATA sections.
170+ If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
171+ If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
172+
173+ This method is not called by default. Its purpose is to be called
174+ in custom handle_starttag() and handle_endtag() methods, with
175+ a value that depends on the adjusted current node.
176+ See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
177+ for details.
178+ """
179+ self ._support_cdata = flag
158180
159181 # Internal -- handle data as far as reasonable. May leave state
160182 # and data to be processed by a subsequent call. If 'end' is
@@ -187,7 +209,7 @@ def goahead(self, end):
187209 break
188210 j = n
189211 if i < j :
190- if self .convert_charrefs and not self .cdata_elem :
212+ if self .convert_charrefs and self ._escapable :
191213 self .handle_data (unescape (rawdata [i :j ]))
192214 else :
193215 self .handle_data (rawdata [i :j ])
@@ -230,7 +252,7 @@ def goahead(self, end):
230252 j -= len (suffix )
231253 break
232254 self .handle_comment (rawdata [i + 4 :j ])
233- elif startswith ("<![CDATA[" , i ):
255+ elif startswith ("<![CDATA[" , i ) and self . _support_cdata :
234256 self .unknown_decl (rawdata [i + 3 :])
235257 elif rawdata [i :i + 9 ].lower () == '<!doctype' :
236258 self .handle_decl (rawdata [i + 2 :])
@@ -289,7 +311,7 @@ def goahead(self, end):
289311 assert 0 , "interesting.search() lied"
290312 # end while
291313 if end and i < n :
292- if self .convert_charrefs and not self .cdata_elem :
314+ if self .convert_charrefs and self ._escapable :
293315 self .handle_data (unescape (rawdata [i :n ]))
294316 else :
295317 self .handle_data (rawdata [i :n ])
@@ -306,15 +328,28 @@ def parse_html_declaration(self, i):
306328 if rawdata [i :i + 4 ] == '<!--' :
307329 # this case is actually already handled in goahead()
308330 return self .parse_comment (i )
309- elif rawdata [i :i + 3 ] == '<![' :
310- return self .parse_marked_section (i )
331+ elif rawdata [i :i + 9 ] == '<![CDATA[' and self ._support_cdata :
332+ j = rawdata .find (']]>' , i + 9 )
333+ if j < 0 :
334+ return - 1
335+ self .unknown_decl (rawdata [i + 3 : j ])
336+ return j + 3
311337 elif rawdata [i :i + 9 ].lower () == '<!doctype' :
312338 # find the closing >
313339 gtpos = rawdata .find ('>' , i + 9 )
314340 if gtpos == - 1 :
315341 return - 1
316342 self .handle_decl (rawdata [i + 2 :gtpos ])
317343 return gtpos + 1
344+ elif rawdata [i :i + 3 ] == '<![' :
345+ j = rawdata .find ('>' , i + 3 )
346+ if j < 0 :
347+ return - 1
348+ if rawdata [j - 1 ] == ']' :
349+ self .unknown_decl (rawdata [i + 3 : j - 1 ])
350+ else :
351+ self .handle_comment (rawdata [i + 2 : j ])
352+ return j + 1
318353 else :
319354 return self .parse_bogus_comment (i )
320355
@@ -408,6 +443,8 @@ def parse_starttag(self, i):
408443 self .handle_starttag (tag , attrs )
409444 if tag in self .CDATA_CONTENT_ELEMENTS :
410445 self .set_cdata_mode (tag )
446+ elif tag in self .RCDATA_CONTENT_ELEMENTS :
447+ self .set_cdata_mode (tag , escapable = True )
411448 return endpos
412449
413450 # Internal -- check to see if we have a complete starttag; return end
0 commit comments