python · timonviola · Jul 14, 2024 · May 13, 2025 · May 13, 2025 · May 13, 2025
@@ -26,6 +26,7 @@
 
 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
+escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)  # NOTE(review): not referenced in the visible hunks -- confirm it is used
 commentclose = re.compile(r'--\s*>')
 # Note:
 #  1) if you change tagfind/attrfind remember to update locatestarttagend too;
@@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase):
     """
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
+    ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")
 
     def __init__(self, *, convert_charrefs=True):
         """Initialize and reset this instance.
@@ -99,6 +101,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self.escapable_raw_text_elem = None
         super().reset()
 
     def feed(self, data):
@@ -120,6 +123,14 @@ def get_starttag_text(self):
         """Return full source of start tag: '<...>'."""
         return self.__starttag_text
 
+    def set_escapable_raw_text_mode(self, elem):
+        name = self.escapable_raw_text_elem = elem.lower()  # remember the open element
+        self.interesting = re.compile(r'</\s*%s\s*>' % name, re.I)  # scan only for its end tag
+
+    def clear_escapable_raw_text_mode(self):
+        self.escapable_raw_text_elem = None  # no title/textarea element is open any more
+        self.interesting = interesting_normal  # restore the default scanner pattern
+
     def set_cdata_mode(self, elem):
         self.cdata_elem = elem.lower()
         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
@@ -136,7 +147,7 @@ def goahead(self, end):
         i = 0
         n = len(rawdata)
         while i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
                 j = rawdata.find('<', i)
                 if j < 0:
                     # if we can't find the next <, either we are at the end
@@ -155,11 +166,13 @@ def goahead(self, end):
                 if match:
                     j = match.start()
                 else:
+                    if self.escapable_raw_text_elem:
+                        break
                     if self.cdata_elem:
                         break
                     j = n
             if i < j:
-                if self.convert_charrefs and not self.cdata_elem:
+                if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
                     self.handle_data(unescape(rawdata[i:j]))
                 else:
                     self.handle_data(rawdata[i:j])
@@ -336,6 +349,8 @@ def parse_starttag(self, i):
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
+            if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
+                self.set_escapable_raw_text_mode(tag)
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag)
         return endpos
@@ -411,8 +426,14 @@ def parse_endtag(self, i):
                 self.handle_data(rawdata[i:gtpos])
                 return gtpos
 
+        if self.escapable_raw_text_elem is not None: # title or textarea
+            if elem != self.escapable_raw_text_elem:
+                self.handle_data(rawdata[i:gtpos])
+                return gtpos
+
         self.handle_endtag(elem)
         self.clear_cdata_mode()
+        self.clear_escapable_raw_text_mode()
         return gtpos
 
     # Overridable -- finish processing of start+end tag: <tag.../>

@@ -285,7 +285,7 @@ def test_cdata_content(self):
             #'foo = </\nscript>',
             #'foo = </ script>',
         ]
-        elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
+        elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea']
         for content in contents:
             for element in elements:
                 element_lower = element.lower()
@@ -317,6 +317,58 @@ def get_events(self):
                                 ("endtag", element_lower)],
                             collector=Collector(convert_charrefs=False))
 
+    def test_escapable_raw_text_content(self):
+        """Markup-like text inside <title>/<textarea> is reported as data."""
+        contents = [
+            '<h2>This is a header</h2>',
+            'Rebelious<h1>Heading',
+            '<!-- not a comment --> &not-an-entity-ref;',
+            "<not a='start tag'>",
+            '<a href="" /> <p> <span></span>',
+            'foo = "</scr" + "ipt>";',
+            'foo = "</TITLE" + ">";',
+            'foo = <\n/title> ',
+            '<!-- document.write("</scr" + "ipt>"); -->',
+            '\n//<![CDATA[\n',
+            '\n<!-- //\nvar foo = 3.14;\n// -->\n',
+            'foo = "</sty" + "le>";',
+            '<!-- \u2603 -->',
+            # End tags of the enclosing element padded with whitespace
+            # (e.g. '</ title>') are deliberately left out: they would close
+            # the element instead of staying part of the data.
+        ]
+        elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
+        for content in contents:
+            for element in elements:
+                element_lower = element.lower()
+                s = '<{element}>{content}</{element}>'.format(element=element,
+                                                               content=content)
+                self._run_check(s, [("starttag", element_lower, []),
+                                    ("data", content),
+                                    ("endtag", element_lower)])
+
+    def test_escapable_raw_text_with_closing_tags(self):
+        """Title/textarea content reaches handle_data() as a single chunk.
+
+        Mirrors the script/style check from issue #13358: tag-like text stays
+        data and an end tag padded with whitespace still closes the element.
+        """
+        class Collector(EventCollector):
+            # Return the raw event list so split data events stay visible.
+            def get_events(self):
+                return self.events
+
+        content = """<!-- not a comment --> &not-an-entity-ref;
+                  <a href="" /> </p><p> <span></span></style>
+                  '</script' + '>'"""
+        for tag in ('title', 'textarea'):
+            for fmt in (' {}', '{} ', ' {} ', '\n{}', '{}\n', '\n{}\n'):
+                s = '<{}>{}</{}>'.format(tag, content, fmt.format(tag))
+                self._run_check(s, [("starttag", tag, []),
+                                    ("data", content),
+                                    ("endtag", tag)],
+                                collector=Collector(convert_charrefs=False))
+
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'