From b7900cd4cbe129eb7d6b405f6e3507156cc35fd4 Mon Sep 17 00:00:00 2001
From: Timon Viola <44016238+timonviola@users.noreply.github.com>
Date: Mon, 9 Jun 2025 20:24:09 +0200
Subject: [PATCH 1/8] fix: fix HTML parser escapable raw text mode

---
 Lib/html/parser.py          | 20 ++++++++-----
 Lib/test/test_htmlparser.py | 59 +++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 1e30956fe24f83..6a7a2d982aaba6 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase):
     containing respectively the named or numeric reference as the
     argument.
     """
-
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # For escapable raw text elements (textarea and title), CDATA mode is reused
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")
 
     def __init__(self, *, convert_charrefs=True):
         """Initialize and reset this instance.
@@ -117,6 +117,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._raw_escapable = False
         super().reset()
 
     def feed(self, data):
@@ -140,11 +141,16 @@ def get_starttag_text(self):
 
     def set_cdata_mode(self, elem):
         self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
+        if self.cdata_elem in ["textarea", "title"]:
+            self._raw_escapable = True
+            self.interesting = re.compile('[&]')
+        else:
+            self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
 
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._raw_escapable = False
 
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
@@ -154,7 +160,7 @@ def goahead(self, end):
         i = 0
         n = len(rawdata)
         while i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                 j = rawdata.find('<', i)
                 if j < 0:
                     # if we can't find the next <, either we are at the end
@@ -177,7 +183,7 @@ def goahead(self, end):
                         break
                     j = n
             if i < j:
-                if self.convert_charrefs and not self.cdata_elem:
+                if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                     self.handle_data(unescape(rawdata[i:j]))
                 else:
                     self.handle_data(rawdata[i:j])
@@ -210,7 +216,7 @@ def goahead(self, end):
                             k = i + 1
                     else:
                         k += 1
-                    if self.convert_charrefs and not self.cdata_elem:
+                    if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                         self.handle_data(unescape(rawdata[i:k]))
                     else:
                         self.handle_data(rawdata[i:k])
@@ -261,7 +267,7 @@ def goahead(self, end):
                 assert 0, "interesting.search() lied"
         # end while
         if end and i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
                 self.handle_data(unescape(rawdata[i:n]))
             else:
                 self.handle_data(rawdata[i:n])
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 61fa24fab574f2..9ae600c07b13cb 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -295,6 +295,65 @@ def test_cdata_content(self):
                                     ("data", content),
                                     ("endtag", element_lower)])
 
+    def test_raw_text_content(self):
+        # Tags should be treated as text in raw text and escapable raw text content.
+        content = """<h1>tagshould be handled as text"""
+        elements = [
+            "script",
+            "style",
+            "title",
+            "textarea",
+            "SCRIPT",
+            "STYLE",
+            "TITLE",
+            "TEXTAREA",
+            "Script",
+            "Style",
+            "Title",
+            "Textarea",
+        ]
+        for element in elements:
+            source = f"<{element}>{content}"
+            self._run_check(source, [
+                ("starttag", element.lower(), []),
+                ("data", content)
+            ])
+
+    def test_escapable_raw_text_content(self):
+        # Charrefs should be escaped in esacapable raw text content.
+        class Collector(EventCollector):
+            pass
+
+        content = "Timon &amp; Pumba"
+        expected = "Timon & Pumba"
+        elements = [
+            "title",
+            "textarea",
+            "TITLE",
+            "TEXTAREA",
+            "Title",
+            "Textarea",
+        ]
+        for element in elements:
+            source = f"<{element}>{content}"
+            self._run_check(
+                source, [
+                  ("starttag", element.lower(), []),
+                  ('data', expected),
+                ],
+                collector=Collector(convert_charrefs=True),
+            )
+            # test with convert_charrefs=False
+            self._run_check(
+                source, [
+                  ("starttag", element.lower(), []),
+                  ('data', 'Timon '),
+                  ('entityref', 'amp'),
+                  ('data', ' Pumba')
+                ],
+            )
+
+
     def test_cdata_with_closing_tags(self):
         # see issue #13358
         # make sure that HTMLParser calls handle_data only once for each CDATA.

From 8f60744b68fdcc37f2a58c48a5bccb34fa07eb2a Mon Sep 17 00:00:00 2001
From: Timon Viola <44016238+timonviola@users.noreply.github.com>
Date: Mon, 9 Jun 2025 20:38:38 +0200
Subject: [PATCH 2/8] docs: add blurb NEWS fragment

---
 .../next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst  | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst

diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
new file mode 100644
index 00000000000000..a9754421d5ac97
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
@@ -0,0 +1 @@
+Fix a bug in html parser related to escapable raw text mode.

From 7824ee88b82f86440bd9aedae5f7ef0ac815463d Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 14 Jul 2025 20:03:37 +0300
Subject: [PATCH 3/8] Fix errors and rewrite tests.

---
 Lib/html/parser.py          |   9 ++-
 Lib/test/test_htmlparser.py | 150 ++++++++++++++++++++++--------------
 2 files changed, 99 insertions(+), 60 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index b55f6c65900a83..92a25d62666451 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -169,9 +169,10 @@ def get_starttag_text(self):
 
     def set_cdata_mode(self, elem):
         self.cdata_elem = elem.lower()
-        if self.cdata_elem in ["textarea", "title"]:
-            self._raw_escapable = True
-            self.interesting = re.compile('[&]')
+        self._raw_escapable = self.cdata_elem in ("textarea", "title")
+        if self._raw_escapable and not self.convert_charrefs:
+            self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
+                                          re.IGNORECASE|re.ASCII)
         else:
             self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                           re.IGNORECASE|re.ASCII)
@@ -189,7 +190,7 @@ def goahead(self, end):
         i = 0
         n = len(rawdata)
         while i < n:
-            if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
+            if self.convert_charrefs and not self.cdata_elem:
                 j = rawdata.find('<', i)
                 if j < 0:
                     # if we can't find the next <, either we are at the end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 62b662b666a169..6286b34f734911 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -317,63 +317,48 @@ def test_style_content(self, content):
                             ("data", content),
                             ("endtag", "style")])
 
-    def test_raw_text_content(self):
-        # Tags should be treated as text in raw text and escapable raw text content.
-        content = """<h1>tagshould be handled as text"""
-        elements = [
-            "script",
-            "style",
-            "title",
-            "textarea",
-            "SCRIPT",
-            "STYLE",
-            "TITLE",
-            "TEXTAREA",
-            "Script",
-            "Style",
-            "Title",
-            "Textarea",
-        ]
-        for element in elements:
-            source = f"<{element}>{content}"
-            self._run_check(source, [
-                ("starttag", element.lower(), []),
-                ("data", content)
-            ])
+    @support.subTests('content', [
+            '<!-- not a comment -->',
+            "<not a='start tag'>",
+            '<![CDATA[not a cdata]]>',
+            '<!not a bogus comment>',
+            '</not a bogus comment>',
+            '\u2603',
+            '< /title>',
+            '</ title>',
+            '</titled>',
+            '</title\v>',
+            '</title\xa0>',
+            '</tıtle>',
+        ])
+    def test_title_content(self, content):
+        source = f"<title>{content}</title>"
+        self._run_check(source, [
+            ("starttag", "title", []),
+            ("data", content),
+            ("endtag", "title"),
+        ])
 
-    def test_escapable_raw_text_content(self):
-        # Charrefs should be escaped in esacapable raw text content.
-        class Collector(EventCollector):
-            pass
-
-        content = "Timon &amp; Pumba"
-        expected = "Timon & Pumba"
-        elements = [
-            "title",
-            "textarea",
-            "TITLE",
-            "TEXTAREA",
-            "Title",
-            "Textarea",
-        ]
-        for element in elements:
-            source = f"<{element}>{content}"
-            self._run_check(
-                source, [
-                  ("starttag", element.lower(), []),
-                  ('data', expected),
-                ],
-                collector=Collector(convert_charrefs=True),
-            )
-            # test with convert_charrefs=False
-            self._run_check(
-                source, [
-                  ("starttag", element.lower(), []),
-                  ('data', 'Timon '),
-                  ('entityref', 'amp'),
-                  ('data', ' Pumba')
-                ],
-            )
+    @support.subTests('content', [
+            '<!-- not a comment -->',
+            "<not a='start tag'>",
+            '<![CDATA[not a cdata]]>',
+            '<!not a bogus comment>',
+            '</not a bogus comment>',
+            '\u2603',
+            '< /textarea>',
+            '</ textarea>',
+            '</textareable>',
+            '</textarea\v>',
+            '</textarea\xa0>',
+        ])
+    def test_textarea_content(self, content):
+        source = f"<textarea>{content}</textarea>"
+        self._run_check(source, [
+            ("starttag", "textarea", []),
+            ("data", content),
+            ("endtag", "textarea"),
+        ])
 
     @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                  'script/', 'script foo=bar', 'script foo=">"'])
@@ -404,6 +389,38 @@ def test_style_closing_tag(self, endtag):
                             ("endtag", "style")],
                         collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
+                                 'title/', 'title foo=bar', 'title foo=">"'])
+    def test_title_closing_tag(self, endtag):
+        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
+        s = f'<TitLe>{content}</{endtag}>'
+        self._run_check(s, [("starttag", "title", []),
+                            ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
+                            ("endtag", "title")],
+                        collector=EventCollectorNoNormalize(convert_charrefs=True))
+        self._run_check(s, [("starttag", "title", []),
+                            ('data', '<!-- not a comment --><i>Egg '),
+                            ('entityref', 'amp'),
+                            ('data', ' Spam</i>'),
+                            ("endtag", "title")],
+                        collector=EventCollectorNoNormalize(convert_charrefs=False))
+
+    @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
+                                 'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
+    def test_textarea_closing_tag(self, endtag):
+        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
+        s = f'<TexTarEa>{content}</{endtag}>'
+        self._run_check(s, [("starttag", "textarea", []),
+                            ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
+                            ("endtag", "textarea")],
+                        collector=EventCollectorNoNormalize(convert_charrefs=True))
+        self._run_check(s, [("starttag", "textarea", []),
+                            ('data', '<!-- not a comment --><i>Egg '),
+                            ('entityref', 'amp'),
+                            ('data', ' Spam</i>'),
+                            ("endtag", "textarea")],
+                        collector=EventCollectorNoNormalize(convert_charrefs=False))
+
     @support.subTests('tail,end', [
         ('', False),
         ('<', False),
@@ -421,6 +438,27 @@ def test_eof_in_script(self, tail, end):
                             ("data", content if end else content + tail)],
                         collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('tail,end', [
+        ('', False),
+        ('<', False),
+        ('</', False),
+        ('</t', False),
+        ('</title', False),
+        ('</title ', True),
+        ('</title foo=bar', True),
+        ('</title foo=">', True),
+    ])
+    def test_eof_in_title(self, tail, end):
+        s = f'<TitLe>Egg &amp; Spam{tail}'
+        self._run_check(s, [("starttag", "title", []),
+                            ("data", "Egg & Spam" + ('' if end else tail))],
+                        collector=EventCollectorNoNormalize(convert_charrefs=True))
+        self._run_check(s, [("starttag", "title", []),
+                            ('data', 'Egg '),
+                            ('entityref', 'amp'),
+                            ('data', ' Spam' + ('' if end else tail))],
+                        collector=EventCollectorNoNormalize(convert_charrefs=False))
+
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'

From 18c6ea80b2387103eff3ced2fb11e1baff8aba84 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 14 Jul 2025 20:11:56 +0300
Subject: [PATCH 4/8] Refactoring.

---
 Lib/html/parser.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 92a25d62666451..9e3507ced4e710 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -126,8 +126,8 @@ class HTMLParser(_markupbase.ParserBase):
     containing respectively the named or numeric reference as the
     argument.
     """
-    # For escapable raw text elements (textarea and title), CDATA mode is reused
-    CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")
+    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
     def __init__(self, *, convert_charrefs=True):
         """Initialize and reset this instance.
@@ -145,7 +145,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
-        self._raw_escapable = False
+        self._escapable = True
         super().reset()
 
     def feed(self, data):
@@ -167,10 +167,10 @@ def get_starttag_text(self):
         """Return full source of start tag: '<...>'."""
         return self.__starttag_text
 
-    def set_cdata_mode(self, elem):
+    def set_cdata_mode(self, elem, escapable=False):
         self.cdata_elem = elem.lower()
-        self._raw_escapable = self.cdata_elem in ("textarea", "title")
-        if self._raw_escapable and not self.convert_charrefs:
+        self._escapable = escapable
+        if escapable and not self.convert_charrefs:
             self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                           re.IGNORECASE|re.ASCII)
         else:
@@ -180,7 +180,7 @@ def set_cdata_mode(self, elem):
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
         self.cdata_elem = None
-        self._raw_escapable = False
+        self._escapable = True
 
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
@@ -213,7 +213,7 @@ def goahead(self, end):
                         break
                     j = n
             if i < j:
-                if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
+                if self.convert_charrefs and self._escapable:
                     self.handle_data(unescape(rawdata[i:j]))
                 else:
                     self.handle_data(rawdata[i:j])
@@ -315,7 +315,7 @@ def goahead(self, end):
                 assert 0, "interesting.search() lied"
         # end while
         if end and i < n:
-            if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
+            if self.convert_charrefs and self._escapable:
                 self.handle_data(unescape(rawdata[i:n]))
             else:
                 self.handle_data(rawdata[i:n])
@@ -427,6 +427,8 @@ def parse_starttag(self, i):
             self.handle_starttag(tag, attrs)
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag)
+            elif tag in self.RCDATA_CONTENT_ELEMENTS:
+                self.set_cdata_mode(tag, True)
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end

From 051516af7b13b564df8c46139f237c4103b35f76 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 14 Jul 2025 20:13:39 +0300
Subject: [PATCH 5/8] Update the NEWS entry.

---
 .../Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
index a9754421d5ac97..f364c133813551 100644
--- a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
+++ b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
@@ -1 +1,2 @@
-Fix a bug in html parser related to escapable raw text mode.
+Fix support of escapable raw text mode (elements "textarea" and "title")
+in :class:`http.parser.HTMLParser`.

From c93a7718608a21f2f926adcade8c6f43a4d87e8e Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 14 Jul 2025 20:14:42 +0300
Subject: [PATCH 6/8] Reclassify

---
 .../2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst                | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename Misc/NEWS.d/next/{Library => Security}/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst (100%)

diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
similarity index 100%
rename from Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
rename to Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst

From a27159f4f923259c787c6028c2289401d955f8e0 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 15 Jul 2025 07:47:19 +0300
Subject: [PATCH 7/8] Update 2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst

---
 .../Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
index f364c133813551..6ad3caf33b2201 100644
--- a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
+++ b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
@@ -1,2 +1,2 @@
 Fix support of escapable raw text mode (elements "textarea" and "title")
-in :class:`http.parser.HTMLParser`.
+in :class:`html.parser.HTMLParser`.

From 5681bbc7a2793eeaa58869f3f1d27add53a59f0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Langa?= <lukasz@langa.pl>
Date: Tue, 22 Jul 2025 12:39:37 +0200
Subject: [PATCH 8/8] Make `escapable=` kwarg-only

---
 Lib/html/parser.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 9e3507ced4e710..9c06a42dc9eddf 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -126,6 +126,7 @@ class HTMLParser(_markupbase.ParserBase):
     containing respectively the named or numeric reference as the
     argument.
     """
+
     CDATA_CONTENT_ELEMENTS = ("script", "style")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
@@ -167,7 +168,7 @@ def get_starttag_text(self):
         """Return full source of start tag: '<...>'."""
         return self.__starttag_text
 
-    def set_cdata_mode(self, elem, escapable=False):
+    def set_cdata_mode(self, elem, *, escapable=False):
         self.cdata_elem = elem.lower()
         self._escapable = escapable
         if escapable and not self.convert_charrefs:
@@ -428,7 +429,7 @@ def parse_starttag(self, i):
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode(tag, True)
+                self.set_cdata_mode(tag, escapable=True)
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end