Relax HTML validation for Fortunes test

jonathanhefner · jonathanhefner · commit 7f84497e6d12 · 2025-01-08T12:15:51.000-06:00
Some frameworks, such as Next.js, automatically inject `<meta>`,
`<link>`, and `<script>` tags into pages rendered by their templating
system.  Since the Fortunes test is meant to exercise the templating
system (as opposed to raw string concatenation), it should allow these
tags.

Furthermore, React warns against nesting a `<tr>` directly inside a
`<table>` ("<tr> cannot be a child of <table>") because browsers will
automatically wrap the `<tr>` elements in a `<tbody>`, causing a
mismatch with the virtual DOM.  Therefore, the Fortunes test should
allow optional `<tbody>` (and `<thead>`) tags.

This commit relaxes the HTML validation for the Fortunes test to allow
these tags by simply ignoring them when building the comparison string.
diff --git a/toolset/test_types/fortune/fortune_html_parser.py b/toolset/test_types/fortune/fortune_html_parser.py
@@ -8,8 +8,17 @@
 
 
 class FortuneHTMLParser(HTMLParser):
+    IGNORED_TAGS = (
+        "<meta>", "</meta>",
+        "<link>", "</link>",
+        "<script>", "</script>",
+        "<thead>", "</thead>",
+        "<tbody>", "</tbody>",
+    )
+
     def __init__(self):
         HTMLParser.__init__(self, convert_charrefs=False)
+        self.ignore_content = False
         self.body = []
 
     valid_fortune = '''<!doctype html><html>
@@ -41,7 +50,7 @@ def handle_decl(self, decl):
         # and since we did not specify xml compliance (where
         # incorrect casing would throw a syntax error), we must
         # allow all casings. We will lower for our normalization.
-        self.body.append("<!{d}>".format(d=decl.lower()))
+        self.append("<!{d}>".format(d=decl.lower()))
 
     def handle_charref(self, name):
         '''
@@ -63,58 +72,58 @@ def handle_charref(self, name):
         # equality.
         if val == "34" or val == "034" or val == "x22":
             # Append our normalized entity reference to our body.
-            self.body.append("&quot;")
+            self.append("&quot;")
         # "&#39;" is a valid escaping of "-", but it is not
         # required, so we normalize for equality checking.
         if val == "39" or val == "039" or val == "x27":
-            self.body.append("&apos;")
+            self.append("&apos;")
         # Again, "&#43;" is a valid escaping of the "+", but
         # it is not required, so we need to normalize for out
         # final parse and equality check.
         if val == "43" or val == "043" or val == "x2b":
-            self.body.append("+")
+            self.append("+")
         # Again, "&#62;" is a valid escaping of ">", but we
         # need to normalize to "&gt;" for equality checking.
         if val == "62" or val == "062" or val == "x3e":
-            self.body.append("&gt;")
+            self.append("&gt;")
         # Again, "&#60;" is a valid escaping of "<", but we
         # need to normalize to "&lt;" for equality checking.
         if val == "60" or val == "060" or val == "x3c":
-            self.body.append("&lt;")
+            self.append("&lt;")
         # Not sure why some are escaping '/'
         if val == "47" or val == "047" or val == "x2f":
-            self.body.append("/")
+            self.append("/")
         # "&#40;" is a valid escaping of "(", but
         # it is not required, so we need to normalize for out
         # final parse and equality check.
         if val == "40" or val == "040" or val == "x28":
-            self.body.append("(")
+            self.append("(")
         # "&#41;" is a valid escaping of ")", but
         # it is not required, so we need to normalize for out
         # final parse and equality check.
         if val == "41" or val == "041" or val == "x29":
-            self.body.append(")")
+            self.append(")")
 
     def handle_entityref(self, name):
         '''
         Again, "&mdash;" is a valid escaping of "—", but we
         need to normalize to "—" for equality checking.
         '''
         if name == "mdash":
-            self.body.append("—")
+            self.append("—")
         else:
-            self.body.append("&{n};".format(n=name))
+            self.append("&{n};".format(n=name))
 
     def handle_starttag(self, tag, attrs):
         '''
         This is called every time a tag is opened. We append
         each one wrapped in "<" and ">".
         '''
-        self.body.append("<{t}>".format(t=tag))
+        self.append("<{t}>".format(t=tag))
 
         # Append a newline after the <table> and <html>
         if tag.lower() == 'table' or tag.lower() == 'html':
-            self.body.append(os.linesep)
+            self.append(os.linesep)
 
     def handle_data(self, data):
         '''
@@ -146,18 +155,24 @@ def handle_data(self, data):
             data = data.replace('"', '&quot;')
             data = data.replace('>', '&gt;')
 
-            self.body.append("{d}".format(d=data))
+            self.append("{d}".format(d=data))
 
     def handle_endtag(self, tag):
         '''
         This is called every time a tag is closed. We append
         each one wrapped in "</" and ">".
         '''
-        self.body.append("</{t}>".format(t=tag))
+        self.append("</{t}>".format(t=tag))
 
         # Append a newline after each </tr> and </head>
         if tag.lower() == 'tr' or tag.lower() == 'head':
-            self.body.append(os.linesep)
+            self.append(os.linesep)
+
+    def append(self, item):
+        self.ignore_content = item == "<script>" or (self.ignore_content and item != "</script>")
+
+        if not (self.ignore_content or item in self.IGNORED_TAGS):
+            self.body.append(item)
 
     def isValidFortune(self, name, out):
         '''