[FIX] util/snippets: fix conversion of full html docs

cawo-odoo · cawo-odoo · commit 648c9409d400 · 2023-11-07T18:17:24.000Z
upg-1158494 opw-3577439 ``` File "/tmp/tmpjzebte5h/migrations/account_online_synchronization/saas~15.1.1.0/pre-migrate.py", line 6, in migrate util.remove_field(cr, "account.link.journal.line", "action") File "/tmp/tmpjzebte5h/migrations/util/fields.py", line 163, in remove_field adapt_domains(cr, model, fieldname, "ignored", adapter=adapter, skip_inherit=skip_inherit, force_adapt=True) File "/tmp/tmpjzebte5h/migrations/util/domains.py", line 316, in adapt_domains with suppress(_Skip), edit_view(cr, view_id=view_id, active=None) as view: File "/usr/lib/python3.8/contextlib.py", line 113, in __enter__ return next(self.gen) File "/tmp/tmpjzebte5h/migrations/util/records.py", line 210, in edit_view arch_etree = lxml.etree.fromstring(arch["en_US"]) File "src/lxml/etree.pyx", line 3257, in lxml.etree.fromstring File "src/lxml/parser.pxi", line 1916, in lxml.etree._parseMemoryDocument File "src/lxml/parser.pxi", line 1796, in lxml.etree._parseDoc File "src/lxml/parser.pxi", line 1085, in lxml.etree._BaseParser._parseUnicodeDoc File "src/lxml/parser.pxi", line 618, in lxml.etree._ParserContext._handleParseResultDoc File "src/lxml/parser.pxi", line 728, in lxml.etree._handleParseResult File "src/lxml/parser.pxi", line 657, in lxml.etree._raiseParseError File "<string>", line 14 lxml.etree.XMLSyntaxError: Extra content at the end of the document, line 14, column 17 ``` The traceback from the lxml etree parser is caused by an earlier upgrade script having corrupted the document in question. The document as stored in the origin DB is a full HTML doc with an `html` root tag and can be parsed by lxml.etree just fine. The culprit script is `website/16.0.1.0/pre-convert_html.py`, which ultimately calls the `HTMLConverter` in `snippets.py`. The `HTMLConverter` always encloses the document in `<wrap>` and `</wrap>` tags.
This breaks the full-document detection logic in `lxml.html.fromstring()` (see https://github.com/lxml/lxml/blob/2ac88908ffd6df380615c0af35f2134325e4bf30/src/lxml/html/html5parser.py#L184) and leads to a corrupted result when the converted document is serialized again via `etree.tostring()`: the `html`, `head` and `body` tags are lost. To fix this, do not add the `wrap` tags if the document looks like a full HTML doc according to the test and logic of `lxml.html.fromstring()`. Part of odoo/upgrade#5352 Signed-off-by: Christophe Simonis (chs) <chs@odoo.com>
diff --git a/src/util/snippets.py b/src/util/snippets.py
@@ -199,9 +199,13 @@ def __call__(self, content):
         if not content:
             return (False, content)
         content = re.sub(r"^<\?xml .+\?>\s*", "", content.strip())
-        # Wrap in <wrap> node before parsing to preserve external comments and
-        # multi-root nodes
-        els = html.fromstring(f"<wrap>{content}</wrap>", parser=utf8_parser)
+        # Wrap in <wrap> node before parsing to preserve external comments and multi-root nodes,
+        # except for when this looks like a full html doc, because in this case the wrap tag breaks the logic in
+        # https://github.com/lxml/lxml/blob/2ac88908ffd6df380615c0af35f2134325e4bf30/src/lxml/html/html5parser.py#L184
+        els = html.fromstring(
+            content if content.strip()[:5].lower() == "<html" else f"<wrap>{content}</wrap>",
+            parser=utf8_parser,
+        )
         has_changed = self.has_changed(els)
         new_content = (
             re.sub(r"(^<wrap>|</wrap>$)", "", etree.tostring(els, encoding="unicode").strip())