Merge pull request #8 from LindenRegex/cpc/specification_check-cleanup

Aurele-Barriere · web-flow · commit cb79b54062d4 · 2026-03-16T13:12:25.000+01:00
Small cleanups in the specification checker
diff --git a/specification_check/ecma_parser.py b/specification_check/ecma_parser.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import Dict, List
+from pathlib import Path
 
 import bs4
 import requests
@@ -9,6 +10,7 @@
 from spec_merger.aligner_utils import Position
 from spec_merger.utils import ParserState, ParsedPage, Parser
 
+SCRIPT_DIR = Path(__file__).parent
 
 @dataclass(frozen=True)
 class URLPosition(Position):
@@ -28,20 +30,24 @@ def add_case(cases: dict[str, Dictionary], case: tuple[str, String], key: str):
 
 class ECMAParser(Parser):
 
-    def __init__(self, url, parser_name="ECMA", sections=None):
-        self.name = parser_name
+    def __init__(self, version: str, sections=None):
+        self.name = f"ECMAScript v{version}"
         if sections is None:
             sections = ["sec-regexp-regular-expression-objects"]
         self.sections = sections
-        self.url = url
+        self.version = version
+        self.url = f"https://262.ecma-international.org/{version}/"
         self.page = self.__get_page()
-        self.sections_by_number: Dict[str, Dictionary] = None
+        self.sections_by_number: dict[str, Dictionary] = None
         self.avoid = {None, "emu-note", "\n"}
 
     def __get_page(self):
-        html_spec = requests.get(self.url).content
-        soup = BeautifulSoup(html_spec, 'html.parser')
-        return soup
+        if (cache := SCRIPT_DIR / f"ecma-{self.version}.html").exists():
+            html_spec = cache.read_bytes()
+        else:
+            html_spec = requests.get(self.url).content
+            _ = cache.write_bytes(html_spec)
+        return BeautifulSoup(html_spec, 'html.parser')
 
     def __parse_section(self, section_html: BeautifulSoup, sections_by_number: Dict[str, Dictionary]):
         position = URLPosition(self.url + "#" + section_html.get("id"))
@@ -128,7 +134,7 @@ def __parse_emu_grammar(self, emu_grammar_section: BeautifulSoup) -> list[list[s
             result.append(tmp)
         return result
 
-    def __parse_p(self, p: BeautifulSoup):
+    def __parse_p(self, p):
         res = ""
         if type(p) is bs4.NavigableString:
             return p.text
@@ -183,7 +189,7 @@ def __parse_subsection(self, subsection: List[BeautifulSoup], position: URLPosit
                                      current_case_title[0])
                         current_case = ""
                     current_case_titles = self.__parse_emu_grammar(children)
-                case "span" | "emu-table" | "emu-import" | "h2" | "emu-table":
+                case "span" | "emu-table" | "emu-import" | "h2":
                     pass
                 case _:
                     print(f"ERROR: Unhandled tag in html section : {children.name}, {children.text}")
diff --git a/specification_check/main.py b/specification_check/main.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python
+import sys
+
 from ecma_parser import ECMAParser
 from rocq_parser import ROCQParser
 from spec_merger.aligner import Aligner
@@ -7,17 +10,13 @@
 def main():
     paths = [Path("../mechanization/spec/", True)]
     files_to_exclude = [Path("../mechanization/spec/Node.v", False)]
-    url = "https://262.ecma-international.org/14.0/"
+    rocq_parsed_page = ROCQParser(paths, files_to_exclude).get_parsed_page()
 
-    rocq_parser = ROCQParser(paths, files_to_exclude)
-    rocq_parsed_page = rocq_parser.get_parsed_page()
-    ecma_parser_v14 = ECMAParser(url, parser_name="ECMAScript v14.0")
-    ecma_parsed_page_v14 = ecma_parser_v14.get_parsed_page()
+    ecma_version = sys.argv[1] if len(sys.argv) > 1 else "14.0"
+    ecma_parsed_page = ECMAParser(ecma_version).get_parsed_page()
 
-    a = Aligner()
-    result = a.align(rocq_parsed_page.entries, ecma_parsed_page_v14.entries)
-    text_result = result.to_text()
-    print(text_result, end="")
+    result = Aligner().align(rocq_parsed_page.entries, ecma_parsed_page.entries)
+    print(result.to_text(), end="")
 
 
 if __name__ == "__main__":