Skip to content

Commit cb79b54

Browse files
Merge pull request #8 from LindenRegex/cpc/specification_check-cleanup
Small cleanups in the specification checker
2 parents a5a3120 + 4ebbc5c commit cb79b54

File tree

2 files changed

+23
-18
lines changed

2 files changed

+23
-18
lines changed

specification_check/ecma_parser.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from dataclasses import dataclass
22
from typing import Dict, List
3+
from pathlib import Path
34

45
import bs4
56
import requests
@@ -9,6 +10,7 @@
910
from spec_merger.aligner_utils import Position
1011
from spec_merger.utils import ParserState, ParsedPage, Parser
1112

13+
SCRIPT_DIR = Path(__file__).parent
1214

1315
@dataclass(frozen=True)
1416
class URLPosition(Position):
@@ -28,20 +30,24 @@ def add_case(cases: dict[str, Dictionary], case: tuple[str, String], key: str):
2830

2931
class ECMAParser(Parser):
3032

31-
def __init__(self, url, parser_name="ECMA", sections=None):
32-
self.name = parser_name
33+
def __init__(self, version: str, sections=None):
34+
self.name = f"ECMAScript v{version}"
3335
if sections is None:
3436
sections = ["sec-regexp-regular-expression-objects"]
3537
self.sections = sections
36-
self.url = url
38+
self.version = version
39+
self.url = f"https://262.ecma-international.org/{version}/"
3740
self.page = self.__get_page()
38-
self.sections_by_number: Dict[str, Dictionary] = None
41+
self.sections_by_number: dict[str, Dictionary] = None
3942
self.avoid = {None, "emu-note", "\n"}
4043

4144
def __get_page(self):
42-
html_spec = requests.get(self.url).content
43-
soup = BeautifulSoup(html_spec, 'html.parser')
44-
return soup
45+
if (cache := SCRIPT_DIR / f"ecma-{self.version}.html").exists():
46+
html_spec = cache.read_bytes()
47+
else:
48+
html_spec = requests.get(self.url).content
49+
_ = cache.write_bytes(html_spec)
50+
return BeautifulSoup(html_spec, 'html.parser')
4551

4652
def __parse_section(self, section_html: BeautifulSoup, sections_by_number: Dict[str, Dictionary]):
4753
position = URLPosition(self.url + "#" + section_html.get("id"))
@@ -128,7 +134,7 @@ def __parse_emu_grammar(self, emu_grammar_section: BeautifulSoup) -> list[list[s
128134
result.append(tmp)
129135
return result
130136

131-
def __parse_p(self, p: BeautifulSoup):
137+
def __parse_p(self, p):
132138
res = ""
133139
if type(p) is bs4.NavigableString:
134140
return p.text
@@ -183,7 +189,7 @@ def __parse_subsection(self, subsection: List[BeautifulSoup], position: URLPosit
183189
current_case_title[0])
184190
current_case = ""
185191
current_case_titles = self.__parse_emu_grammar(children)
186-
case "span" | "emu-table" | "emu-import" | "h2" | "emu-table":
192+
case "span" | "emu-table" | "emu-import" | "h2":
187193
pass
188194
case _:
189195
print(f"ERROR: Unhandled tag in html section : {children.name}, {children.text}")

specification_check/main.py

File mode changed: 100644 → 100755
Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
#!/usr/bin/env python
2+
import sys
3+
14
from ecma_parser import ECMAParser
25
from rocq_parser import ROCQParser
36
from spec_merger.aligner import Aligner
@@ -7,17 +10,13 @@
710
def main():
811
paths = [Path("../mechanization/spec/", True)]
912
files_to_exclude = [Path("../mechanization/spec/Node.v", False)]
10-
url = "https://262.ecma-international.org/14.0/"
13+
rocq_parsed_page = ROCQParser(paths, files_to_exclude).get_parsed_page()
1114

12-
rocq_parser = ROCQParser(paths, files_to_exclude)
13-
rocq_parsed_page = rocq_parser.get_parsed_page()
14-
ecma_parser_v14 = ECMAParser(url, parser_name="ECMAScript v14.0")
15-
ecma_parsed_page_v14 = ecma_parser_v14.get_parsed_page()
15+
ecma_version = sys.argv[1] if len(sys.argv) > 1 else "14.0"
16+
ecma_parsed_page = ECMAParser(ecma_version).get_parsed_page()
1617

17-
a = Aligner()
18-
result = a.align(rocq_parsed_page.entries, ecma_parsed_page_v14.entries)
19-
text_result = result.to_text()
20-
print(text_result, end="")
18+
result = Aligner().align(rocq_parsed_page.entries, ecma_parsed_page.entries)
19+
print(result.to_text(), end="")
2120

2221

2322
if __name__ == "__main__":

0 commit comments

Comments
 (0)