11from dataclasses import dataclass
22from typing import Dict , List
3+ from pathlib import Path
34
45import bs4
56import requests
910from spec_merger .aligner_utils import Position
1011from spec_merger .utils import ParserState , ParsedPage , Parser
1112
13+ SCRIPT_DIR = Path (__file__ ).parent
1214
1315@dataclass (frozen = True )
1416class URLPosition (Position ):
@@ -28,20 +30,24 @@ def add_case(cases: dict[str, Dictionary], case: tuple[str, String], key: str):
2830
2931class ECMAParser (Parser ):
3032
def __init__(self, version: str, sections: list[str] | None = None):
    """Set up a parser for one published edition of the ECMAScript spec.

    Building the instance immediately loads the specification page via
    ``__get_page`` and stores the parsed document on ``self.page``.

    Args:
        version: Edition identifier. It is interpolated into the spec URL
            (``https://262.ecma-international.org/<version>/``) and into
            the parser's display name.
        sections: Section identifiers to parse (presumably the ``id``
            attributes of the spec's section elements -- confirm against
            the parsing code). Defaults to the RegExp objects section.
    """
    self.name = f"ECMAScript v{version}"
    if sections is None:
        sections = ["sec-regexp-regular-expression-objects"]
    self.sections = sections
    self.version = version
    self.url = f"https://262.ecma-international.org/{version}/"
    # version and url must be assigned before this call: __get_page reads both.
    self.page = self.__get_page()
    # Starts out as None; it is not filled in by the constructor.
    self.sections_by_number: dict[str, Dictionary] | None = None
    # NOTE(review): looks like the set of node names the parser ignores --
    # confirm against the code that reads self.avoid.
    self.avoid = {None, "emu-note", "\n"}
def __get_page(self):
    """Return the specification page as a parsed BeautifulSoup document.

    The raw HTML is read from ``ecma-<version>.html`` next to this module
    when that file exists. Otherwise it is downloaded from ``self.url``
    and written to that file so later runs can skip the download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is cached in that case.
        requests.Timeout: If the server stops responding.
    """
    cache = SCRIPT_DIR / f"ecma-{self.version}.html"
    if cache.exists():
        html_spec = cache.read_bytes()
    else:
        response = requests.get(self.url, timeout=30)
        # Fail before caching: otherwise an error page (e.g. a 404 for an
        # unknown version) would be stored and reused as the spec forever.
        response.raise_for_status()
        html_spec = response.content
        cache.write_bytes(html_spec)
    return BeautifulSoup(html_spec, 'html.parser')
4551
4652 def __parse_section (self , section_html : BeautifulSoup , sections_by_number : Dict [str , Dictionary ]):
4753 position = URLPosition (self .url + "#" + section_html .get ("id" ))
@@ -128,7 +134,7 @@ def __parse_emu_grammar(self, emu_grammar_section: BeautifulSoup) -> list[list[s
128134 result .append (tmp )
129135 return result
130136
131- def __parse_p (self , p : BeautifulSoup ):
137+ def __parse_p (self , p ):
132138 res = ""
133139 if type (p ) is bs4 .NavigableString :
134140 return p .text
@@ -183,7 +189,7 @@ def __parse_subsection(self, subsection: List[BeautifulSoup], position: URLPosit
183189 current_case_title [0 ])
184190 current_case = ""
185191 current_case_titles = self .__parse_emu_grammar (children )
186- case "span" | "emu-table" | "emu-import" | "h2" | "emu-table" :
192+ case "span" | "emu-table" | "emu-import" | "h2" :
187193 pass
188194 case _:
189195 print (f"ERROR: Unhandled tag in html section : { children .name } , { children .text } " )
0 commit comments