From c8629a61bac35faf385d02bd966f791c2e036f2d Mon Sep 17 00:00:00 2001 From: Romain Date: Fri, 25 Feb 2022 15:02:38 +0100 Subject: [PATCH 1/5] adding function to parse the matlab doc --- utils/parse_info.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 utils/parse_info.py diff --git a/utils/parse_info.py b/utils/parse_info.py new file mode 100644 index 0000000..f2019a2 --- /dev/null +++ b/utils/parse_info.py @@ -0,0 +1,39 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup, NavigableString, Tag + +import gsw +from types import FunctionType + +url = 'https://www.teos-10.org/pubs/gsw/html/gsw_{}.html' + +def parse_info(func_name, url=url): + soup = BeautifulSoup(urlopen(url.format(func_name)), features="html.parser") + for header in soup.find_all('h2'): + if header.text != 'OUTPUT:': + continue + nextNode = header + while True: + nextNode = nextNode.nextSibling + if nextNode is None: + break + if isinstance(nextNode, Tag): + if nextNode.name == "h2": + break + txt = nextNode.get_text(strip=True).strip() + name = txt.split('=')[0].strip() + unit = txt.split('[')[1].split(']')[0].strip() + return (name, unit) + +def print_dict_attrs(): + all_gsw_function = [i for i in dir(gsw) if (isinstance(getattr(gsw, i), FunctionType) and not i.startswith('_'))] + attrs = {} + names = {} + for func in all_gsw_function: + try: + name, unit = parse_info(func) + except: + name, unit = ('', '') + names[func] = name + attrs[func] = {'units':unit} + print(attrs) + print(names) From 0c71d73066e527bec01060175112fa478074692f Mon Sep 17 00:00:00 2001 From: Romain Date: Fri, 25 Feb 2022 15:11:19 +0100 Subject: [PATCH 2/5] replace bad units --- utils/parse_info.py | 46 +++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/utils/parse_info.py b/utils/parse_info.py index f2019a2..4d5bb05 100644 --- a/utils/parse_info.py +++ b/utils/parse_info.py @@ -4,36 +4,46 @@ import gsw 
from types import FunctionType -url = 'https://www.teos-10.org/pubs/gsw/html/gsw_{}.html' +from functools import reduce + +url = "https://www.teos-10.org/pubs/gsw/html/gsw_{}.html" + +repls = ("deg C", "degC"), ("unitless", "1"), ("degrees of rotation", "arcdeg") + def parse_info(func_name, url=url): soup = BeautifulSoup(urlopen(url.format(func_name)), features="html.parser") - for header in soup.find_all('h2'): - if header.text != 'OUTPUT:': - continue - nextNode = header - while True: - nextNode = nextNode.nextSibling - if nextNode is None: + for header in soup.find_all("h2"): + if header.text != "OUTPUT:": + continue + nextNode = header + while True: + nextNode = nextNode.nextSibling + if nextNode is None: break - if isinstance(nextNode, Tag): - if nextNode.name == "h2": - break + if isinstance(nextNode, Tag): + if nextNode.name == "h2": + break txt = nextNode.get_text(strip=True).strip() - name = txt.split('=')[0].strip() - unit = txt.split('[')[1].split(']')[0].strip() - return (name, unit) + name = txt.split("=")[0].strip().split("_")[0] + unit = txt.split("[")[1].split("]")[0].strip() + return (name, reduce(lambda a, kv: a.replace(*kv), repls, unit)) + -def print_dict_attrs(): - all_gsw_function = [i for i in dir(gsw) if (isinstance(getattr(gsw, i), FunctionType) and not i.startswith('_'))] +def print_dict_attrs(): + all_gsw_function = [ + i + for i in dir(gsw) + if (isinstance(getattr(gsw, i), FunctionType) and not i.startswith("_")) + ][:10] attrs = {} names = {} for func in all_gsw_function: try: name, unit = parse_info(func) except: - name, unit = ('', '') + name, unit = ("", "") names[func] = name - attrs[func] = {'units':unit} + attrs[func] = {"units": unit} print(attrs) print(names) From 8810cac682ec2649eb4666fd610fe95a058376a3 Mon Sep 17 00:00:00 2001 From: Romain Date: Fri, 25 Feb 2022 15:13:59 +0100 Subject: [PATCH 3/5] typo --- utils/parse_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/parse_info.py 
b/utils/parse_info.py index 4d5bb05..dafaeec 100644 --- a/utils/parse_info.py +++ b/utils/parse_info.py @@ -35,7 +35,7 @@ def print_dict_attrs(): i for i in dir(gsw) if (isinstance(getattr(gsw, i), FunctionType) and not i.startswith("_")) - ][:10] + ] attrs = {} names = {} for func in all_gsw_function: From 20c4faf8894801a4d8ea1cc878953a1f61dfc9b3 Mon Sep 17 00:00:00 2001 From: Romain Date: Fri, 13 May 2022 19:26:06 +0200 Subject: [PATCH 4/5] update env for parsing --- poetry.lock | 14 +++++++------- pyproject.toml | 1 + utils/parse_info.py | 3 +++ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index e153cd8..40503e3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -41,11 +41,11 @@ pytz = ">=2015.7" [[package]] name = "beautifulsoup4" -version = "4.10.0" +version = "4.11.1" description = "Screen-scraping library" category = "main" -optional = true -python-versions = ">3.0.0" +optional = false +python-versions = ">=3.6.0" [package.dependencies] soupsieve = ">1.2" @@ -337,7 +337,7 @@ name = "soupsieve" version = "2.3.1" description = "A modern CSS selector implementation for Beautiful Soup." 
category = "main" -optional = true +optional = false python-versions = ">=3.6" [[package]] @@ -502,7 +502,7 @@ docs = ["Sphinx", "furo"] [metadata] lock-version = "1.1" python-versions = ">=3.8" -content-hash = "c6d56c99359f5dce254177f391227d42d07feba1e4e7e7c6d5abd643e683e934" +content-hash = "f0cafd88b060a2f07afa64e64271180f58671d9444c3bb830f36417cacde85aa" [metadata.files] alabaster = [ @@ -522,8 +522,8 @@ babel = [ {file = "Babel-2.9.1.tar.gz", hash = "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"}, ] beautifulsoup4 = [ - {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, - {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, + {file = "beautifulsoup4-4.11.1-py3-none-any.whl", hash = "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30"}, + {file = "beautifulsoup4-4.11.1.tar.gz", hash = "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"}, ] certifi = [ {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, diff --git a/pyproject.toml b/pyproject.toml index 7e8fae6..22ba1e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ xarray = ">=0.20.2" gsw = ">=3.4.0" Sphinx = {version = ">=4.4.0", optional = true} furo = {version = ">=2022.1.2", optional = true} +beautifulsoup4 = "^4.11.1" [tool.poetry.dev-dependencies] pytest = ">=6.2.5" diff --git a/utils/parse_info.py b/utils/parse_info.py index dafaeec..c4f21e5 100644 --- a/utils/parse_info.py +++ b/utils/parse_info.py @@ -47,3 +47,6 @@ def print_dict_attrs(): attrs[func] = {"units": unit} print(attrs) print(names) + +if __name__ == '__main__': + print_dict_attrs() From 7f4fb80bce6115d265af0cd08070d0725beda9df Mon Sep 17 00:00:00 2001 From: Romain Date: Tue, 14 Jun 2022 10:16:24 +0200 Subject: [PATCH 5/5] 
add parsing for inputs --- utils/parse_info_input.py | 65 +++++++++++++++++++ utils/{parse_info.py => parse_info_output.py} | 0 2 files changed, 65 insertions(+) create mode 100644 utils/parse_info_input.py rename utils/{parse_info.py => parse_info_output.py} (100%) diff --git a/utils/parse_info_input.py b/utils/parse_info_input.py new file mode 100644 index 0000000..2bb9cde --- /dev/null +++ b/utils/parse_info_input.py @@ -0,0 +1,65 @@ +from urllib.request import urlopen +from bs4 import BeautifulSoup, NavigableString, Tag + +import gsw +from types import FunctionType + +from functools import reduce + +url = "https://www.teos-10.org/pubs/gsw/html/gsw_{}.html" + +repls = ("deg C", "degC"), ("unitless", "1"), ("degrees of rotation", "arcdeg") + + +def parse_info(func_name, url=url): + soup = BeautifulSoup(urlopen(url.format(func_name)), features="html.parser") + for header in soup.find_all("h2"): + if header.text != "INPUT:": + continue + nextNode = header + while True: + nextNode = nextNode.nextSibling + if nextNode is None: + break + if isinstance(nextNode, Tag): + if nextNode.name == "h2": + break + txt = nextNode.get_text(strip=True).strip() + args = [i for i in txt.split('\n') if '=' in i] + return [(i.split("=")[0].strip(), reduce(lambda a, kv: a.replace(*kv), repls, i.split("[")[1].split("]")[0].strip())) for i in args] + return [] + + +def print_dict_attrs(): + all_gsw_function = [ + i + for i in dir(gsw) + if (isinstance(getattr(gsw, i), FunctionType) and not i.startswith("_")) + ] + args_all = {} + for func in all_gsw_function[:]: + try: + args = parse_info(func) + except: + args = [] + args_all[func] = args + print(args_all) + print('\n\n********************\n\n') + get_units_per_arg(args_all) + +def get_units_per_arg(args_all): + units = {} + for f in args_all.keys(): + args = args_all[f] + for a in args: + if a[0] == 'h': + print(f, a) + if a[0] in units.keys(): + if a[1] not in units[a[0]]: + units[a[0]].append(a[1]) + else: + units[a[0]] = [a[1]] + 
print(units) + +if __name__ == '__main__': + print_dict_attrs() diff --git a/utils/parse_info.py b/utils/parse_info_output.py similarity index 100% rename from utils/parse_info.py rename to utils/parse_info_output.py