minimal lark parser (#175)

wpbonelli · web-flow · commit ef021f8c87b7 · 2025-07-16T22:33:45.000-04:00
Add a generic parser that recognizes only blocks and the lines within them. A type-aware parser is coming soon, but we can fall back to this one whenever type information is not available. Type information is always available when calling load on a simulation, model or package. However, we may also want a generic load that tries to infer the component from the input file's name and contents. In that case, could we connect this parser to a type-aware converter that matches fields against known types to recognize the component, then parses the fields before structuring?

Also eagerly import the submodules of flopy4.mf6 so that all components are registered on first import of the top-level module.
diff --git a/flopy4/mf6/__init__.py b/flopy4/mf6/__init__.py
@@ -1,3 +1,9 @@
+from flopy4.mf6 import (  # noqa: F401
+    gwf,
+    ims,
+    simulation,
+    tdis,
+)
 from flopy4.mf6.codec import dump
 from flopy4.mf6.component import Component
 from flopy4.mf6.converter import COMPONENT_CONVERTER
diff --git a/flopy4/mf6/codec/reader/__init__.py b/flopy4/mf6/codec/reader/__init__.py
@@ -1,12 +1,46 @@
 from os import PathLike
+from pathlib import Path
 from typing import Any
 
+from flopy4.mf6.codec.reader.parser import make_generic_parser
+from flopy4.mf6.codec.reader.transformer import GenericTransformer
+
 
 def load(path: str | PathLike) -> Any:
-    # TODO
-    pass
+    """
+    Load and parse an MF6 input file.
+
+    Parameters
+    ----------
+    path : str | PathLike
+        Path to the MF6 input file
+
+    Returns
+    -------
+    Any
+        Parsed MF6 input file structure
+    """
+    path = Path(path)
+    with open(path, "r") as f:
+        data = f.read()
+    return loads(data)
 
 
 def loads(data: str) -> Any:
-    # TODO
-    pass
+    """
+    Parse MF6 input file content from string.
+
+    Parameters
+    ----------
+    data : str
+        MF6 input file content as string
+
+    Returns
+    -------
+    Any
+        Parsed MF6 input file structure
+    """
+
+    parser = make_generic_parser()
+    transformer = GenericTransformer()
+    return transformer.transform(parser.parse(data))
diff --git a/flopy4/mf6/codec/reader/grammar/__init__.py b/flopy4/mf6/codec/reader/grammar/__init__.py
@@ -0,0 +1,42 @@
+from os import PathLike
+from pathlib import Path
+
+import jinja2
+from modflow_devtools.dfn import Dfn
+
+from .filters import get_block_variables, get_blocks, get_variables, is_recarray_block
+
+
+def _get_template_env():
+    loader = jinja2.FileSystemLoader(Path(__file__).parent / "templates")
+    env = jinja2.Environment(
+        loader=loader,
+        trim_blocks=True,
+        lstrip_blocks=True,
+        keep_trailing_newline=True,
+    )
+    env.filters["is_recarray_block"] = is_recarray_block
+    env.filters["get_block_variables"] = get_block_variables
+    return env
+
+
+_TEMPLATE_ENV = _get_template_env()
+
+
+def make_grammar(dfn: Dfn, outdir: PathLike):
+    """Generate a Lark grammar file for a single component."""
+    outdir = Path(outdir).expanduser().resolve().absolute()
+    template = _TEMPLATE_ENV.get_template("component.lark.j2")
+    blocks = get_blocks(dfn)
+    variables = get_variables(dfn)
+    target_path = outdir / f"{dfn['name'].replace('-', '')}.lark"
+    with open(target_path, "w") as f:
+        f.write(template.render(component=dfn["name"], blocks=blocks, variables=variables))
+
+
+def make_all_grammars(dfns: dict[str, Dfn], outdir: PathLike):
+    """Generate grammars for all components."""
+    outdir = Path(outdir).expanduser().resolve().absolute()
+    outdir.mkdir(parents=True, exist_ok=True)
+    for dfn in dfns.values():
+        make_grammar(dfn, outdir)
diff --git a/flopy4/mf6/codec/reader/grammar/mf6.lark b/flopy4/mf6/codec/reader/grammar/mf6.lark
@@ -0,0 +1,19 @@
+start: [WS] [_NL*] (block [[WS] _NL*])+ [WS]
+block: "begin"i block_name _NL _content "end"i block_name _NL+
+block_name: CNAME [INT]
+_content: line*
+line: [WS] item+ _NL+
+item: word | NUMBER
+word: /[a-zA-Z0-9._'~,-\\(\\)]+/
+
+%import common.NEWLINE -> _NL
+%import common.WS
+%import common.WS_INLINE
+%import common.CNAME
+%import common.WORD
+%import common.NUMBER
+%import common.INT
+%import common.SH_COMMENT
+%import common._STRING_INNER
+%ignore WS_INLINE
+%ignore SH_COMMENT
diff --git a/flopy4/mf6/codec/reader/parser.py b/flopy4/mf6/codec/reader/parser.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+
+from lark import Lark
+
+
+def make_generic_parser() -> Lark:
+    grammar_path = Path(__file__).parent / "grammar" / "mf6.lark"
+    with open(grammar_path, "r") as f:
+        grammar = f.read()
+    return Lark(grammar, parser="lalr", debug=True)
diff --git a/flopy4/mf6/codec/reader/transformer.py b/flopy4/mf6/codec/reader/transformer.py
@@ -0,0 +1,50 @@
+from typing import Any
+
+from lark import Token, Transformer
+
+
+class GenericTransformer(Transformer):
+    """
+    Generic transformer for MF6 input files. Works only with the generic
+    grammar. Returns structures of blocks consisting of lines of tokens.
+    """
+
+    def start(self, items: list[Any]) -> dict[str, Any]:
+        blocks = {}
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            block_name = next(iter(item.keys()))
+            blocks[block_name] = next(iter(item.values()))
+        return blocks
+
+    def block(self, items: list[Any]) -> dict[str, Any]:
+        return {items[0]: items[1 : (len(items) - 1)]}
+
+    def block_name(self, items: list[Any]) -> str:
+        return " ".join([str(item) for item in items if item is not None])
+
+    def line(self, items: list[Any]) -> list[Any]:
+        return items[1:]
+
+    def item(self, items: list[Any]) -> str | float | int:
+        return items[0]
+
+    def word(self, items: list[Token]) -> str:
+        return str(items[0])
+
+    def NUMBER(self, token: Token) -> int | float:
+        value = str(token)
+        try:
+            if "." in value or "e" in value.lower():
+                return float(value)
+            else:
+                return int(value)
+        except ValueError:
+            return float(value)
+
+    def CNAME(self, token: Token) -> str:
+        return str(token)
+
+    def INT(self, token: Token) -> int:
+        return int(token)
diff --git a/flopy4/mf6/gwf/dis.py b/flopy4/mf6/gwf/dis.py
@@ -14,8 +14,8 @@
 @xattree
 class Dis(Package):
     length_units: str = field(
+        block="options",
         default=None,
-        metadata={"block": "options"},
     )
     nogrb: bool = field(block="options", default=False)
     xorigin: float = field(block="options", default=None)
diff --git a/pixi.lock b/pixi.lock
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/test/test_codec.py b/test/test_codec.py