From f98bea220cbd8098eb9a7663ef2a7437f8e50927 Mon Sep 17 00:00:00 2001
From: ornariece <37-ornariece@users.noreply.git.malined.com>
Date: Tue, 22 Apr 2025 16:06:28 +0200
Subject: [PATCH 1/8] use token.value where required

---
 examples/advanced/tree_forest_transformer.py | 2 +-
 lark/indenter.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/advanced/tree_forest_transformer.py b/examples/advanced/tree_forest_transformer.py
index 7582b5776..809d62e95 100644
--- a/examples/advanced/tree_forest_transformer.py
+++ b/examples/advanced/tree_forest_transformer.py
@@ -23,7 +23,7 @@ def adj(self, children):
         return Discard
 
     def __default_token__(self, token):
-        return token.capitalize()
+        return token.value.capitalize()
 
 grammar = """
 sentence: noun verb noun -> simple
diff --git a/lark/indenter.py b/lark/indenter.py
index 037513bdf..2acdc2312 100644
--- a/lark/indenter.py
+++ b/lark/indenter.py
@@ -40,7 +40,7 @@ def handle_NL(self, token: Token) -> Iterator[Token]:
 
         yield token
 
-        indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
+        indent_str = token.value.rsplit('\n', 1)[1] # Tabs and spaces
         indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
 
         if indent > self.indent_level[-1]:

From 9b3f1a0383613a6df3dd3adc70b6c536ed61bd56 Mon Sep 17 00:00:00 2001
From: ornariece <37-ornariece@users.noreply.git.malined.com>
Date: Tue, 22 Apr 2025 16:43:21 +0200
Subject: [PATCH 2/8] handle postlex gracefully

---
 lark/lexer.py            | 20 ++++++++++++++++++++
 lark/parser_frontends.py | 26 +++++++++++++-------------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index 59d9acfd1..4572f5049 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -16,6 +16,7 @@
 if TYPE_CHECKING:
     from .common import LexerConf
     from .parsers.lalr_parser_state import ParserState
+    from .lark import PostLex
 
 from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -468,6 +469,25 @@ def __copy__(self):
 
 _Token = Token
 
+class PostLexThread(LexerThread):
+    def __init__(self, lexer: 'Lexer', lexer_state: LexerState, postlex: 'PostLex'):
+        super().__init__(lexer, lexer_state)
+        self.postlex = postlex
+
+    @classmethod
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: 'PostLex') -> 'PostLexThread':
+        text = TextSlice.cast_from(text_or_slice)
+        return cls(lexer, LexerState(text), postlex)
+
+    def lex(self, parser_state):
+        # Get tokens from the underlying lexer and process with postlex
+        tokens = super().lex(parser_state)
+        return self.postlex.process(tokens)
+
+    def __copy__(self):
+        return type(self)(self.lexer, copy(self.state), self.postlex)
+
+
 _Callback = Callable[[Token], Token]
 
 class Lexer(ABC):
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index bfe4eba98..ca20a317f 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -2,7 +2,7 @@
 
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
-from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
+from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, PostLexThread
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree
@@ -95,8 +95,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
         else:
             raise TypeError("Bad value for lexer_type: {lexer_type}")
 
-        if lexer_conf.postlex:
-            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
+        self.postlex: PostLex | None = lexer_conf.postlex  # Store the postlex separately
 
     def _verify_start(self, start=None):
         if start is None:
@@ -109,8 +108,18 @@ def _verify_start(self, start=None):
         return start
 
     def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
+        if self.skip_lexer:
+            return text
+
         cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-        return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text)
+
+        thread = cls(self.lexer, text) if text is None else cls.from_text(self.lexer, text)
+
+        # If we have a postlex, wrap the thread
+        if self.postlex is not None:
+            return PostLexThread(self.lexer, thread.state, self.postlex)
+
+        return thread
 
     def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
         if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
@@ -151,15 +160,6 @@ def _get_lexer_callbacks(transformer, terminals):
         result[terminal.name] = callback
     return result
 
-class PostLexConnector:
-    def __init__(self, lexer, postlexer):
-        self.lexer = lexer
-        self.postlexer = postlexer
-
-    def lex(self, lexer_state, parser_state):
-        i = self.lexer.lex(lexer_state, parser_state)
-        return self.postlexer.process(i)
-
 
 def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
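Note on PATCH 1: the change is behavior-preserving, because Token subclasses str, so calling str methods on the token already worked; going through token.value spells out that the raw matched string is meant. A minimal sketch of the distinction, assuming only Lark's public Token API:

    # Token inherits from str: str methods work on it directly, but they
    # return a plain str; .value is the explicit raw string of the match.
    from lark import Token

    tok = Token('WORD', 'hello')

    assert isinstance(tok, str)                # Token is a str subclass
    assert tok.capitalize() == 'Hello'         # implicit str method on the token
    assert tok.value.capitalize() == 'Hello'   # explicit access via .value
    assert tok.type == 'WORD'                  # the terminal name rides along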
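Note on PATCH 2: it swaps the old PostLexConnector (which wrapped self.lexer) for a LexerThread subclass, so the post-lexer is attached when the lexer thread is created. The user-facing contract is untouched: an object passed as postlex= still sees the whole token stream before the parser does. A runnable sketch of that flow, adapted from Lark's indented-tree example (the grammar here is illustrative):

    from lark import Lark
    from lark.indenter import Indenter

    class TreeIndenter(Indenter):
        NL_type = '_NL'          # terminal carrying newline + indentation
        OPEN_PAREN_types = []    # no bracket terminals suppress indentation here
        CLOSE_PAREN_types = []
        INDENT_type = '_INDENT'  # synthetic tokens the post-lexer emits
        DEDENT_type = '_DEDENT'
        tab_len = 8

    grammar = r'''
        start: "a" _NL [_INDENT start+ _DEDENT]

        _NL: /(\r?\n[\t ]*)+/
        %declare _INDENT _DEDENT
    '''

    parser = Lark(grammar, parser='lalr', postlex=TreeIndenter())
    print(parser.parse('a\n  a\n  a\n').pretty())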
From c59bedddba3c8a5abaab3f44c9ab29187655e83f Mon Sep 17 00:00:00 2001
From: ornariece <37-ornariece@users.noreply.git.malined.com>
Date: Tue, 22 Apr 2025 17:25:47 +0200
Subject: [PATCH 3/8] fix typing

---
 lark/lexer.py            | 16 ++++++++++++++--
 lark/parser_frontends.py | 15 ++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index 4572f5049..89eca136a 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -5,7 +5,7 @@
 from contextlib import suppress
 from typing import (
     TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
-    ClassVar, TYPE_CHECKING, overload
+    ClassVar, TYPE_CHECKING, overload, Union
 )
 from types import ModuleType
 import warnings
@@ -470,12 +470,24 @@ def __copy__(self):
 
 class PostLexThread(LexerThread):
-    def __init__(self, lexer: 'Lexer', lexer_state: LexerState, postlex: 'PostLex'):
+    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState], postlex: 'PostLex'):
         super().__init__(lexer, lexer_state)
         self.postlex = postlex
 
+    @overload
+    @classmethod
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: None = None) -> 'LexerThread':
+        pass
+
+    @overload
     @classmethod
     def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: 'PostLex') -> 'PostLexThread':
+        pass
+
+    @classmethod
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: Union['PostLex', None] = None) -> Union['LexerThread', 'PostLexThread']:
+        if postlex is None:
+            return super().from_text(lexer, text_or_slice)
         text = TextSlice.cast_from(text_or_slice)
         return cls(lexer, LexerState(text), postlex)
 
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index ca20a317f..1c9e353d8 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
+from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, Type
 
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
@@ -10,6 +10,7 @@
 
 if TYPE_CHECKING:
     from .parsers.lalr_analysis import ParseTableBase
+    from .lark import PostLex
 
 ###{standalone
 
@@ -95,7 +96,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
         else:
             raise TypeError("Bad value for lexer_type: {lexer_type}")
 
-        self.postlex: PostLex | None = lexer_conf.postlex  # Store the postlex separately
+        self.postlex: Union['PostLex', None] = lexer_conf.postlex  # Store the postlex separately
 
     def _verify_start(self, start=None):
         if start is None:
@@ -111,15 +112,15 @@ def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice,
         if self.skip_lexer:
             return text
 
-        cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-
-        thread = cls(self.lexer, text) if text is None else cls.from_text(self.lexer, text)
+        cls: Type[LexerThread]
 
         # If we have a postlex, wrap the thread
        if self.postlex is not None:
-            return PostLexThread(self.lexer, thread.state, self.postlex)
+            cls = PostLexThread
+            return cls(self.lexer, text, self.postlex) if text is None else cls.from_text(self.lexer, text, self.postlex)
 
-        return thread
+        cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
+        return cls(self.lexer, text) if text is None else cls.from_text(self.lexer, text)
 
     def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
         if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
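Note on PATCH 3: the @overload stack it adds to from_text lets a type checker tie the return type to the postlex argument, so a call without a post-lexer keeps the plain LexerThread type and a call with one gets PostLexThread. The same pattern in a self-contained form (all names below are illustrative, not part of Lark):

    from typing import Optional, Union, overload

    class Plain:
        """Stand-in for LexerThread."""

    class Wrapped(Plain):
        """Stand-in for PostLexThread."""
        def __init__(self, postlex: object) -> None:
            self.postlex = postlex

    @overload
    def make(postlex: None = None) -> Plain: ...
    @overload
    def make(postlex: object) -> Wrapped: ...

    def make(postlex: Optional[object] = None) -> Union[Plain, Wrapped]:
        # One runtime implementation; the overloads above exist only for
        # the type checker, which picks the precise return type per call.
        return Wrapped(postlex) if postlex is not None else Plain()

    plain = make()            # a checker infers Plain here
    wrapped = make(object())  # and Wrapped here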
From 4280441907c57cde7b9a2a9a44f9c147264a4a74 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 23 Apr 2025 08:32:02 +0300
Subject: [PATCH 4/8] Docs: Updated link of DSL article to a new version, with
 better formatting, and support for a dark theme

---
 README.md      | 2 +-
 docs/index.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 74710f713..88ebf1b93 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h
 - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf)
 - [Online IDE](https://lark-parser.org/ide)
 - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser.
-- Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/)
+- Blog post: [How to write a DSL with Lark](https://eshsoft.com/blog/write-dsl-in-python-with-lark)
 - [Gitter chat](https://gitter.im/lark-parser/Lobby)
 
 ### Install Lark
diff --git a/docs/index.rst b/docs/index.rst
index e69e2b9d0..1f7347e91 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -115,7 +115,7 @@ Resources
 .. _Examples: https://github.com/lark-parser/lark/tree/master/examples
 .. _Third-party examples: https://github.com/ligurio/lark-grammars
 .. _Online IDE: https://lark-parser.org/ide
-.. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/
+.. _How to write a DSL: https://eshsoft.com/blog/write-dsl-in-python-with-lark
 .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
 .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf
 .. _Gitter: https://gitter.im/lark-parser/Lobby
From f9ba191d2d3ca4e9d46f79b2fe50c078edebe9ae Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 23 Apr 2025 08:34:43 +0300
Subject: [PATCH 5/8] Upgrade pre-commit version

---
 .github/workflows/mypy.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
index 9624cd9cd..108c19c16 100644
--- a/.github/workflows/mypy.yml
+++ b/.github/workflows/mypy.yml
@@ -16,4 +16,4 @@ jobs:
     steps:
     - uses: actions/checkout@v3
     - uses: actions/setup-python@v3
-    - uses: pre-commit/action@v2.0.3
+    - uses: pre-commit/action@v3.0.1
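Across the series, the contract a post-lexer must satisfy is unchanged: it implements process(), receives the lexer's token iterator, and yields the (possibly rewritten) stream the parser consumes. A minimal custom post-lexer as a sketch, using the PostLex base class that these patches themselves import from lark.lark; the grammar and the uppercasing are illustrative only:

    from typing import Iterator
    from lark import Lark, Token
    from lark.lark import PostLex

    class UppercaseWords(PostLex):
        always_accept = ()  # no extra terminals need to bypass the parser

        def process(self, stream: Iterator[Token]) -> Iterator[Token]:
            for tok in stream:
                if tok.type == 'WORD':
                    # Rebuild the token; a bare Token(type, value) drops
                    # position info, which is acceptable for a sketch.
                    yield Token(tok.type, tok.value.upper())
                else:
                    yield tok

    parser = Lark(r'''
        start: WORD+
        WORD: /[a-z]+/
        %import common.WS
        %ignore WS
    ''', parser='lalr', postlex=UppercaseWords())

    print(parser.parse('hello world'))  # WORD tokens arrive uppercased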