Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/mypy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
- uses: pre-commit/action@v2.0.3
- uses: pre-commit/action@v3.0.1
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h
- [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf)
- [Online IDE](https://lark-parser.org/ide)
- [Tutorial](/docs/json_tutorial.md) for writing a JSON parser.
- Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/)
- Blog post: [How to write a DSL with Lark](https://eshsoft.com/blog/write-dsl-in-python-with-lark)
- [Gitter chat](https://gitter.im/lark-parser/Lobby)

### Install Lark
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ Resources
.. _Examples: https://github.com/lark-parser/lark/tree/master/examples
.. _Third-party examples: https://github.com/ligurio/lark-grammars
.. _Online IDE: https://lark-parser.org/ide
.. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/
.. _How to write a DSL: https://eshsoft.com/blog/write-dsl-in-python-with-lark
.. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
.. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf
.. _Gitter: https://gitter.im/lark-parser/Lobby
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/tree_forest_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def adj(self, children):
return Discard

def __default_token__(self, token):
return token.capitalize()
return token.value.capitalize()

grammar = """
sentence: noun verb noun -> simple
Expand Down
2 changes: 1 addition & 1 deletion lark/indenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def handle_NL(self, token: Token) -> Iterator[Token]:

yield token

indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
indent_str = token.value.rsplit('\n', 1)[1] # Tabs and spaces
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

if indent > self.indent_level[-1]:
Expand Down
34 changes: 33 additions & 1 deletion lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from contextlib import suppress
from typing import (
TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
ClassVar, TYPE_CHECKING, overload
ClassVar, TYPE_CHECKING, overload, Union
)
from types import ModuleType
import warnings
Expand All @@ -16,6 +16,7 @@
if TYPE_CHECKING:
from .common import LexerConf
from .parsers.lalr_parser_state import ParserState
from .lark import PostLex

from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
Expand Down Expand Up @@ -468,6 +469,37 @@ def __copy__(self):
_Token = Token


class PostLexThread(LexerThread):
    """A LexerThread whose token stream is piped through a PostLex processor
    (e.g. an indenter) before reaching the parser."""

    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState], postlex: 'PostLex'):
        super().__init__(lexer, lexer_state)
        self.postlex = postlex

    @overload
    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: None = None) -> 'LexerThread':
        pass

    @overload
    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: 'PostLex') -> 'PostLexThread':
        pass

    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: Union['PostLex', None] = None) -> Union['LexerThread', 'PostLexThread']:
        """Create a thread from raw text; without a postlex, fall back to a
        plain LexerThread (as the overloads declare)."""
        if postlex is None:
            # Bug fix: ``super().from_text(...)`` keeps ``cls`` bound to
            # PostLexThread, so the base implementation's ``cls(lexer, state)``
            # would call PostLexThread.__init__ without the required ``postlex``
            # argument and raise TypeError. Delegate to LexerThread explicitly
            # so a plain LexerThread is constructed.
            return LexerThread.from_text(lexer, text_or_slice)
        text = TextSlice.cast_from(text_or_slice)
        return cls(lexer, LexerState(text), postlex)

    def lex(self, parser_state):
        """Yield tokens from the underlying lexer, post-processed by self.postlex."""
        tokens = super().lex(parser_state)
        return self.postlex.process(tokens)

    def __copy__(self):
        # The postlex instance is deliberately shared, not copied: it is
        # presumed to be a stateless processor of token streams — TODO confirm
        # this holds for any stateful PostLex implementations.
        return type(self)(self.lexer, copy(self.state), self.postlex)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that the postlexer shouldn't be copied too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say so — I see it as a kind of processor; it holds no data per se about the current state. I could be wrong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although, tbh, those shouldn't be instance attributes, but local variables inside of process. I think this postlexer design should already be broken on the current main if this copy method is called since lexer=PostLexConnector doesn't get copied either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see — I agree those don't really make sense as instance attributes. A postlexer instance should be agnostic to any particular stream of tokens. This matter is probably beyond the scope of this PR, though.



_Callback = Callable[[Token], Token]

class Lexer(ABC):
Expand Down
29 changes: 15 additions & 14 deletions lark/parser_frontends.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, Type

from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, PostLexThread
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

if TYPE_CHECKING:
from .parsers.lalr_analysis import ParseTableBase
from .lark import PostLex


###{standalone
Expand Down Expand Up @@ -95,8 +96,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
else:
raise TypeError("Bad value for lexer_type: {lexer_type}")

if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
self.postlex: Union['PostLex', None] = lexer_conf.postlex # Store the postlex separately

def _verify_start(self, start=None):
if start is None:
Expand All @@ -109,8 +109,18 @@ def _verify_start(self, start=None):
return start

def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
    """Build the lexer-thread object that will feed tokens to the parser.

    Returns the raw text unchanged when ``self.skip_lexer`` is set (the
    scannerless/dynamic lexers consume text directly); otherwise a
    LexerThread — or a PostLexThread when a postlexer is configured —
    wrapping ``self.lexer``. ``text=None`` yields an empty/deferred thread.
    """
    if self.skip_lexer:
        return text

    cls: Type[LexerThread]

    # A configured postlexer takes priority over any 'LexerThread' plugin.
    if self.postlex is not None:
        cls = PostLexThread
        return cls(self.lexer, text, self.postlex) if text is None else cls.from_text(self.lexer, text, self.postlex)

    cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
    # NOTE: the paste retained both the pre- and post-change return lines;
    # only the updated one is kept — the stale line was unreachable residue.
    return cls(self.lexer, text) if text is None else cls.from_text(self.lexer, text)

def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
Expand Down Expand Up @@ -151,15 +161,6 @@ def _get_lexer_callbacks(transformer, terminals):
result[terminal.name] = callback
return result

class PostLexConnector:
    """Adapter that feeds a lexer's token stream through a post-lexer.

    Exposes the same ``lex(lexer_state, parser_state)`` interface as the
    wrapped lexer, but every token stream it produces is first run through
    ``postlexer.process``.
    """

    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer

    def lex(self, lexer_state, parser_state):
        """Lex with the wrapped lexer, then post-process the resulting stream."""
        token_stream = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(token_stream)



def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
Expand Down