Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/mypy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
- uses: pre-commit/action@v2.0.3
- uses: pre-commit/action@v3.0.1
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h
- [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf)
- [Online IDE](https://lark-parser.org/ide)
- [Tutorial](/docs/json_tutorial.md) for writing a JSON parser.
- Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/)
- Blog post: [How to write a DSL with Lark](https://eshsoft.com/blog/write-dsl-in-python-with-lark)
- [Gitter chat](https://gitter.im/lark-parser/Lobby)

### Install Lark
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ Resources
.. _Examples: https://github.com/lark-parser/lark/tree/master/examples
.. _Third-party examples: https://github.com/ligurio/lark-grammars
.. _Online IDE: https://lark-parser.org/ide
.. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/
.. _How to write a DSL: https://eshsoft.com/blog/write-dsl-in-python-with-lark
.. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
.. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf
.. _Gitter: https://gitter.im/lark-parser/Lobby
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/tree_forest_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def adj(self, children):
return Discard

def __default_token__(self, token):
return token.capitalize()
return token.value.capitalize()

grammar = """
sentence: noun verb noun -> simple
Expand Down
2 changes: 1 addition & 1 deletion lark/indenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def handle_NL(self, token: Token) -> Iterator[Token]:

yield token

indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
indent_str = token.value.rsplit('\n', 1)[1] # Tabs and spaces
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

if indent > self.indent_level[-1]:
Expand Down
34 changes: 33 additions & 1 deletion lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from contextlib import suppress
from typing import (
TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
ClassVar, TYPE_CHECKING, overload
ClassVar, TYPE_CHECKING, overload, Union
)
from types import ModuleType
import warnings
Expand All @@ -16,6 +16,7 @@
if TYPE_CHECKING:
from .common import LexerConf
from .parsers.lalr_parser_state import ParserState
from .lark import PostLex

from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
Expand Down Expand Up @@ -468,6 +469,37 @@ def __copy__(self):
_Token = Token


class PostLexThread(LexerThread):
    """A LexerThread whose token stream is piped through a PostLex processor
    (e.g. an indenter) before reaching the parser."""

    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState], postlex: 'PostLex'):
        super().__init__(lexer, lexer_state)
        self.postlex = postlex

    @overload
    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: None = None) -> 'LexerThread':
        pass

    @overload
    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: 'PostLex') -> 'PostLexThread':
        pass

    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice, postlex: Union['PostLex', None] = None) -> Union['LexerThread', 'PostLexThread']:
        """Create a thread from raw text; without a postlex, fall back to a
        plain LexerThread (as the overloads declare)."""
        if postlex is None:
            # Bug fix: ``super().from_text(...)`` keeps ``cls`` bound to
            # PostLexThread, so the base implementation's ``cls(lexer, state)``
            # would call PostLexThread.__init__ without the required ``postlex``
            # argument and raise TypeError. Delegate to LexerThread explicitly
            # so a plain LexerThread is constructed.
            return LexerThread.from_text(lexer, text_or_slice)
        text = TextSlice.cast_from(text_or_slice)
        return cls(lexer, LexerState(text), postlex)

    def lex(self, parser_state):
        """Yield tokens from the underlying lexer, post-processed by self.postlex."""
        tokens = super().lex(parser_state)
        return self.postlex.process(tokens)

    def __copy__(self):
        # The postlex instance is deliberately shared, not copied: it is
        # presumed to be a stateless processor of token streams — TODO confirm
        # this holds for any stateful PostLex implementations.
        return type(self)(self.lexer, copy(self.state), self.postlex)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that the postlexer shouldn't be copied too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say so — I see it as a kind of processor; it holds no data per se about the current state. I could be wrong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although, tbh, those shouldn't be instance attributes, but local variables inside of process. I think this postlexer design should already be broken on the current main if this copy method is called since lexer=PostLexConnector doesn't get copied either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see — I agree those don't really make sense as instance attributes. A postlexer instance should be agnostic to any particular stream of tokens. This matter is probably beyond the scope of this PR, though.



_Callback = Callable[[Token], Token]

class Lexer(ABC):
Expand Down
29 changes: 15 additions & 14 deletions lark/parser_frontends.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, Type

from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, PostLexThread
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

if TYPE_CHECKING:
from .parsers.lalr_analysis import ParseTableBase
from .lark import PostLex


###{standalone
Expand Down Expand Up @@ -95,8 +96,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
else:
raise TypeError("Bad value for lexer_type: {lexer_type}")

if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
self.postlex: Union['PostLex', None] = lexer_conf.postlex # Store the postlex separately

def _verify_start(self, start=None):
if start is None:
Expand All @@ -109,8 +109,18 @@ def _verify_start(self, start=None):
return start

def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
    """Build the lexer-thread object that will feed tokens to the parser.

    Returns the raw text unchanged when ``self.skip_lexer`` is set (the
    scannerless/dynamic lexers consume text directly); otherwise a
    LexerThread — or a PostLexThread when a postlexer is configured —
    wrapping ``self.lexer``. ``text=None`` yields an empty/deferred thread.
    """
    if self.skip_lexer:
        return text

    cls: Type[LexerThread]

    # A configured postlexer takes priority over any 'LexerThread' plugin.
    if self.postlex is not None:
        cls = PostLexThread
        return cls(self.lexer, text, self.postlex) if text is None else cls.from_text(self.lexer, text, self.postlex)

    cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
    # NOTE: the paste retained both the pre- and post-change return lines;
    # only the updated one is kept — the stale line was unreachable residue.
    return cls(self.lexer, text) if text is None else cls.from_text(self.lexer, text)

def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
Expand Down Expand Up @@ -151,15 +161,6 @@ def _get_lexer_callbacks(transformer, terminals):
result[terminal.name] = callback
return result

class PostLexConnector:
    """Adapter that feeds a lexer's token stream through a post-lexer.

    Exposes the same ``lex(lexer_state, parser_state)`` interface as the
    wrapped lexer, but every token stream it produces is first run through
    ``postlexer.process``.
    """

    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer

    def lex(self, lexer_state, parser_state):
        """Lex with the wrapped lexer, then post-process the resulting stream."""
        token_stream = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(token_stream)



def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
Expand Down