Skip to content

Commit 1f6f03c

Browse files
authored
Merge pull request #1562 from lark-parser/issue1560
Bugfix: Restore support for custom input, alongside text and TextSlice
2 parents e332c2d + 7c365de commit 1f6f03c

File tree

5 files changed

+66
-21
lines changed

5 files changed

+66
-21
lines changed

lark/lark.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
from .parser_frontends import ParsingFrontend
1717

1818
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
19-
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
19+
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice, LarkInput
2020
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
2121
from .tree import Tree
2222
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
@@ -637,11 +637,11 @@ def get_terminal(self, name: str) -> TerminalDef:
637637
"""Get information about a terminal"""
638638
return self._terminals_dict[name]
639639

640-
def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[str]=None) -> 'InteractiveParser':
640+
def parse_interactive(self, text: Optional[LarkInput]=None, start: Optional[str]=None) -> 'InteractiveParser':
641641
"""Start an interactive parsing session. Only works when parser='lalr'.
642642
643643
Parameters:
644-
text (TextOrSlice, optional): Text to be parsed. Required for ``resume_parse()``.
644+
text (LarkInput, optional): Text to be parsed. Required for ``resume_parse()``.
645645
start (str, optional): Start symbol
646646
647647
Returns:
@@ -651,12 +651,13 @@ def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[st
651651
"""
652652
return self.parser.parse_interactive(text, start=start)
653653

654-
def parse(self, text: TextOrSlice, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
654+
def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
655655
"""Parse the given text, according to the options provided.
656656
657657
Parameters:
658-
text (TextOrSlice): Text to be parsed, as `str` or `bytes`.
658+
text (LarkInput): Text to be parsed, as `str` or `bytes`.
659659
TextSlice may also be used, but only when lexer='basic' or 'contextual'.
660+
If Lark was created with a custom lexer, this may be an object of any type.
660661
start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
661662
on_error (function, optional): if provided, will be called on UnexpectedInput error,
662663
with the exception as its argument. Return true to resume parsing, or false to raise the exception.

lark/lexer.py

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -419,15 +419,16 @@ class LexerState:
419419
last_token: Optional[Token]
420420

421421
def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
422-
if line_ctr is None:
423-
line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
422+
if isinstance(text, TextSlice):
423+
if line_ctr is None:
424+
line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
424425

425-
if text.start > 0:
426-
# Advance the line-count until line_ctr.char_pos == text.start
427-
line_ctr.feed(TextSlice(text.text, 0, text.start))
426+
if text.start > 0:
427+
# Advance the line-count until line_ctr.char_pos == text.start
428+
line_ctr.feed(TextSlice(text.text, 0, text.start))
428429

429-
if not (text.start <= line_ctr.char_pos <= text.end):
430-
raise ValueError("LineCounter.char_pos is out of bounds")
430+
if not (text.start <= line_ctr.char_pos <= text.end):
431+
raise ValueError("LineCounter.char_pos is out of bounds")
431432

432433
self.text = text
433434
self.line_ctr = line_ctr
@@ -457,6 +458,10 @@ def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
457458
text = TextSlice.cast_from(text_or_slice)
458459
return cls(lexer, LexerState(text))
459460

461+
@classmethod
462+
def from_custom_input(cls, lexer: 'Lexer', text: Any) -> 'LexerThread':
463+
return cls(lexer, LexerState(text))
464+
460465
def lex(self, parser_state):
461466
if self.state is None:
462467
raise TypeError("Cannot lex: No text assigned to lexer state")

lark/parser_frontends.py

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
22

33
from .exceptions import ConfigurationError, GrammarError, assert_config
4-
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
4+
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice, LarkInput
55
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
66
from .parsers import earley, xearley, cyk
77
from .parsers.lalr_parser import LALR_Parser
@@ -23,7 +23,7 @@ class CustomLexerWrapper1(Lexer):
2323
def __init__(self, lexer_conf):
2424
self.lexer = lexer_class(lexer_conf)
2525
def lex(self, lexer_state, parser_state):
26-
if not lexer_state.text.is_complete_text():
26+
if isinstance(lexer_state.text, TextSlice) and not lexer_state.text.is_complete_text():
2727
raise TypeError("Interface=1 Custom Lexer don't support TextSlice")
2828
lexer_state.text = lexer_state.text
2929
return self.lexer.lex(lexer_state, parser_state)
@@ -34,9 +34,11 @@ def __init__(self, lexer_conf):
3434
self.lexer = lexer_class(lexer_conf)
3535

3636
def lex(self, lexer_state, parser_state):
37-
if not lexer_state.text.is_complete_text():
38-
raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
39-
return self.lexer.lex(lexer_state.text.text)
37+
if isinstance(lexer_state.text, TextSlice):
38+
if not lexer_state.text.is_complete_text():
39+
raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
40+
return self.lexer.lex(lexer_state.text.text)
41+
return self.lexer.lex(lexer_state.text)
4042
return CustomLexerWrapper0
4143
else:
4244
raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected")
@@ -108,11 +110,17 @@ def _verify_start(self, start=None):
108110
raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
109111
return start
110112

111-
def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
113+
def _make_lexer_thread(self, text: Optional[LarkInput]) -> Union[LarkInput, LexerThread, None]:
112114
cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
113-
return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text)
114-
115-
def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
115+
if self.skip_lexer:
116+
return text
117+
if text is None:
118+
return cls(self.lexer, None)
119+
if isinstance(text, (str, bytes, TextSlice)):
120+
return cls.from_text(self.lexer, text)
121+
return cls.from_custom_input(self.lexer, text)
122+
123+
def parse(self, text: Optional[LarkInput], start=None, on_error=None):
116124
if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
117125
if isinstance(text, TextSlice) and not text.is_complete_text():
118126
raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.")

lark/utils.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -226,6 +226,7 @@ def rindex(self, substr: AnyStr):
226226

227227

228228
TextOrSlice = Union[AnyStr, 'TextSlice[AnyStr]']
229+
LarkInput = Union[AnyStr, TextSlice[AnyStr], Any]
229230

230231
###}
231232

tests/test_parser.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -379,6 +379,36 @@ def test_lexer_token_limit(self):
379379

380380
p = Lark(g, parser='lalr')
381381

382+
def test_custom_input(self):
383+
class TypeLexer(Lexer):
384+
def __init__(self, lexer_conf):
385+
pass
386+
387+
def lex(self, data):
388+
for obj in data:
389+
t = type(obj).__name__.upper()
390+
yield Token(t, obj)
391+
392+
parser = Lark("""
393+
start: data_item+
394+
data_item: STR INT*
395+
396+
%declare STR INT
397+
""", parser='lalr', lexer=TypeLexer)
398+
399+
400+
class ParseToDict(Transformer):
401+
@v_args(inline=True)
402+
def data_item(self, name, *numbers):
403+
return name.value, [n.value for n in numbers]
404+
405+
start = dict
406+
407+
data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]
408+
tree = parser.parse(data)
409+
res = ParseToDict().transform(tree)
410+
assert res == {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}
411+
382412

383413

384414
def _make_full_earley_test(LEXER):

0 commit comments

Comments
 (0)