Commit 656a7cc

Merge pull request #11 from Mathics3/tokenizer-documentation
Document the tokeniser
2 parents 295fe97 + dbae4b3

3 files changed: +99 -18 lines

mathics_scanner/errors.py

Lines changed: 6 additions & 2 deletions
@@ -3,17 +3,21 @@
 
 
 class TranslateError(Exception):
-    def __init__(self):
-        pass
+    """A generic class of tokenization errors"""
+    pass
 
 
 class ScanError(TranslateError):
+    """A generic scanning error"""
     pass
 
 
 class InvalidSyntaxError(TranslateError):
+    """Invalid syntax"""
     pass
 
 
 class IncompleteSyntaxError(TranslateError):
+    """More characters were expected to form a valid token"""
     pass
+
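
Since ``ScanError``, ``InvalidSyntaxError`` and ``IncompleteSyntaxError`` all derive from ``TranslateError``, callers can catch the whole family at once, or handle ``IncompleteSyntaxError`` separately, e.g. to ask for a continuation line. A minimal sketch of such a caller (``scan`` here is a hypothetical stand-in for whatever driver code runs the tokeniser): ::

    from mathics_scanner.errors import (
        IncompleteSyntaxError,
        InvalidSyntaxError,
        TranslateError,
    )

    def classify(scan):
        """Run `scan` and report how tokenization went."""
        try:
            scan()
        except IncompleteSyntaxError:
            return "need-more-input"  # e.g. prompt for a continuation line
        except InvalidSyntaxError:
            return "syntax-error"
        except TranslateError:
            return "other-tokenization-error"
        return "ok"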

mathics_scanner/feed.py

Lines changed: 35 additions & 3 deletions
@@ -8,7 +8,16 @@
 
 
 class LineFeeder(metaclass=ABCMeta):
+    """
+    An abstract representation for a feeder. The purpose of a feeder is to
+    mediate the consumption of characters between the tokeniser and the actual
+    file being scanned, as well as to store messages regarding tokenization errors.
+    """
     def __init__(self, filename):
+        """
+        @param: filename A string that describes the source of the feeder, i.e.
+        the filename that is being fed.
+        """
         self.messages = []
         self.lineno = 0
         self.filename = filename
@@ -29,13 +38,19 @@ def empty(self):
         return
 
     def message(self, sym, tag, *args):
+        """
+        Append a generic message of type ``sym`` to the message queue.
+        """
         if sym == "Syntax":
             message = self.syntax_message(sym, tag, *args)
         else:
             message = [sym, tag] + list(args)
         self.messages.append(message)
 
     def syntax_message(self, sym, tag, *args):
+        """
+        Append a message concerning syntax errors to the message queue.
+        """
         if len(args) > 3:
             raise ValueError("Too many args.")
         message = [sym, tag]
@@ -49,16 +64,22 @@ def syntax_message(self, sym, tag, *args):
         assert len(message) == 7
         return message
 
+    # TODO: Rethink this (this is only useful for core, not anyone else)
     def send_messages(self, evaluation):
         for message in self.messages:
             evaluation.message(*message)
         self.messages = []
 
 
 class MultiLineFeeder(LineFeeder):
-    "Feeds one line at a time."
+    "A feeder that feeds one line at a time."
 
     def __init__(self, lines, filename=""):
+        """
+        @param: lines The source of the feeder (a string).
+        @param: filename A string that describes the source of the feeder, i.e.
+        the filename that is being fed.
+        """
         super(MultiLineFeeder, self).__init__(filename)
         self.lineno = 0
         if isinstance(lines, str):
@@ -79,9 +100,14 @@ def empty(self):
 
 
 class SingleLineFeeder(LineFeeder):
-    "Feeds all the code as a single line."
+    "A feeder that feeds all the code as a single line."
 
     def __init__(self, code, filename=""):
+        """
+        @param: code The source of the feeder (a string).
+        @param: filename A string that describes the source of the feeder, i.e.
+        the filename that is being fed.
+        """
         super().__init__(filename)
         self.code = code
         self._empty = False
@@ -98,9 +124,14 @@ def empty(self):
 
 
 class FileLineFeeder(LineFeeder):
-    "Feeds lines from an open file object"
+    "A feeder that feeds lines from an open ``File`` object"
 
     def __init__(self, fileobject, trace_fn=None):
+        """
+        @param: fileobject The source of the feeder (an open file object).
+        @param: trace_fn An optional function used to trace the lines being
+        fed.
+        """
         super().__init__(fileobject.name)
         self.fileobject = fileobject
         self.lineno = 0
@@ -122,3 +153,4 @@ def feed(self):
 
     def empty(self):
         return self.eof
+
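
A rough usage sketch of the feeders documented above, assuming ``feed()`` returns the next chunk of source text and ``empty()`` reports exhaustion, as the docstrings suggest (neither method body appears in this hunk): ::

    from mathics_scanner.feed import MultiLineFeeder, SingleLineFeeder

    # Feed an entire program as one "line", tagged with a pseudo-filename.
    single = SingleLineFeeder("x = 1; y = 2", "<test>")

    # Feed the same program one line at a time.
    multi = MultiLineFeeder("x = 1\ny = 2\n", "<test>")
    while not multi.empty():
        print(repr(multi.feed()))  # assumed to yield successive lines

    # Tokenization errors are queued on the feeder rather than raised:
    multi.message("Syntax", "sntxf", "x = ", "1")
    print(multi.messages)  # drained later by send_messages(evaluation)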

mathics_scanner/tokeniser.py

Lines changed: 58 additions & 13 deletions
@@ -305,11 +305,22 @@ def compile_tokens(token_list):
 
 
 def is_symbol_name(text):
+    """
+    Returns ``True`` if ``text`` is a valid identifier. Otherwise returns
+    ``False``.
+    """
+    # Can't we just call match here?
     return full_symbol_pattern.sub("", text) == ""
 
 
 class Token(object):
+    "A representation of a Wolfram Language token"
     def __init__(self, tag, text, pos):
+        """
+        @param: tag A string that indicates which type of token this is.
+        @param: text The actual contents of the token.
+        @param: pos The position of the token in the input feed.
+        """
         self.tag = tag
         self.text = text
         self.pos = pos
@@ -326,28 +337,53 @@ def __repr__(self):
 
 
 class Tokeniser(object):
+    """
+    A tokeniser for the Wolfram Language.
+
+    When subclassing ``Tokeniser``, custom tokenisation rules can be defined by
+    declaring methods whose names are preceded by ``t_``, such as in the
+    following example: ::
+
+        class MyTokeniser(Tokeniser):
+            def t_MyWeirdRule(self, match):
+                # Your logic goes here...
+                pass
+
+    In this example, ``t_MyWeirdRule`` is supposed to update the internal state
+    of the tokeniser and return a ``Token`` with an appropriate tag. ``match``
+    is expected to be an instance of ``re.Match``.
+    """
     modes = {
         "expr": (tokens, token_indices),
         "filename": (filename_tokens, {}),
     }
 
     def __init__(self, feeder):
+        """
+        @param: feeder An instance of ``LineFeeder`` which will feed characters
+        to the tokeniser.
+        """
         self.pos = 0
         self.feeder = feeder
         self.prescanner = Prescanner(feeder)
         self.code = self.prescanner.scan()
-        self.change_mode("expr")
+        self._change_mode("expr")
 
-    def change_mode(self, mode):
+    def _change_mode(self, mode):
+        """
+        Set the mode of the tokeniser.
+        """
         self.mode = mode
         self.tokens, self.token_indices = self.modes[mode]
 
+    # TODO: Rename this to something that remotely makes sense?
     def incomplete(self):
-        "get more code from the prescanner and continue"
+        "Get more code from the prescanner and continue"
        self.prescanner.incomplete()
         self.code += self.prescanner.scan()
 
     def sntx_message(self, pos=None):
+        """Send a message to the feeder."""
         if pos is None:
             pos = self.pos
         pre, post = self.code[:pos], self.code[pos:].rstrip("\n")
@@ -356,9 +392,10 @@ def sntx_message(self, pos=None):
         else:
             self.feeder.message("Syntax", "sntxf", pre, post)
 
+    # TODO: Convert this to __next__ in the future?
     def next(self):
-        "return next token"
-        self.skip_blank()
+        "Returns the next token"
+        self._skip_blank()
         if self.pos >= len(self.code):
             return Token("END", "", len(self.code))
 
@@ -390,8 +427,8 @@ def next(self):
         self.pos = match.end(0)
         return Token(tag, text, match.start(0))
 
-    def skip_blank(self):
-        "skip whitespace and comments"
+    def _skip_blank(self):
+        "Skip whitespace and comments"
         comment = []  # start positions of comments
         while True:
             if self.pos >= len(self.code):
@@ -417,6 +454,7 @@ def skip_blank(self):
                 break
 
     def t_String(self, match):
+        "String rule"
         start, end = self.pos, None
         self.pos += 1  # skip opening '"'
         newlines = []
@@ -444,6 +482,7 @@ def t_String(self, match):
         return Token("String", result, start)
 
     def t_Number(self, match):
+        "Number rule"
         text = match.group(0)
         pos = match.end(0)
         if self.code[pos - 1 : pos + 1] == "..":
@@ -454,21 +493,27 @@ def t_Number(self, match):
         self.pos = pos
         return Token("Number", text, match.start(0))
 
-    def token_mode(self, match, tag, mode):
+    # This isn't used outside of this class, so it's considered internal
+    def _token_mode(self, match, tag, mode):
         "consume a token and switch mode"
         text = match.group(0)
         self.pos = match.end(0)
-        self.change_mode(mode)
+        self._change_mode(mode)
         return Token(tag, text, match.start(0))
 
     def t_Get(self, match):
-        return self.token_mode(match, "Get", "filename")
+        "Get rule"
+        return self._token_mode(match, "Get", "filename")
 
     def t_Put(self, match):
-        return self.token_mode(match, "Put", "filename")
+        "Put rule"
+        return self._token_mode(match, "Put", "filename")
 
     def t_PutAppend(self, match):
-        return self.token_mode(match, "PutAppend", "filename")
+        "PutAppend rule"
+        return self._token_mode(match, "PutAppend", "filename")
 
     def t_Filename(self, match):
-        return self.token_mode(match, "Filename", "expr")
+        "Filename rule"
+        return self._token_mode(match, "Filename", "expr")
+
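
Putting the pieces together, a hedged end-to-end sketch of driving the documented API: build a feeder, hand it to ``Tokeniser``, and call ``next()`` until the ``END`` token appears (operator tag names depend on the token table, which this diff does not touch): ::

    from mathics_scanner.feed import SingleLineFeeder
    from mathics_scanner.tokeniser import Tokeniser

    feeder = SingleLineFeeder("1 + f[x]", "<example>")
    tokeniser = Tokeniser(feeder)

    tokens = []
    while True:
        token = tokeniser.next()
        if token.tag == "END":  # next() returns an END token at end of input
            break
        tokens.append((token.tag, token.text, token.pos))

    # Each entry is (tag, text, position), e.g. ("Number", "1", 0) first.
    print(tokens)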
