tokenizer.py
# coding=utf-8
"""Tokenizer for a small toy language: splits a source string into Tokens."""
from enum import Enum, auto
import re


class TokenError(Exception):
    """Raised when the stream contains input the tokenizer cannot handle."""
    pass


class TokenType(Enum):
    STR = auto()
    NUM = auto()
    ASSIGN = auto()
    ADD = auto()
    SUB = auto()
    MUL = auto()
    DIV = auto()
    LPAR = auto()
    RPAR = auto()
    LBLK = auto()
    RBLK = auto()
    THEN = auto()
    ELSE = auto()
    EXP = auto()
    ID = auto()
    TERM = auto()
    GT = auto()
    LT = auto()
    EQ = auto()
    GE = auto()
    LE = auto()
    EOF = auto()
    AND = auto()
    OR = auto()
    NOT = auto()
    XOR = auto()
    EOL = auto()
    LARG = auto()
    RARG = auto()
    SEPARG = auto()


class Token:
    def __init__(self, tok: TokenType, val=None):
        self.type = tok
        self.value = val
        self.tokenList = []

    def __str__(self):
        return f'{self.type.name}{("(" + str(self.value) + ")") if self.value is not None else ""}'

    __repr__ = __str__
class Tokenizer:
    def __init__(self, stream: str, patterns: dict = None):
        self.stream = stream
        self.patterns = patterns

    def tokens(self, debug=False):
        """Yield Tokens from the stream, ending with a single EOF token."""
        def match(tok, string, capture=False):
            # Consume `string` from the head of the stream if it is there.
            if self.stream.startswith(string):
                self.stream = self.stream[len(string):]
                return Token(tok, string) if capture else Token(tok)
            return False

        while True:
            # Strip leading whitespace each pass; stop once only whitespace
            # remains, so the indexing below never sees an empty stream.
            self.stream = self.stream.lstrip()
            if not self.stream:
                break
            if debug:
                input()
                print('###' + self.stream)
            if self.stream[0].isdigit():
                # Integer literal.
                number = re.match(r'\d+', self.stream).group()
                self.stream = self.stream[len(number):]
                yield Token(TokenType.NUM, int(number))
            elif self.stream[0] in "'\"":
                # String literal delimited by matching quotes; re.DOTALL lets
                # the literal span newlines.
                quote = self.stream[0]
                matched = re.match(rf'{quote}(.*?){quote}', self.stream, re.DOTALL)
                if matched is None:
                    raise TokenError(f'unterminated string literal: {self.stream!r}')
                string = matched.group(1)
                self.stream = self.stream[len(string) + 2:]
                yield Token(TokenType.STR, string)
            elif self.stream[0].isalpha():
                # Identifier: a run of ASCII letters.
                ident_match = re.match(r'[A-Za-z]+', self.stream)
                if ident_match is None:  # a non-ASCII letter passed isalpha()
                    raise TokenError(f'cannot read identifier at: {self.stream!r}')
                ident = ident_match.group()
                self.stream = self.stream[len(ident):]
                yield Token(TokenType.ID, ident)
            else:
                # Operators and punctuation, longest patterns first so that
                # e.g. '>=' is not read as '>' followed by '='.
                token = (match(TokenType.ASSIGN, ':-') or
                         match(TokenType.LBLK, '{{') or
                         match(TokenType.RBLK, '}}') or
                         match(TokenType.THEN, '=>') or
                         match(TokenType.ELSE, '!!') or
                         match(TokenType.OR, '||') or
                         match(TokenType.AND, '&&') or
                         match(TokenType.GE, '>=') or
                         match(TokenType.LE, '<=') or
                         match(TokenType.LARG, ':{') or
                         match(TokenType.NOT, '~') or
                         match(TokenType.XOR, '$') or
                         match(TokenType.ADD, '+') or
                         match(TokenType.SUB, '-') or
                         match(TokenType.MUL, '*') or
                         match(TokenType.DIV, '/') or
                         match(TokenType.LPAR, '(') or
                         match(TokenType.RPAR, ')') or
                         match(TokenType.EXP, '^') or
                         match(TokenType.TERM, '.') or
                         match(TokenType.GT, '>') or
                         match(TokenType.LT, '<') or
                         match(TokenType.EQ, '=') or
                         match(TokenType.RARG, '}') or
                         match(TokenType.SEPARG, ';') or
                         # Assumption: ',' also separates arguments, as in the
                         # demo below; the original list handled only ';'.
                         match(TokenType.SEPARG, ','))
                if not token:
                    raise TokenError(f'unexpected character: {self.stream[0]!r}')
                yield token
        yield Token(TokenType.EOF)
if __name__ == '__main__':
    print(Token(TokenType.NUM, 5))
    print(Token(TokenType.LPAR))
    tok = Tokenizer('(x>3) => {{ number :- 4; }} !! {{ number :- 7; }}')
    print(list(tok.tokens()))
    tok = Tokenizer('print:{"Hello", x}')
    print(list(tok.tokens()))
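    # Small extra demo (a sketch of the error path, not from the original
    # file): '<=' tokenizes as LE, and an unknown character such as '?'
    # raises TokenError rather than leaving the loop stuck.
    print(list(Tokenizer('x <= 3').tokens()))
    try:
        print(list(Tokenizer('x ? 3').tokens()))
    except TokenError as err:
        print('TokenError:', err)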