|
| 1 | +# Interpreter |
| 2 | + |
| 3 | +A component that turns structured text into lexical tokens (lexing) and then interprets sequences of those tokens (parsing). |
| 4 | + |
| 5 | +## Motivation |
| 6 | + |
| 7 | +- Convert textual content into a certain structure, such as an executable or an object in memory. |
| 8 | + |
| 9 | +Examples using Interpreter pattern: |
| 10 | + |
| 11 | +- Expression parsing: `1 + 2 / 3` |
| 12 | +- Regular expression parsing |
| 13 | +- Structured markup language parsing |
| 14 | +- Syntax tree parsing |
| 15 | + |
| 16 | +## Numeric Expression |
| 17 | + |
| 18 | +### Lexing |
| 19 | + |
| 20 | +Lexing means splitting textual content into separate parts according to certain rules. |
| 21 | +Lexing results are called **tokens**. |
| 22 | + |
| 23 | +```cs |
| 24 | +using System.Text; |
| 25 | + |
// Lex a sample expression and print every token, separated by single spaces.
Console.WriteLine(string.Join(' ', Lex("1 + 3 - 2 = 2")));
| 28 | + |
/// <summary>
/// Splits a numeric expression into a flat list of tokens.
/// </summary>
/// <remarks>
/// Recognized characters are <c>+</c>, <c>-</c>, <c>(</c>, <c>)</c>, <c>=</c> and runs of digits.
/// A run of consecutive digits becomes a single <see cref="Token.Type.Int"/> token.
/// Every other character (for example a space) is skipped.
/// </remarks>
/// <param name="expr">The expression text to tokenize.</param>
/// <returns>The tokens in the order they appear in <paramref name="expr"/>.</returns>
static List<Token> Lex(ReadOnlySpan<char> expr)
{
    List<Token> ret = [];
    for (var i = 0; i < expr.Length; i++)
    {
        switch (expr[i])
        {
            case '+':
                ret.Add(new Token { Content = expr[i].ToString(), TokenType = Token.Type.Plus });
                break;
            case '-':
                ret.Add(new Token { Content = expr[i].ToString(), TokenType = Token.Type.Minus });
                break;
            case '(':
                ret.Add(new Token { Content = expr[i].ToString(), TokenType = Token.Type.LeftParentsis });
                break;
            case ')':
                ret.Add(new Token { Content = expr[i].ToString(), TokenType = Token.Type.RightParentsis });
                break;
            case '=':
                ret.Add(new Token { Content = expr[i].ToString(), TokenType = Token.Type.Equal });
                break;
            case var c when char.IsDigit(c):
                // Consume the whole run of digits. `i` is left on the LAST digit of the run,
                // so the outer loop's i++ moves on to the first character after the number.
                var start = i;
                while (i + 1 < expr.Length && char.IsDigit(expr[i + 1]))
                {
                    i++;
                }

                // Always emit the number, including when the run reaches the end of the input.
                ret.Add(new Token { Content = expr[start..(i + 1)].ToString(), TokenType = Token.Type.Int });
                break;
            default: // ignore spaces
                break;
        }
    }

    return ret;
}
| 80 | + |
/// <summary>
/// A single lexical token: its raw text plus the category it belongs to.
/// </summary>
class Token
{
    /// <summary>The raw text of the token, or <c>null</c> if it was never set.</summary>
    public string? Content { get; set; }

    /// <summary>The category of the token.</summary>
    public Type TokenType { get; set; }

    /// <summary>Returns <see cref="Content"/> wrapped in single quotes.</summary>
    public override string ToString()
    {
        return "'" + Content + "'";
    }

    /// <summary>Categories of token.</summary>
    public enum Type
    {
        Int,
        LeftParentsis,
        RightParentsis,
        Plus,
        Minus,
        Equal
    }
}
| 93 | +``` |
| 94 | + |
| 95 | +### Parsing |
| 96 | + |
| 97 | +Now we're going to parse these tokens into real values since they're numeric. |
| 98 | + |
| 99 | +Here we use the composite pattern so that all items in the expression share the same form. |
| 100 | +A getter represents the evaluation of an expression element. |
| 101 | + |
| 102 | +```cs |
/// <summary>
/// Common shape of every element of a numeric expression.
/// </summary>
/// <typeparam name="TValue">The numeric type the element evaluates to.</typeparam>
interface INumericExpressionElement<TValue>
    where TValue : INumber<TValue>
{
    /// <summary>The result of evaluating this element.</summary>
    TValue Value { get; } // [!code highlight]
}
| 107 | +``` |
| 108 | + |
| 109 | +There are two kinds of expression elements: |
| 110 | + |
| 111 | +- numeric literal like `123` |
| 112 | +- sub expressions like `(1 + 2)` in `(1 + 2) - (1 + 3)` |
| 113 | + |
| 114 | +A numeric literal can be wrapped as any type that implements `INumber<T>`: |
| 115 | +```cs |
/// <summary>
/// A numeric literal; evaluating it returns the stored value.
/// </summary>
/// <typeparam name="TValue">The numeric type of the literal.</typeparam>
class Number<TValue> : INumericExpressionElement<TValue>
    where TValue : INumber<TValue>
{
    /// <summary>The literal value; must be supplied in the object initializer.</summary>
    public required TValue Value { get; init; }
}
| 120 | +``` |
| 121 | + |
| 122 | +Sub-expressions come in many kinds, such as binary operations, unary operations and more... |
| 123 | +**In this example we only handle binary operations.** |
| 124 | + |
| 125 | +```cs |
/// <summary>The binary operations supported by <see cref="BinaryOperation{TValue}"/>.</summary>
enum BinaryOperationType
{
    Addition,
    Substraction
}

/// <summary>
/// An expression element that combines a left and a right operand with a binary operator.
/// </summary>
/// <typeparam name="TValue">The numeric type the operation evaluates to.</typeparam>
class BinaryOperation<TValue> : INumericExpressionElement<TValue>
    where TValue : INumber<TValue>
{
    /// <summary>Creates an operation from both operands and the operator.</summary>
    public BinaryOperation(
        INumericExpressionElement<TValue> left,
        INumericExpressionElement<TValue> right,
        BinaryOperationType type)
    {
        Left = left;
        Right = right;
        Type = type;
    }

    /// <summary>
    /// Creates an operation with default (unset) operands and the default operator;
    /// the properties are expected to be assigned afterwards.
    /// </summary>
    public BinaryOperation() : this(default!, default!, default) { }

    /// <summary>The left-hand operand.</summary>
    public INumericExpressionElement<TValue> Left { get; set; }

    /// <summary>The right-hand operand.</summary>
    public INumericExpressionElement<TValue> Right { get; set; }

    /// <summary>The operator applied to the two operands.</summary>
    public BinaryOperationType Type { get; set; }

    /// <summary>
    /// Evaluates both operands and applies the operator.
    /// An operator value outside the known enum members evaluates to zero.
    /// </summary>
    public TValue Value
    {
        get
        {
            switch (Type) // [!code highlight]
            {
                case BinaryOperationType.Addition:
                    return Left.Value + Right.Value; // [!code highlight]
                case BinaryOperationType.Substraction:
                    return Left.Value - Right.Value; // [!code highlight]
                default:
                    return TValue.Zero; // [!code highlight]
            }
        }
    }
}
| 149 | +``` |
| 150 | + |
| 151 | +Then we do the parsing; the implementation is quite similar to lexing. |
| 152 | + |
| 153 | +```cs |
/// <summary>
/// Builds an evaluable expression tree from a sequence of lexed tokens.
/// </summary>
/// <remarks>
/// Operators are applied from left to right, so <c>1 + 3 - 2</c> is parsed as <c>(1 + 3) - 2</c>.
/// A parenthesized group is parsed recursively and may itself contain nested groups.
/// An expression consisting of a single operand is returned as that operand.
/// Only <c>+</c> and <c>-</c> between operands are supported; any other token
/// (for example <c>=</c> or a stray <c>)</c>) is rejected.
/// </remarks>
/// <typeparam name="TValue">The numeric type that integer tokens are parsed into.</typeparam>
/// <param name="tokens">The tokens of one expression.</param>
/// <returns>The root element of the parsed expression.</returns>
/// <exception cref="InvalidOperationException">
/// The tokens are empty, contain an unsupported token, have unbalanced parentheses,
/// or do not alternate between operands and operators.
/// </exception>
static INumericExpressionElement<TValue> Parse<TValue>(ReadOnlySpan<Token> tokens) where TValue : INumber<TValue>
{
    // The expression built so far; null until the first operand has been read.
    INumericExpressionElement<TValue>? result = null;
    // The operator that is still waiting for its right-hand operand.
    BinaryOperationType? pending = null;

    for (int i = 0; i < tokens.Length; i++)
    {
        Token token = tokens[i];
        INumericExpressionElement<TValue> operand;

        switch (token.TokenType)
        {
            case Token.Type.Int:
                operand = new Number<TValue>
                {
                    Value = TValue.Parse(token.Content.AsSpan(), NumberStyles.Number, CultureInfo.InvariantCulture)
                };
                break;

            case Token.Type.Plus:
            case Token.Type.Minus:
                // A binary operator needs a left operand and must not follow another operator.
                if (result is null || pending is not null)
                {
                    throw new InvalidOperationException($"Operator {token} must follow an operand.");
                }

                pending = token.TokenType is Token.Type.Plus
                    ? BinaryOperationType.Addition
                    : BinaryOperationType.Substraction;
                continue; // nothing to combine until the right operand arrives

            case Token.Type.LeftParentsis:
                // Find the parenthesis that closes THIS group, counting nested groups on the way.
                int depth = 1;
                int close = i + 1;
                for (; close < tokens.Length; close++)
                {
                    Token.Type kind = tokens[close].TokenType;
                    if (kind is Token.Type.LeftParentsis)
                    {
                        depth++;
                    }
                    else if (kind is Token.Type.RightParentsis && --depth == 0)
                    {
                        break;
                    }
                }

                if (close == tokens.Length)
                {
                    throw new InvalidOperationException("Unbalanced parentheses: missing ')'.");
                }

                // Parse the tokens between the parentheses recursively.
                operand = Parse<TValue>(tokens[(i + 1)..close]); // [!code highlight]
                i = close; // skip the whole group for the next iteration // [!code highlight]
                break;

            default:
                throw new InvalidOperationException($"Unexpected token {token}.");
        }

        if (result is null)
        {
            result = operand;
        }
        else if (pending is BinaryOperationType op)
        {
            // Fold the new operand into the tree so chained operations associate to the left.
            result = new BinaryOperation<TValue>(result, operand, op);
            pending = null;
        }
        else
        {
            throw new InvalidOperationException($"Missing operator before {token}.");
        }
    }

    if (result is null)
    {
        throw new InvalidOperationException("The expression is empty.");
    }

    if (pending is not null)
    {
        throw new InvalidOperationException("The expression ends with an operator.");
    }

    return result;
}
| 205 | +``` |
0 commit comments