|
1 | 1 | package parser; |
2 | 2 |
|
| 3 | +import java.util.Scanner; |
3 | 4 | import java.util.regex.Pattern; |
| 5 | +import parser.Token.Lexeme_Types; |
4 | 6 |
|
5 | 7 | /** |
6 | 8 | <b> |
7 | 9 | Purpose: Perform tokenization on input files, such that later stages may read a stream of tokens one by one.<br> |
8 | 10 | Programmer: Gabriel Toban Harris <br> |
9 | | -Date: 2021-07-27 |
| 11 | +Date: 2021-07-[27, 28] |
10 | 12 | </b> |
11 | 13 | */ |
12 | 14 |
|
13 | 15 | public class Tokenizer |
14 | 16 | { |
| 17 | + /** |
| 18 | + * Inner class representing custom return type to allow returning extra data. Like a pair. |
| 19 | + * Said extra data, when not null, is the start of the next {@link Token}. |
| 20 | + */ |
| 21 | + public static class Returned_Data |
| 22 | + { |
| 23 | + /** |
| 24 | + * Data which was read but not part of the current {@link #FULLY_FORMED_PART}. It should be feed directly back into {@link Tokenizer#tokenize(long, String, Scanner)}. |
| 25 | + */ |
| 26 | + public String extra_data; |
| 27 | + |
| 28 | + /** |
| 29 | + * Created {@link Token} from {@link Tokenizer#tokenize(long, String, Scanner)}. |
| 30 | + */ |
| 31 | + public final Token FULLY_FORMED_PART; |
| 32 | + |
| 33 | + /** |
| 34 | + * @see #Returned_Data(String, Token) |
| 35 | + */ |
| 36 | + public Returned_Data(final Token FULLY_FORMED_PART) |
| 37 | + { |
| 38 | + this(null, FULLY_FORMED_PART); |
| 39 | + } |
| 40 | + |
| 41 | + /** |
| 42 | + * Fully parameterized constructor. |
| 43 | + * |
| 44 | + * @param EXTRA_DATA {@link #extra_data} |
| 45 | + * @param FULLY_FORMED_PART {@link #FULLY_FORMED_PART} |
| 46 | + */ |
| 47 | + public Returned_Data(final String EXTRA_DATA, final Token FULLY_FORMED_PART) |
| 48 | + { |
| 49 | + this.extra_data = EXTRA_DATA; |
| 50 | + this.FULLY_FORMED_PART = FULLY_FORMED_PART; |
| 51 | + } |
| 52 | + } |
| 53 | + |
15 | 54 | /** |
16 | 55 | * Special marker of section starts. |
17 | 56 | */ |
18 | | - public final static char SENTINEL_START = '{'; |
| 57 | + public final static String SENTINEL_START = "{"; |
19 | 58 |
|
20 | 59 | /** |
21 | 60 | * Special marker of section end. |
22 | 61 | */ |
23 | | - public final static char SENTINEL_END = '}'; |
| 62 | + public final static String SENTINEL_END = "}"; |
24 | 63 |
|
25 | 64 | /** |
26 | 65 | * Special marker of the start of card in expression. |
27 | 66 | */ |
28 | | - public final static char CONDITION_CARD_START = '['; |
| 67 | + public final static String CONDITION_CARD_START = "["; |
29 | 68 |
|
30 | 69 | /** |
31 | 70 | * Special marker of the end of card in expression. |
32 | 71 | */ |
33 | | - public final static char CONDITION_CARD_END = ']'; |
| 72 | + public final static String CONDITION_CARD_END = "]"; |
34 | 73 |
|
35 | 74 | /** |
36 | 75 | * Special marker of section starts. |
37 | 76 | */ |
38 | | - public final static char CONDITION_SCENARIO_START = '<'; |
| 77 | + public final static String CONDITION_SCENARIO_START = "<"; |
39 | 78 |
|
40 | 79 | /** |
41 | 80 | * Special marker of section end. |
42 | 81 | */ |
43 | | - public final static char CONDITION_SCENARIO_END = '>'; |
| 82 | + public final static String CONDITION_SCENARIO_END = ">"; |
44 | 83 |
|
45 | 84 | /** |
46 | 85 | * Special marker of the start of a subexpression. |
47 | 86 | */ |
48 | | - public final static char CONDITION_EXPR_START = '('; |
| 87 | + public final static String CONDITION_EXPR_START = "("; |
49 | 88 |
|
50 | 89 | /** |
51 | 90 | * Special marker of the end of a subexpression. |
52 | 91 | */ |
53 | | - public final static char CONDITION_EXPR_END = ')'; |
| 92 | + public final static String CONDITION_EXPR_END = ")"; |
54 | 93 |
|
55 | 94 | /** |
56 | | - * Representation of unary operator not. |
| 95 | + * Pattern indicating the definition of the {@link Lexeme_Types#DECK_START} {@link Token}. |
57 | 96 | */ |
58 | | - public final static String NOT = "NOT"; |
| 97 | + public final static Pattern DECK_START = Pattern.compile("\\s*deck list:\\s*"); |
59 | 98 |
|
60 | 99 | /** |
61 | | - * Representation of binary operator and. |
| 100 | + * Pattern indicating the definition of the {@link Lexeme_Types#PROBABILITY_START} {@link Token}. |
62 | 101 | */ |
63 | | - public final static String AND = "AND"; |
| 102 | + public final static Pattern PROBABILITY_START = Pattern.compile("\\s*scenarios:\\s*"); |
64 | 103 |
|
65 | 104 | /** |
66 | | - * Representation of binary operator or. |
| 105 | + * Pattern indicating the definition of the {@link Lexeme_Types#TREE_START} {@link Token}. |
67 | 106 | */ |
68 | | - public final static String OR = "OR"; |
| 107 | + public final static Pattern TREE_START = Pattern.compile("\\s*scenario\\s*"); |
69 | 108 |
|
70 | 109 | /** |
71 | | - * Representation of binary operator xor. |
| 110 | + * Pattern indicating the definition of the {@link Lexeme_Types#DISPLAY_START} {@link Token}. |
72 | 111 | */ |
73 | | - public final static String XOR = "XOR"; |
| 112 | + public final static Pattern DISPLAY_START = Pattern.compile("\\s*display\\s*"); |
74 | 113 |
|
75 | 114 | /** |
76 | | - * Pattern indicating the definition of the DECK_START token. |
| 115 | + * Definition of valid char that an the ID lexeme can have. |
77 | 116 | */ |
78 | | - public final static Pattern DECK_START = Pattern.compile("\\s*deck list:\\s*"); |
| 117 | + public final static Pattern ID_CHAR_SET = Pattern.compile("[^;=" + SENTINEL_START + SENTINEL_END + CONDITION_CARD_START + CONDITION_CARD_END + CONDITION_SCENARIO_START + |
| 118 | + CONDITION_SCENARIO_END + CONDITION_EXPR_START + CONDITION_EXPR_END + "]"); |
79 | 119 |
|
80 | 120 | /** |
81 | | - * Pattern indicating the definition of the PROBABILITY_START token. |
| 121 | + * Performs tokenization, after which other functions are called internally. |
| 122 | + * |
| 123 | + * @param LINE_NUMBER of source file being read |
| 124 | + * @param START starting string, expected to be length 1 |
| 125 | + * @param INPUT source to be read from |
| 126 | + * @return newly created {@link Returned_Data} object |
82 | 127 | */ |
83 | | - public final static Pattern PROBABILITY_START = Pattern.compile("\\s*scenarios:\\s*"); |
| 128 | + public static Returned_Data tokenize(final long LINE_NUMBER, final String START, final Scanner INPUT) |
| 129 | + { |
| 130 | + final StringBuilder LEXEME = new StringBuilder(32); //32 feels like a good starting number, possibly go up to 256 (byte) as that should be a limit. |
| 131 | + LEXEME.append(START); |
| 132 | + |
| 133 | + //deal with 1 char lexeme first |
| 134 | + switch (START) |
| 135 | + { |
| 136 | + //deal with single char tokens first |
| 137 | + case SENTINEL_START: |
| 138 | + return new Returned_Data(new Token(Token.Lexeme_Types.SENTINEL_START, LINE_NUMBER, START)); |
| 139 | + case SENTINEL_END: |
| 140 | + return new Returned_Data(new Token(Token.Lexeme_Types.SENTINEL_END, LINE_NUMBER, START)); |
| 141 | + case CONDITION_CARD_START: |
| 142 | + return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_CARD_START, LINE_NUMBER, START)); |
| 143 | + case CONDITION_CARD_END: |
| 144 | + return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_CARD_END, LINE_NUMBER, START)); |
| 145 | + case CONDITION_SCENARIO_START: |
| 146 | + return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_SCENARIO_START, LINE_NUMBER, START)); |
| 147 | + case CONDITION_SCENARIO_END: |
| 148 | + return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_SCENARIO_END, LINE_NUMBER, START)); |
| 149 | + case CONDITION_EXPR_START: |
| 150 | + return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_EXPR_START, LINE_NUMBER, START)); |
| 151 | + case CONDITION_EXPR_END: |
| 152 | + return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_EXPR_END, LINE_NUMBER, START)); |
| 153 | + case ";": |
| 154 | + return new Returned_Data(new Token(Token.Lexeme_Types.SEMI_COLON, LINE_NUMBER, START)); |
| 155 | + case "=": |
| 156 | + return new Returned_Data(new Token(Token.Lexeme_Types.ASSIGN, LINE_NUMBER, START)); |
| 157 | + //comments |
| 158 | + case "/": |
| 159 | + { |
| 160 | + if (INPUT.hasNext()) |
| 161 | + { |
| 162 | + String placeholder = INPUT.next(); |
| 163 | + LEXEME.append(placeholder); |
| 164 | + |
| 165 | + //line comment |
| 166 | + if (placeholder.equals("/")) |
| 167 | + { |
| 168 | + if (INPUT.hasNext()) |
| 169 | + LEXEME.append(INPUT.nextLine()); |
| 170 | + |
| 171 | + return new Returned_Data(new Token(Token.Lexeme_Types.LINE_COMMENT, LINE_NUMBER, LEXEME.toString())); |
| 172 | + } |
| 173 | + //block comment |
| 174 | + else if (placeholder.equals("*")) |
| 175 | + { |
| 176 | + while (INPUT.hasNext()) |
| 177 | + { |
| 178 | + LEXEME.append(placeholder = INPUT.next()); |
| 179 | + |
| 180 | + if (placeholder.equals("*")) |
| 181 | + { |
| 182 | + if (INPUT.hasNext()) |
| 183 | + { |
| 184 | + LEXEME.append(placeholder = INPUT.next()); |
| 185 | + |
| 186 | + if (placeholder.equals("/")) |
| 187 | + return new Returned_Data(new Token(Token.Lexeme_Types.BLOCK_COMMENT, LINE_NUMBER, LEXEME.toString())); |
| 188 | + } |
| 189 | + else |
| 190 | + break; |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + //block comment lacking closing symbols |
| 195 | + return new Returned_Data(new Token(Token.Lexeme_Types.BLOCK_COMMENT_ERROR, LINE_NUMBER, LEXEME.toString())); |
| 196 | + } |
| 197 | + } |
| 198 | + |
| 199 | + //defer to ID_CHAR_SET |
| 200 | + return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME.toString())); |
| 201 | + } |
| 202 | + //TODO:finish |
| 203 | + //binary operators |
| 204 | + |
| 205 | + //display values |
| 206 | + case "t": |
| 207 | + { |
| 208 | + if (INPUT.hasNext()) |
| 209 | + { |
| 210 | + String placeholder = INPUT.next(); |
| 211 | + LEXEME.append(placeholder); |
| 212 | + //true keyword |
| 213 | + if (placeholder.equals("r")) |
| 214 | + { |
| 215 | + |
| 216 | + |
| 217 | + LEXEME.append(placeholder = INPUT.next()); |
| 218 | + |
| 219 | + if (placeholder.equals("u")) |
| 220 | + { |
| 221 | + LEXEME.append(placeholder= INPUT.next()); |
| 222 | + |
| 223 | + if (placeholder.equals("e")) |
| 224 | + { |
| 225 | + placeholder= INPUT.next(); |
| 226 | + |
| 227 | + if () |
| 228 | + } |
| 229 | + } |
| 230 | + } |
| 231 | + |
| 232 | + return parse_ID(LINE_NUMBER, LEXEME, INPUT); |
| 233 | + } |
| 234 | + |
| 235 | + return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME.toString())); |
| 236 | + } |
| 237 | + //guess it is some sort of ID |
| 238 | + default: |
| 239 | + return gather_ID_chars(LINE_NUMBER, LEXEME, INPUT); |
| 240 | + } |
| 241 | + } |
84 | 242 |
|
85 | 243 | /** |
86 | | - * Pattern indicating the definition of the TREE_START token. |
| 244 | + * Determines which {@link Token} should be formed given the lexeme provided. |
| 245 | + * |
| 246 | + * @param LINE_NUMBER of source file being read |
| 247 | + * @param REMAINDER which is the extra data that should be feed back into {@link #tokenize(long, String, Scanner)} |
| 248 | + * @param COMPLETE_LEXEME which is fully formed |
| 249 | + * @return the created {@link Token} wrapped in a {@link Returned_Data} |
87 | 250 | */ |
88 | | - public final static Pattern TREE_START = Pattern.compile("\\s*scenario\\s*"); |
| 251 | + private static Returned_Data parse_potentional_ID(final long LINE_NUMBER, final String REMAINDER, final String COMPLETE_LEXEME) |
| 252 | + { |
| 253 | + //Test lexeme for special sequences, if all fail then is in fact ID. |
| 254 | + if (TREE_START.matcher(COMPLETE_LEXEME).matches()) |
| 255 | + return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.TREE_START, LINE_NUMBER, COMPLETE_LEXEME)); |
| 256 | + else if (DISPLAY_START.matcher(COMPLETE_LEXEME).matches()) |
| 257 | + return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DISPLAY_START, LINE_NUMBER, COMPLETE_LEXEME)); |
| 258 | + else if (PROBABILITY_START.matcher(COMPLETE_LEXEME).matches()) |
| 259 | + return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.PROBABILITY_START, LINE_NUMBER, COMPLETE_LEXEME)); |
| 260 | + else if (DECK_START.matcher(COMPLETE_LEXEME).matches()) |
| 261 | + return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DECK_START, LINE_NUMBER, COMPLETE_LEXEME)); |
| 262 | + else |
| 263 | + return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, COMPLETE_LEXEME)); |
| 264 | + } |
89 | 265 |
|
90 | 266 | /** |
91 | | - * Pattern indicating the definition of the DISPLAY_START Token. |
| 267 | + * Subroutine to obtain a sequence which may be an ID. |
| 268 | + * |
| 269 | + * @param LINE_NUMBER of source file being read |
| 270 | + * @param LEXEME_START is the lexem formed thus far |
| 271 | + * @param INPUT source to be read from |
| 272 | + * @return the created {@link Token} wrapped in a {@link Returned_Data} |
92 | 273 | */ |
93 | | - public final static Pattern DISPLAY_START = Pattern.compile("\\s*display\\s*"); |
| 274 | + private static Returned_Data gather_ID_chars(final long LINE_NUMBER, final StringBuilder LEXEME_START, final Scanner INPUT) |
| 275 | + { |
| 276 | + String placeholder; |
| 277 | + |
| 278 | + while (INPUT.hasNext()) |
| 279 | + { |
| 280 | + placeholder = INPUT.next(); |
| 281 | + |
| 282 | + if (ID_CHAR_SET.matcher(placeholder).matches()) |
| 283 | + LEXEME_START.append(placeholder); |
| 284 | + else |
| 285 | + return parse_potentional_ID(LINE_NUMBER, placeholder, LEXEME_START.toString()); |
| 286 | + } |
| 287 | + |
| 288 | + return parse_potentional_ID(LINE_NUMBER, null, LEXEME_START.toString()); |
| 289 | + } |
94 | 290 |
|
95 | 291 | /** |
96 | | - * Definition of the ID Token. |
| 292 | + * Subroutine to create a {@link Lexeme_Types#ID} {@link Token}. |
| 293 | + * |
| 294 | + * @param LINE_NUMBER of source file being read |
| 295 | + * @param LEXEME_START is the lexem formed thus far |
| 296 | + * @param INPUT source to be read from |
| 297 | + * @return the created {@link Token} wrapped in a {@link Returned_Data} |
97 | 298 | */ |
98 | | - public final static Pattern ID = Pattern.compile("[^;=" + SENTINEL_START + SENTINEL_END + CONDITION_CARD_START + CONDITION_CARD_END + CONDITION_SCENARIO_START + |
99 | | - CONDITION_SCENARIO_END + CONDITION_EXPR_START + CONDITION_EXPR_END + "]+"); |
| 299 | + private static Returned_Data parse_ID(final long LINE_NUMBER, final StringBuilder LEXEME_START, final Scanner INPUT) |
| 300 | + { |
| 301 | + String placeholder; |
| 302 | + |
| 303 | + while (INPUT.hasNext()) |
| 304 | + { |
| 305 | + placeholder = INPUT.next(); |
| 306 | + |
| 307 | + if (ID_CHAR_SET.matcher(placeholder).matches()) |
| 308 | + LEXEME_START.append(placeholder); |
| 309 | + else |
| 310 | + return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString())); |
| 311 | + } |
| 312 | + |
| 313 | + return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString())); |
| 314 | + } |
100 | 315 | } |
0 commit comments