Skip to content

Commit 8f938ca

Browse files
Most of the Tokenizer
Wrote overall structure of tokenizer. As well as multiple subroutines for it.
1 parent 89cac14 commit 8f938ca

File tree

3 files changed

+248
-32
lines changed

3 files changed

+248
-32
lines changed

parser/Grammar notes.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,19 @@ card_name: [^;\]]+ //include CONDITION_CARD_END
55
scenarios: scenarios:
66
scenario_name: [^=>]+ //include CONDITION_SCENARIO_END
77
display: display
8-
Upper case: not, and, or, xor, true, false.
8+
Upper case: not, and, or, xor
9+
Lower case: true, false
910
Single symbols are exactly the same.
1011

1112
Merge card_name and scenario_name into a unified ID regex for the tokenizer, disallowing any already reserved special characters.
12-
Thus ID: [^;:=<>\(\)\{\}]+ and grammar will defer to ID in places it used to use them.
13+
Thus ID: [^;=<>\(\)\{\}]+ and grammar will defer to ID in places it used to use them.
1314

1415
//lower case for literal, capital for production rule.
1516
//Note: the grammar does not show the unification resulting in ID, though the code is written with that change.
1617
START -> DECK PROBABILITY .
1718

1819
DECK -> DECK_START SENTINEL_START DECK_LIST SENTINEL_END.
19-
DECK_START -> deck_list: .
20+
DECK_START -> deck list: .
2021
DECK_LIST -> CARD MORE_CARDS .
2122
MORE_CARDS -> CARD MORE_CARDS | .
2223
CARD -> CARD_NAME ; .

parser/Token.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public static enum Lexeme_Types
2121
{
2222
DECK_START, ID, SEMI_COLON, PROBABILITY_START, SENTINEL_START, SENTINEL_END, ASSIGN, TREE_START, NOT, CONDITION_CARD_START, CONDITION_CARD_END,
2323
CONDITION_SCENARIO_START, CONDITION_SCENARIO_END, CONDITION_EXPR_START, CONDITION_EXPR_END, OR, AND, XOR, DISPLAY_START, TRUE, FALSE, LINE_COMMENT, BLOCK_COMMENT,
24-
SPECIAL_SEPERATOR, UNKNOWN_CHARACTER_ERROR, NAME_ERROR, BLOCK_COMMENT_ERROR;
24+
SPECIAL_SEPERATOR, UNKNOWN_CHARACTER_ERROR, ID_ERROR, BLOCK_COMMENT_ERROR;
2525

2626
/**
2727
* Compares SPECIAL_SEPERATOR with INPUT. Expecting it to be -1, so that value < 0 is true.

parser/Tokenizer.java

Lines changed: 243 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,315 @@
11
package parser;
22

3+
import java.util.Scanner;
34
import java.util.regex.Pattern;
5+
import parser.Token.Lexeme_Types;
46

57
/**
68
<b>
79
Purpose: Perform tokenization on input files, such that later stages may read a stream of tokens one by one.<br>
810
Programmer: Gabriel Toban Harris <br>
9-
Date: 2021-07-27
11+
Date: 2021-07-[27, 28]
1012
</b>
1113
*/
1214

1315
public class Tokenizer
1416
{
17+
/**
18+
* Inner class representing custom return type to allow returning extra data. Like a pair.
19+
* Said extra data, when not null, is the start of the next {@link Token}.
20+
*/
21+
public static class Returned_Data
22+
{
23+
/**
24+
* Data which was read but is not part of the current {@link #FULLY_FORMED_PART}. It should be fed directly back into {@link Tokenizer#tokenize(long, String, Scanner)}.
25+
*/
26+
public String extra_data;
27+
28+
/**
29+
* Created {@link Token} from {@link Tokenizer#tokenize(long, String, Scanner)}.
30+
*/
31+
public final Token FULLY_FORMED_PART;
32+
33+
/**
34+
* @see #Returned_Data(String, Token)
35+
*/
36+
public Returned_Data(final Token FULLY_FORMED_PART)
37+
{
38+
this(null, FULLY_FORMED_PART);
39+
}
40+
41+
/**
42+
* Fully parameterized constructor.
43+
*
44+
* @param EXTRA_DATA {@link #extra_data}
45+
* @param FULLY_FORMED_PART {@link #FULLY_FORMED_PART}
46+
*/
47+
public Returned_Data(final String EXTRA_DATA, final Token FULLY_FORMED_PART)
48+
{
49+
this.extra_data = EXTRA_DATA;
50+
this.FULLY_FORMED_PART = FULLY_FORMED_PART;
51+
}
52+
}
53+
1554
/**
1655
* Special marker of section starts.
1756
*/
18-
public final static char SENTINEL_START = '{';
57+
public final static String SENTINEL_START = "{";
1958

2059
/**
2160
* Special marker of section end.
2261
*/
23-
public final static char SENTINEL_END = '}';
62+
public final static String SENTINEL_END = "}";
2463

2564
/**
2665
* Special marker of the start of card in expression.
2766
*/
28-
public final static char CONDITION_CARD_START = '[';
67+
public final static String CONDITION_CARD_START = "[";
2968

3069
/**
3170
* Special marker of the end of card in expression.
3271
*/
33-
public final static char CONDITION_CARD_END = ']';
72+
public final static String CONDITION_CARD_END = "]";
3473

3574
/**
3675
* Special marker of section starts.
3776
*/
38-
public final static char CONDITION_SCENARIO_START = '<';
77+
public final static String CONDITION_SCENARIO_START = "<";
3978

4079
/**
4180
* Special marker of section end.
4281
*/
43-
public final static char CONDITION_SCENARIO_END = '>';
82+
public final static String CONDITION_SCENARIO_END = ">";
4483

4584
/**
4685
* Special marker of the start of a subexpression.
4786
*/
48-
public final static char CONDITION_EXPR_START = '(';
87+
public final static String CONDITION_EXPR_START = "(";
4988

5089
/**
5190
* Special marker of the end of a subexpression.
5291
*/
53-
public final static char CONDITION_EXPR_END = ')';
92+
public final static String CONDITION_EXPR_END = ")";
5493

5594
/**
56-
* Representation of unary operator not.
95+
* Pattern indicating the definition of the {@link Lexeme_Types#DECK_START} {@link Token}.
5796
*/
58-
public final static String NOT = "NOT";
97+
public final static Pattern DECK_START = Pattern.compile("\\s*deck list:\\s*");
5998

6099
/**
61-
* Representation of binary operator and.
100+
* Pattern indicating the definition of the {@link Lexeme_Types#PROBABILITY_START} {@link Token}.
62101
*/
63-
public final static String AND = "AND";
102+
public final static Pattern PROBABILITY_START = Pattern.compile("\\s*scenarios:\\s*");
64103

65104
/**
66-
* Representation of binary operator or.
105+
* Pattern indicating the definition of the {@link Lexeme_Types#TREE_START} {@link Token}.
67106
*/
68-
public final static String OR = "OR";
107+
public final static Pattern TREE_START = Pattern.compile("\\s*scenario\\s*");
69108

70109
/**
71-
* Representation of binary operator xor.
110+
* Pattern indicating the definition of the {@link Lexeme_Types#DISPLAY_START} {@link Token}.
72111
*/
73-
public final static String XOR = "XOR";
112+
public final static Pattern DISPLAY_START = Pattern.compile("\\s*display\\s*");
74113

75114
/**
76-
* Pattern indicating the definition of the DECK_START token.
115+
* Definition of the valid characters that an ID lexeme can have.
77116
*/
78-
public final static Pattern DECK_START = Pattern.compile("\\s*deck list:\\s*");
117+
public final static Pattern ID_CHAR_SET = Pattern.compile("[^;=" + SENTINEL_START + SENTINEL_END + CONDITION_CARD_START + CONDITION_CARD_END + CONDITION_SCENARIO_START +
118+
CONDITION_SCENARIO_END + CONDITION_EXPR_START + CONDITION_EXPR_END + "]");
79119

80120
/**
81-
* Pattern indicating the definition of the PROBABILITY_START token.
121+
* Performs tokenization, after which other functions are called internally.
122+
*
123+
* @param LINE_NUMBER of source file being read
124+
* @param START starting string, expected to be length 1
125+
* @param INPUT source to be read from
126+
* @return newly created {@link Returned_Data} object
82127
*/
83-
public final static Pattern PROBABILITY_START = Pattern.compile("\\s*scenarios:\\s*");
128+
public static Returned_Data tokenize(final long LINE_NUMBER, final String START, final Scanner INPUT)
129+
{
130+
final StringBuilder LEXEME = new StringBuilder(32); //32 feels like a good starting number, possibly go up to 256 (byte) as that should be a limit.
131+
LEXEME.append(START);
132+
133+
//deal with 1 char lexeme first
134+
switch (START)
135+
{
136+
//deal with single char tokens first
137+
case SENTINEL_START:
138+
return new Returned_Data(new Token(Token.Lexeme_Types.SENTINEL_START, LINE_NUMBER, START));
139+
case SENTINEL_END:
140+
return new Returned_Data(new Token(Token.Lexeme_Types.SENTINEL_END, LINE_NUMBER, START));
141+
case CONDITION_CARD_START:
142+
return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_CARD_START, LINE_NUMBER, START));
143+
case CONDITION_CARD_END:
144+
return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_CARD_END, LINE_NUMBER, START));
145+
case CONDITION_SCENARIO_START:
146+
return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_SCENARIO_START, LINE_NUMBER, START));
147+
case CONDITION_SCENARIO_END:
148+
return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_SCENARIO_END, LINE_NUMBER, START));
149+
case CONDITION_EXPR_START:
150+
return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_EXPR_START, LINE_NUMBER, START));
151+
case CONDITION_EXPR_END:
152+
return new Returned_Data(new Token(Token.Lexeme_Types.CONDITION_EXPR_END, LINE_NUMBER, START));
153+
case ";":
154+
return new Returned_Data(new Token(Token.Lexeme_Types.SEMI_COLON, LINE_NUMBER, START));
155+
case "=":
156+
return new Returned_Data(new Token(Token.Lexeme_Types.ASSIGN, LINE_NUMBER, START));
157+
//comments
158+
case "/":
159+
{
160+
if (INPUT.hasNext())
161+
{
162+
String placeholder = INPUT.next();
163+
LEXEME.append(placeholder);
164+
165+
//line comment
166+
if (placeholder.equals("/"))
167+
{
168+
if (INPUT.hasNext())
169+
LEXEME.append(INPUT.nextLine());
170+
171+
return new Returned_Data(new Token(Token.Lexeme_Types.LINE_COMMENT, LINE_NUMBER, LEXEME.toString()));
172+
}
173+
//block comment
174+
else if (placeholder.equals("*"))
175+
{
176+
while (INPUT.hasNext())
177+
{
178+
LEXEME.append(placeholder = INPUT.next());
179+
180+
if (placeholder.equals("*"))
181+
{
182+
if (INPUT.hasNext())
183+
{
184+
LEXEME.append(placeholder = INPUT.next());
185+
186+
if (placeholder.equals("/"))
187+
return new Returned_Data(new Token(Token.Lexeme_Types.BLOCK_COMMENT, LINE_NUMBER, LEXEME.toString()));
188+
}
189+
else
190+
break;
191+
}
192+
}
193+
194+
//block comment lacking closing symbols
195+
return new Returned_Data(new Token(Token.Lexeme_Types.BLOCK_COMMENT_ERROR, LINE_NUMBER, LEXEME.toString()));
196+
}
197+
}
198+
199+
//defer to ID_CHAR_SET
200+
return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME.toString()));
201+
}
202+
//TODO:finish
203+
//binary operators
204+
205+
//display values
206+
case "t":
207+
{
208+
if (INPUT.hasNext())
209+
{
210+
String placeholder = INPUT.next();
211+
LEXEME.append(placeholder);
212+
//true keyword
213+
if (placeholder.equals("r"))
214+
{
215+
216+
217+
LEXEME.append(placeholder = INPUT.next());
218+
219+
if (placeholder.equals("u"))
220+
{
221+
LEXEME.append(placeholder= INPUT.next());
222+
223+
if (placeholder.equals("e"))
224+
{
225+
placeholder= INPUT.next();
226+
227+
if ()
228+
}
229+
}
230+
}
231+
232+
return parse_ID(LINE_NUMBER, LEXEME, INPUT);
233+
}
234+
235+
return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME.toString()));
236+
}
237+
//guess it is some sort of ID
238+
default:
239+
return gather_ID_chars(LINE_NUMBER, LEXEME, INPUT);
240+
}
241+
}
84242

85243
/**
86-
* Pattern indicating the definition of the TREE_START token.
244+
* Determines which {@link Token} should be formed given the lexeme provided.
245+
*
246+
* @param LINE_NUMBER of source file being read
247+
* @param REMAINDER which is the extra data that should be fed back into {@link #tokenize(long, String, Scanner)}
248+
* @param COMPLETE_LEXEME which is fully formed
249+
* @return the created {@link Token} wrapped in a {@link Returned_Data}
87250
*/
88-
public final static Pattern TREE_START = Pattern.compile("\\s*scenario\\s*");
251+
private static Returned_Data parse_potentional_ID(final long LINE_NUMBER, final String REMAINDER, final String COMPLETE_LEXEME)
252+
{
253+
//Test lexeme for special sequences, if all fail then is in fact ID.
254+
if (TREE_START.matcher(COMPLETE_LEXEME).matches())
255+
return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.TREE_START, LINE_NUMBER, COMPLETE_LEXEME));
256+
else if (DISPLAY_START.matcher(COMPLETE_LEXEME).matches())
257+
return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DISPLAY_START, LINE_NUMBER, COMPLETE_LEXEME));
258+
else if (PROBABILITY_START.matcher(COMPLETE_LEXEME).matches())
259+
return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.PROBABILITY_START, LINE_NUMBER, COMPLETE_LEXEME));
260+
else if (DECK_START.matcher(COMPLETE_LEXEME).matches())
261+
return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DECK_START, LINE_NUMBER, COMPLETE_LEXEME));
262+
else
263+
return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, COMPLETE_LEXEME));
264+
}
89265

90266
/**
91-
* Pattern indicating the definition of the DISPLAY_START Token.
267+
* Subroutine to obtain a sequence which may be an ID.
268+
*
269+
* @param LINE_NUMBER of source file being read
270+
* @param LEXEME_START is the lexeme formed thus far
271+
* @param INPUT source to be read from
272+
* @return the created {@link Token} wrapped in a {@link Returned_Data}
92273
*/
93-
public final static Pattern DISPLAY_START = Pattern.compile("\\s*display\\s*");
274+
private static Returned_Data gather_ID_chars(final long LINE_NUMBER, final StringBuilder LEXEME_START, final Scanner INPUT)
275+
{
276+
String placeholder;
277+
278+
while (INPUT.hasNext())
279+
{
280+
placeholder = INPUT.next();
281+
282+
if (ID_CHAR_SET.matcher(placeholder).matches())
283+
LEXEME_START.append(placeholder);
284+
else
285+
return parse_potentional_ID(LINE_NUMBER, placeholder, LEXEME_START.toString());
286+
}
287+
288+
return parse_potentional_ID(LINE_NUMBER, null, LEXEME_START.toString());
289+
}
94290

95291
/**
96-
* Definition of the ID Token.
292+
* Subroutine to create a {@link Lexeme_Types#ID} {@link Token}.
293+
*
294+
* @param LINE_NUMBER of source file being read
295+
* @param LEXEME_START is the lexeme formed thus far
296+
* @param INPUT source to be read from
297+
* @return the created {@link Token} wrapped in a {@link Returned_Data}
97298
*/
98-
public final static Pattern ID = Pattern.compile("[^;=" + SENTINEL_START + SENTINEL_END + CONDITION_CARD_START + CONDITION_CARD_END + CONDITION_SCENARIO_START +
99-
CONDITION_SCENARIO_END + CONDITION_EXPR_START + CONDITION_EXPR_END + "]+");
299+
private static Returned_Data parse_ID(final long LINE_NUMBER, final StringBuilder LEXEME_START, final Scanner INPUT)
300+
{
301+
String placeholder;
302+
303+
while (INPUT.hasNext())
304+
{
305+
placeholder = INPUT.next();
306+
307+
if (ID_CHAR_SET.matcher(placeholder).matches())
308+
LEXEME_START.append(placeholder);
309+
else
310+
return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString()));
311+
}
312+
313+
return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString()));
314+
}
100315
}

0 commit comments

Comments
 (0)