
Commit b1eaef8

Finished Tokenizer
Added part to match keywords.
1 parent 8f938ca commit b1eaef8


2 files changed (+101, -75 lines)


parser/Token.java

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@
 <b>
 Purpose: Types of Tokens to be created by Lexical Analyzer.<br>
 Programmer: Gabriel Toban Harris <br>
-Date: 2021-07-26
+Date: 2021-07-26, 2021-7-30
 </b>
 */

@@ -45,13 +45,13 @@ public static boolean is_error(final Lexeme_Types INPUT)
      *
      * @param TYPE of LEXEME
      * @param LINE_NUMBER location in source file
-     * @param LEXEME content of LEXEME
+     * @param LEXEME content of lexeme which is then trimmed by {@link String#trim()}
      */
     public Token(final Token.Lexeme_Types TYPE, final long LINE_NUMBER, final String LEXEME)
     {
         this.TYPE = TYPE;
         this.LINE_NUMBER = LINE_NUMBER;
-        this.LEXEME = LEXEME;
+        this.LEXEME = LEXEME.trim();
     }

     //getters
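
Note on the Token change above: the constructor now normalizes its input with String.trim(). The standalone sketch below is illustrative only; the Token stand-in is a hypothetical simplification of parser/Token.java (the real class also carries a Lexeme_Types value and line number). It shows the effect on a lexeme that was buffered together with the whitespace that ended it.

public class TrimSketch
{
    /** Hypothetical stand-in for parser/Token.java, reduced to the lexeme field. */
    static class Token
    {
        public final String LEXEME;

        public Token(final String LEXEME)
        {
            this.LEXEME = LEXEME.trim(); //mirrors the trimming added in this commit
        }
    }

    public static void main(String[] args)
    {
        //a lexeme buffered together with the whitespace character that ended it
        final Token TOKEN = new Token("some card name ");
        System.out.println("[" + TOKEN.LEXEME + "]"); //prints [some card name]
    }
}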

parser/Tokenizer.java

Lines changed: 98 additions & 72 deletions
@@ -8,7 +8,7 @@
 <b>
 Purpose: Perform tokenization on input files, such that later stages may read a stream of tokens one by one.<br>
 Programmer: Gabriel Toban Harris <br>
-Date: 2021-07-[27, 28]
+Date: 2021-07-[27, 28], 2021-7-30
 </b>
 */

@@ -91,6 +91,37 @@ public Returned_Data(final String EXTRA_DATA, final Token FULLY_FORMED_PART)
      */
     public final static String CONDITION_EXPR_END = ")";

+    /**
+     * Simple concatenation of chars which are not allowed to be a part of any keyword, ID, or special marker of a part.
+     */
+    public final static String RESTRICTED_CHARS = ";=" + SENTINEL_START + SENTINEL_END + CONDITION_CARD_START + CONDITION_CARD_END + CONDITION_SCENARIO_START +
+                                                  CONDITION_SCENARIO_END + CONDITION_EXPR_START + CONDITION_EXPR_END;
+
+    /**
+     * Representation of unary operator not.
+     */
+    public final static Pattern NOT = Pattern.compile("\\s*NOT\\s*");
+
+    /**
+     * Representation of binary operator and.
+     */
+    public final static Pattern AND = Pattern.compile("\\s*AND\\s*");
+
+    /**
+     * Representation of binary operator or.
+     */
+    public final static Pattern OR = Pattern.compile("\\s*OR\\s*");
+
+    /**
+     * Representation of binary operator xor.
+     */
+    public final static Pattern XOR = Pattern.compile("\\s*XOR\\s*");
+
+    /**
+     * Simply the the predefined class \s.
+     */
+    public final static Pattern WHITE_SPACE_CHAR = Pattern.compile("\\s");
+
     /**
      * Pattern indicating the definition of the {@link Lexeme_Types#DECK_START} {@link Token}.
      */
@@ -114,8 +145,7 @@ public Returned_Data(final String EXTRA_DATA, final Token FULLY_FORMED_PART)
     /**
      * Definition of valid char that an the ID lexeme can have.
      */
-    public final static Pattern ID_CHAR_SET = Pattern.compile("[^;=" + SENTINEL_START + SENTINEL_END + CONDITION_CARD_START + CONDITION_CARD_END + CONDITION_SCENARIO_START +
-                                              CONDITION_SCENARIO_END + CONDITION_EXPR_START + CONDITION_EXPR_END + "]");
+    public final static Pattern ID_CHAR_SET = Pattern.compile("[^" + RESTRICTED_CHARS + "]");

     /**
      * Performs tokenization, after which other functions are called internally.
@@ -199,68 +229,66 @@ else if (placeholder.equals("*"))
                 //defer to ID_CHAR_SET
                 return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME.toString()));
             }
-            //TODO:finish
-            //binary operators
-
-            //display values
-            case "t":
-            {
-                if (INPUT.hasNext())
-                {
-                    String placeholder = INPUT.next();
-                    LEXEME.append(placeholder);
-                    //true keyword
-                    if (placeholder.equals("r"))
-                    {
-
-
-                        LEXEME.append(placeholder = INPUT.next());
-
-                        if (placeholder.equals("u"))
-                        {
-                            LEXEME.append(placeholder= INPUT.next());
-
-                            if (placeholder.equals("e"))
-                            {
-                                placeholder= INPUT.next();
-
-                                if ()
-                            }
-                        }
-                    }
-
-                    return parse_ID(LINE_NUMBER, LEXEME, INPUT);
-                }
-
-                return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME.toString()));
-            }
-            //guess it is some sort of ID
+            //parse multichar sequences
             default:
-                return gather_ID_chars(LINE_NUMBER, LEXEME, INPUT);
+                return gather_keyword_chars(LINE_NUMBER, LEXEME, INPUT);
         }
     }

     /**
-     * Determines which {@link Token} should be formed given the lexeme provided.
+     * Function to attempt to find keywords, other wise defers to {@link #gather_ID_chars(long, StringBuilder, Scanner)}
      *
      * @param LINE_NUMBER of source file being read
-     * @param REMAINDER which is the extra data that should be feed back into {@link #tokenize(long, String, Scanner)}
-     * @param COMPLETE_LEXEME which is fully formed
+     * @param LEXEME_START is the lexem formed thus far
+     * @param INPUT source to be read from
      * @return the created {@link Token} wrapped in a {@link Returned_Data}
      */
-    private static Returned_Data parse_potentional_ID(final long LINE_NUMBER, final String REMAINDER, final String COMPLETE_LEXEME)
+    private static Returned_Data gather_keyword_chars(final long LINE_NUMBER, final StringBuilder LEXEME_START, final Scanner INPUT)
     {
-        //Test lexeme for special sequences, if all fail then is in fact ID.
-        if (TREE_START.matcher(COMPLETE_LEXEME).matches())
-            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.TREE_START, LINE_NUMBER, COMPLETE_LEXEME));
-        else if (DISPLAY_START.matcher(COMPLETE_LEXEME).matches())
-            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DISPLAY_START, LINE_NUMBER, COMPLETE_LEXEME));
-        else if (PROBABILITY_START.matcher(COMPLETE_LEXEME).matches())
-            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.PROBABILITY_START, LINE_NUMBER, COMPLETE_LEXEME));
-        else if (DECK_START.matcher(COMPLETE_LEXEME).matches())
-            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DECK_START, LINE_NUMBER, COMPLETE_LEXEME));
-        else
-            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, COMPLETE_LEXEME));
+        String placeholder;
+
+        while (INPUT.hasNext())
+        {
+            placeholder = INPUT.next();
+
+            //check for keyword end
+            if (WHITE_SPACE_CHAR.matcher(placeholder).matches())
+            {
+                final String LEXEM = LEXEME_START.toString();
+
+                if (AND.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.AND, LINE_NUMBER, LEXEM));
+                else if (OR.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.OR, LINE_NUMBER, LEXEM));
+                else if (NOT.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.NOT, LINE_NUMBER, LEXEM));
+                else if (XOR.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.XOR, LINE_NUMBER, LEXEM));
+                else
+                    return gather_ID_chars(LINE_NUMBER, LEXEME_START.append(placeholder), INPUT);
+            }
+            //check for restricted char
+            else if (RESTRICTED_CHARS.contains(placeholder))
+            {
+                final String LEXEM = LEXEME_START.toString();
+
+                if (AND.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.AND, LINE_NUMBER, LEXEM));
+                else if (OR.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.OR, LINE_NUMBER, LEXEM));
+                else if (NOT.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.NOT, LINE_NUMBER, LEXEM));
+                else if (XOR.matcher(LEXEM).matches())
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.XOR, LINE_NUMBER, LEXEM));
+                else
+                    return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEM));
+            }
+            //keep building
+            else
+                LEXEME_START.append(placeholder);
+        }
+
+        return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString()));
     }

     /**
@@ -289,27 +317,25 @@ private static Returned_Data gather_ID_chars(final long LINE_NUMBER, final Strin
     }

     /**
-     * Subroutine to create a {@link Lexeme_Types#ID} {@link Token}.
+     * Determines which {@link Token} should be formed given the lexeme provided.
      *
      * @param LINE_NUMBER of source file being read
-     * @param LEXEME_START is the lexem formed thus far
-     * @param INPUT source to be read from
+     * @param REMAINDER which is the extra data that should be feed back into {@link #tokenize(long, String, Scanner)}
+     * @param COMPLETE_LEXEME which is fully formed
      * @return the created {@link Token} wrapped in a {@link Returned_Data}
      */
-    private static Returned_Data parse_ID(final long LINE_NUMBER, final StringBuilder LEXEME_START, final Scanner INPUT)
+    private static Returned_Data parse_potentional_ID(final long LINE_NUMBER, final String REMAINDER, final String COMPLETE_LEXEME)
     {
-        String placeholder;
-
-        while (INPUT.hasNext())
-        {
-            placeholder = INPUT.next();
-
-            if (ID_CHAR_SET.matcher(placeholder).matches())
-                LEXEME_START.append(placeholder);
-            else
-                return new Returned_Data(placeholder, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString()));
-        }
-
-        return new Returned_Data(new Token(Token.Lexeme_Types.ID, LINE_NUMBER, LEXEME_START.toString()));
+        //Test lexeme for special sequences, if all fail then is in fact ID.
+        if (TREE_START.matcher(COMPLETE_LEXEME).matches())
+            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.TREE_START, LINE_NUMBER, COMPLETE_LEXEME));
+        else if (DISPLAY_START.matcher(COMPLETE_LEXEME).matches())
+            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DISPLAY_START, LINE_NUMBER, COMPLETE_LEXEME));
+        else if (PROBABILITY_START.matcher(COMPLETE_LEXEME).matches())
+            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.PROBABILITY_START, LINE_NUMBER, COMPLETE_LEXEME));
+        else if (DECK_START.matcher(COMPLETE_LEXEME).matches())
+            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.DECK_START, LINE_NUMBER, COMPLETE_LEXEME));
+        else
+            return new Returned_Data(REMAINDER, new Token(Token.Lexeme_Types.ID, LINE_NUMBER, COMPLETE_LEXEME));
     }
 }
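
For readers following the Tokenizer change, the heart of the new gather_keyword_chars is the keyword-versus-ID decision made once whitespace or a restricted character ends a lexeme. The standalone sketch below is illustrative only (it is not repository code; the class and method names are invented), but the four Pattern constants copy the NOT/AND/OR/XOR definitions added in this commit.

import java.util.regex.Pattern;

public class KeywordMatchSketch
{
    //same operator patterns as the Tokenizer constants added above
    static final Pattern NOT = Pattern.compile("\\s*NOT\\s*");
    static final Pattern AND = Pattern.compile("\\s*AND\\s*");
    static final Pattern OR = Pattern.compile("\\s*OR\\s*");
    static final Pattern XOR = Pattern.compile("\\s*XOR\\s*");

    /** Classifies a completed lexeme; anything that is not an operator keyword falls back to ID. */
    static String classify(final String LEXEME)
    {
        if (AND.matcher(LEXEME).matches())
            return "AND";
        else if (OR.matcher(LEXEME).matches())
            return "OR";
        else if (NOT.matcher(LEXEME).matches())
            return "NOT";
        else if (XOR.matcher(LEXEME).matches())
            return "XOR";
        else
            return "ID";
    }

    public static void main(String[] args)
    {
        System.out.println(classify("AND"));   //AND
        System.out.println(classify(" XOR ")); //XOR, surrounding whitespace is absorbed by \s*
        System.out.println(classify("ANDY"));  //ID, only a whole keyword matches
    }
}

Because Matcher.matches() must consume the entire lexeme, operators are recognized only as standalone words; longer identifiers such as ANDY fall through to the ID branches, which corresponds to the else cases in gather_keyword_chars.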
