Skip to content

Commit 52d1de6

Browse files
committed
Fix lexer data structure
1 parent 70b3c11 commit 52d1de6

File tree

4 files changed

+111
-53
lines changed

4 files changed

+111
-53
lines changed

include/rbs/lexer.h

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -126,20 +126,26 @@ typedef struct {
126126
* The lexer state is the current token.
127127
*
128128
* ```
129-
* ... "a string token"
130-
* ^ start position
131-
* ^ current position
132-
* ~~~~~~ Token => "a str
129+
#. 0.1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6
130+
* ... " a s t r i n g t o k e n "
131+
* ^ start position (0)
132+
* ^ current position (6)
133+
* ^ current character ('i', bytes = 1)
134+
* ~~~~~~~~~~~ Token => "a str
133135
* ```
134136
* */
135137
typedef struct {
136138
rbs_string_t string;
137-
int start_pos; /* The character position that defines the start of the input */
138-
int end_pos; /* The character position that defines the end of the input */
139-
rbs_position_t current; /* The current position */
140-
rbs_position_t start; /* The start position of the current token */
139+
int start_pos; /* The character position that defines the start of the input */
140+
int end_pos; /* The character position that defines the end of the input */
141+
rbs_position_t current; /* The current position: just before the current_character */
142+
rbs_position_t start; /* The start position of the current token */
143+
144+
unsigned int current_code_point; /* Current character code point */
145+
size_t current_character_bytes; /* Current character byte length (0 or 1~4) */
146+
141147
bool first_token_of_line; /* This flag is used for tLINECOMMENT */
142-
unsigned int last_char; /* Last peeked character */
148+
143149
const rbs_encoding_t *encoding;
144150
} rbs_lexer_t;
145151

@@ -159,15 +165,23 @@ int rbs_token_bytes(rbs_token_t tok);
159165
const char *rbs_token_type_str(enum RBSTokenType type);
160166

161167
/**
162-
* Read next character.
168+
* Returns the next character.
163169
* */
164170
unsigned int rbs_peek(rbs_lexer_t *lexer);
165171

166172
/**
167-
* Skip one character.
173+
* Advances the current position by one character.
168174
* */
169175
void rbs_skip(rbs_lexer_t *lexer);
170176

177+
/**
178+
* Reads the next character and stores its code point and byte length in the given pointers.
179+
*
180+
* This doesn't update the lexer state.
181+
* Returns `true` on success, or `false` if EOF has been reached.
182+
* */
183+
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *bytes);
184+
171185
/**
172186
* Skip n characters.
173187
* */
@@ -187,4 +201,6 @@ rbs_token_t rbs_lexer_next_token(rbs_lexer_t *lexer);
187201

188202
void rbs_print_token(rbs_token_t tok);
189203

204+
void rbs_print_lexer(rbs_lexer_t *lexer);
205+
190206
#endif

src/lexstate.c

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#include "rbs/defines.h"
12
#include "rbs/lexer.h"
3+
#include "rbs/util/rbs_assert.h"
24

35
static const char *RBS_TOKENTYPE_NAMES[] = {
46
"NullType",
@@ -112,17 +114,60 @@ int rbs_token_bytes(rbs_token_t tok) {
112114
}
113115

114116
unsigned int rbs_peek(rbs_lexer_t *lexer) {
115-
if (lexer->current.char_pos == lexer->end_pos) {
116-
lexer->last_char = '\0';
117-
return 0;
117+
return lexer->current_code_point;
118+
}
119+
120+
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) {
121+
if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) {
122+
return false;
123+
}
124+
125+
const char *start = lexer->string.start + lexer->current.byte_pos;
126+
127+
// Fast path for ASCII (single-byte) characters
128+
if ((unsigned int) *start < 128) {
129+
*codepoint = (unsigned int) *start;
130+
*byte_len = 1;
131+
return true;
132+
}
133+
134+
*byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
135+
136+
if (*byte_len == 1) {
137+
*codepoint = (unsigned int) *start;
118138
} else {
119-
rbs_string_t str = rbs_string_new(
120-
lexer->string.start + lexer->current.byte_pos,
121-
lexer->string.end
122-
);
123-
unsigned int c = rbs_utf8_string_to_codepoint(str);
124-
lexer->last_char = c;
125-
return c;
139+
*codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode
140+
}
141+
142+
return true;
143+
}
144+
145+
void rbs_skip(rbs_lexer_t *lexer) {
146+
rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0");
147+
148+
if (RBS_UNLIKELY(lexer->current_code_point == '\0')) {
149+
return;
150+
}
151+
152+
unsigned int codepoint;
153+
size_t byte_len;
154+
155+
lexer->current.byte_pos += lexer->current_character_bytes;
156+
lexer->current.char_pos += 1;
157+
if (lexer->current_code_point == '\n') {
158+
lexer->current.line += 1;
159+
lexer->current.column = 0;
160+
lexer->first_token_of_line = true;
161+
} else {
162+
lexer->current.column += 1;
163+
}
164+
165+
if (rbs_next_char(lexer, &codepoint, &byte_len)) {
166+
lexer->current_code_point = codepoint;
167+
lexer->current_character_bytes = byte_len;
168+
} else {
169+
lexer->current_character_bytes = 1;
170+
lexer->current_code_point = '\0';
126171
}
127172
}
128173

@@ -156,35 +201,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) {
156201
}
157202
}
158203

159-
void rbs_skip(rbs_lexer_t *lexer) {
160-
if (!lexer->last_char) {
161-
rbs_peek(lexer);
162-
}
163-
164-
size_t byte_len;
165-
166-
if (lexer->last_char == '\0') {
167-
byte_len = 1;
168-
} else {
169-
const char *start = lexer->string.start + lexer->current.byte_pos;
170-
byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
171-
}
172-
173-
lexer->current.char_pos += 1;
174-
lexer->current.byte_pos += byte_len;
175-
176-
if (lexer->last_char == '\n') {
177-
lexer->current.line += 1;
178-
lexer->current.column = 0;
179-
lexer->first_token_of_line = true;
180-
} else {
181-
lexer->current.column += 1;
182-
}
183-
}
184-
185204
void rbs_skipn(rbs_lexer_t *lexer, size_t size) {
186205
for (size_t i = 0; i < size; i++) {
187-
rbs_peek(lexer);
188206
rbs_skip(lexer);
189207
}
190208
}

src/parser.c

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <string.h>
88

99
#include "rbs/defines.h"
10+
#include "rbs/lexer.h"
1011
#include "rbs/string.h"
1112
#include "rbs/util/rbs_unescape.h"
1213
#include "rbs/util/rbs_buffer.h"
@@ -3456,6 +3457,14 @@ void rbs_print_token(rbs_token_t tok) {
34563457
);
34573458
}
34583459

3460+
void rbs_print_lexer(rbs_lexer_t *lexer) {
3461+
printf("Lexer: (range = %d...%d, encoding = %s\n", lexer->start_pos, lexer->end_pos, lexer->encoding->name);
3462+
printf(" start = { char_pos = %d, byte_pos = %d }\n", lexer->start.char_pos, lexer->start.byte_pos);
3463+
printf(" current = { char_pos = %d, byte_pos = %d }\n", lexer->current.char_pos, lexer->current.byte_pos);
3464+
printf(" character = { code_point = %d (%c), bytes = %zu }\n", lexer->current_code_point, lexer->current_code_point < 256 ? lexer->current_code_point : '?', lexer->current_character_bytes);
3465+
printf(" first_token_of_line = %s\n", lexer->first_token_of_line ? "true" : "false");
3466+
}
3467+
34593468
rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line) {
34603469
int comment_line = subject_line - 1;
34613470

@@ -3484,14 +3493,28 @@ rbs_lexer_t *rbs_lexer_new(rbs_allocator_t *allocator, rbs_string_t string, cons
34843493
.end_pos = end_pos,
34853494
.current = start_position,
34863495
.start = { 0 },
3487-
.first_token_of_line = false,
3488-
.last_char = 0,
3496+
.first_token_of_line = true,
3497+
.current_character_bytes = 0,
3498+
.current_code_point = '\0',
34893499
.encoding = encoding,
34903500
};
34913501

3492-
rbs_skipn(lexer, start_pos);
3502+
unsigned int codepoint;
3503+
size_t bytes;
3504+
3505+
if (rbs_next_char(lexer, &codepoint, &bytes)) {
3506+
lexer->current_code_point = codepoint;
3507+
lexer->current_character_bytes = bytes;
3508+
} else {
3509+
lexer->current_code_point = '\0';
3510+
lexer->current_character_bytes = 1;
3511+
}
3512+
3513+
if (start_pos > 0) {
3514+
rbs_skipn(lexer, start_pos);
3515+
}
3516+
34933517
lexer->start = lexer->current;
3494-
lexer->first_token_of_line = lexer->current.column == 0;
34953518

34963519
return lexer;
34973520
}

src/string.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "rbs/string.h"
2+
#include "rbs/defines.h"
23

34
#include <stdlib.h>
45
#include <string.h>
@@ -14,7 +15,7 @@ unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
1415

1516
if (s >= end) return 0; // End of string
1617

17-
if ((*s & 0x80) == 0) {
18+
if (RBS_LIKELY((*s & 0x80) == 0)) {
1819
// Single byte character (0xxxxxxx)
1920
return *s;
2021
} else if ((*s & 0xE0) == 0xC0) {

0 commit comments

Comments
 (0)