|
| 1 | +#include "rbs/defines.h" |
1 | 2 | #include "rbs/lexer.h" |
| 3 | +#include "rbs/util/rbs_assert.h" |
2 | 4 |
|
3 | 5 | static const char *RBS_TOKENTYPE_NAMES[] = { |
4 | 6 | "NullType", |
@@ -112,17 +114,60 @@ int rbs_token_bytes(rbs_token_t tok) { |
112 | 114 | } |
113 | 115 |
|
114 | 116 | unsigned int rbs_peek(rbs_lexer_t *lexer) { |
115 | | - if (lexer->current.char_pos == lexer->end_pos) { |
116 | | - lexer->last_char = '\0'; |
117 | | - return 0; |
| 117 | + return lexer->current_code_point; |
| 118 | +} |
| 119 | + |
| 120 | +bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) { |
| 121 | + if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) { |
| 122 | + return false; |
| 123 | + } |
| 124 | + |
| 125 | + const char *start = lexer->string.start + lexer->current.byte_pos; |
| 126 | + |
| 127 | + // Fast path for ASCII (single-byte) characters |
| 128 | + if ((unsigned int) *start < 128) { |
| 129 | + *codepoint = (unsigned int) *start; |
| 130 | + *byte_len = 1; |
| 131 | + return true; |
| 132 | + } |
| 133 | + |
| 134 | + *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); |
| 135 | + |
| 136 | + if (*byte_len == 1) { |
| 137 | + *codepoint = (unsigned int) *start; |
118 | 138 | } else { |
119 | | - rbs_string_t str = rbs_string_new( |
120 | | - lexer->string.start + lexer->current.byte_pos, |
121 | | - lexer->string.end |
122 | | - ); |
123 | | - unsigned int c = rbs_utf8_string_to_codepoint(str); |
124 | | - lexer->last_char = c; |
125 | | - return c; |
| 139 | + *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode |
| 140 | + } |
| 141 | + |
| 142 | + return true; |
| 143 | +} |
| 144 | + |
| 145 | +void rbs_skip(rbs_lexer_t *lexer) { |
| 146 | + rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0"); |
| 147 | + |
| 148 | + if (RBS_UNLIKELY(lexer->current_code_point == '\0')) { |
| 149 | + return; |
| 150 | + } |
| 151 | + |
| 152 | + unsigned int codepoint; |
| 153 | + size_t byte_len; |
| 154 | + |
| 155 | + lexer->current.byte_pos += lexer->current_character_bytes; |
| 156 | + lexer->current.char_pos += 1; |
| 157 | + if (lexer->current_code_point == '\n') { |
| 158 | + lexer->current.line += 1; |
| 159 | + lexer->current.column = 0; |
| 160 | + lexer->first_token_of_line = true; |
| 161 | + } else { |
| 162 | + lexer->current.column += 1; |
| 163 | + } |
| 164 | + |
| 165 | + if (rbs_next_char(lexer, &codepoint, &byte_len)) { |
| 166 | + lexer->current_code_point = codepoint; |
| 167 | + lexer->current_character_bytes = byte_len; |
| 168 | + } else { |
| 169 | + lexer->current_character_bytes = 1; |
| 170 | + lexer->current_code_point = '\0'; |
126 | 171 | } |
127 | 172 | } |
128 | 173 |
|
@@ -156,35 +201,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) { |
156 | 201 | } |
157 | 202 | } |
158 | 203 |
|
159 | | -void rbs_skip(rbs_lexer_t *lexer) { |
160 | | - if (!lexer->last_char) { |
161 | | - rbs_peek(lexer); |
162 | | - } |
163 | | - |
164 | | - size_t byte_len; |
165 | | - |
166 | | - if (lexer->last_char == '\0') { |
167 | | - byte_len = 1; |
168 | | - } else { |
169 | | - const char *start = lexer->string.start + lexer->current.byte_pos; |
170 | | - byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); |
171 | | - } |
172 | | - |
173 | | - lexer->current.char_pos += 1; |
174 | | - lexer->current.byte_pos += byte_len; |
175 | | - |
176 | | - if (lexer->last_char == '\n') { |
177 | | - lexer->current.line += 1; |
178 | | - lexer->current.column = 0; |
179 | | - lexer->first_token_of_line = true; |
180 | | - } else { |
181 | | - lexer->current.column += 1; |
182 | | - } |
183 | | -} |
184 | | - |
185 | 204 | void rbs_skipn(rbs_lexer_t *lexer, size_t size) { |
186 | 205 | for (size_t i = 0; i < size; i++) { |
187 | | - rbs_peek(lexer); |
188 | 206 | rbs_skip(lexer); |
189 | 207 | } |
190 | 208 | } |
|
0 commit comments