Skip to content

Commit ffcb7e2

Browse files
authored
Merge pull request #2665 from ruby/fix-lexer
Faster lexical analyzer
2 parents caecdfe + cd148f2 commit ffcb7e2

File tree

10 files changed

+212
-56
lines changed

10 files changed

+212
-56
lines changed

Rakefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,3 +537,17 @@ task :compile_c99 do
537537
ensure
538538
ENV.delete("TEST_NO_C23")
539539
end
540+
541+
task :prepare_bench do
542+
ENV.delete("DEBUG")
543+
Rake::Task[:"clobber"].invoke
544+
Rake::Task[:"templates"].invoke
545+
Rake::Task[:"compile"].invoke
546+
end
547+
548+
task :prepare_profiling do
549+
ENV["DEBUG"] = "1"
550+
Rake::Task[:"clobber"].invoke
551+
Rake::Task[:"templates"].invoke
552+
Rake::Task[:"compile"].invoke
553+
end

bin/benchmark-parse.rb

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
require "rbs"
2+
require "benchmark/ips"
3+
require "csv"
4+
require "pathname"
5+
6+
files = {}
7+
ARGV.each do |file|
8+
content = File.read(file)
9+
files[file] = RBS::Buffer.new(content: content, name: Pathname(file))
10+
end
11+
12+
puts "Benchmarking parsing #{files.size} files..."
13+
14+
result = Benchmark.ips do |x|
15+
x.report("parsing") do
16+
files.each do |file, content|
17+
RBS::Parser.parse_signature(content)
18+
end
19+
end
20+
21+
x.quiet = true
22+
end
23+
24+
entry = result.entries[0]
25+
puts "✅ #{"%0.3f" % entry.ips} i/s (±#{"%0.3f" % entry.error_percentage}%)"

bin/profile-parse.rb

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
require 'rbs'
2+
require "optparse"
3+
4+
wait = false
5+
duration = 3
6+
7+
args = ARGV.dup
8+
9+
OptionParser.new do |opts|
10+
opts.banner = "Usage: profile-parse.rb [options] FILE"
11+
12+
opts.on("--wait", "Wait for enter before starting") do
13+
wait = true
14+
end
15+
opts.on("--duration=NUMBER", "Repeat parsing for <NUMBER> seconds") do |number|
16+
duration = number.to_i
17+
end
18+
end.parse!(args)
19+
20+
if wait
21+
puts "⏯️ Waiting for enter to continue at #{Process.pid}..."
22+
STDIN.gets
23+
end
24+
25+
file = args.shift or raise "No file path is given"
26+
sig = File.read(file)
27+
28+
puts "Parsing #{file} -- #{sig.bytesize} bytes"
29+
30+
started_at = Time.now
31+
count = 0
32+
33+
loop do
34+
count += 1
35+
RBS::Parser.parse_signature(sig)
36+
break if (Time.now - started_at) > duration
37+
end
38+
39+
puts "✅ Done #{count} loop(s)"

ext/rbs_extension/extconf.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
'-Wc++-compat',
1919
]
2020

21-
append_cflags ['-O0', '-g'] if ENV['DEBUG']
21+
append_cflags ['-O0', '-pg'] if ENV['DEBUG']
2222
if ENV["TEST_NO_C23"]
2323
puts "Adding -Wc2x-extensions to CFLAGS"
2424
$CFLAGS << " -Werror -Wc2x-extensions"

include/rbs/defines.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,24 @@
3232
#define RBS_ATTRIBUTE_FORMAT(string_index, argument_index)
3333
#endif
3434

35+
/**
36+
* Support RBS_LIKELY and RBS_UNLIKELY to help the compiler optimize its
37+
* branch prediction.
38+
*/
39+
#if defined(__GNUC__) || defined(__clang__)
40+
/** The compiler should predict that this branch will be taken. */
41+
#define RBS_LIKELY(x) __builtin_expect(!!(x), 1)
42+
43+
/** The compiler should predicate that this branch will not be taken. */
44+
#define RBS_UNLIKELY(x) __builtin_expect(!!(x), 0)
45+
#else
46+
/** Void because this platform does not support branch prediction hints. */
47+
#define RBS_LIKELY(x) (x)
48+
49+
/** Void because this platform does not support branch prediction hints. */
50+
#define RBS_UNLIKELY(x) (x)
51+
#endif
52+
3553
/**
3654
* We use -Wimplicit-fallthrough to guard against potentially unintended fall-through between cases of a switch.
3755
* Use RBS_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional.

include/rbs/lexer.h

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -126,20 +126,26 @@ typedef struct {
126126
* The lexer state is the curren token.
127127
*
128128
* ```
129-
* ... "a string token"
130-
* ^ start position
131-
* ^ current position
132-
* ~~~~~~ Token => "a str
129+
* 0.1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6
130+
* ... " a s t r i n g t o k e n "
131+
* ^ start position (0)
132+
* ^ current position (6)
133+
* ^ current character ('i', bytes = 1)
134+
* ~~~~~~~~~~~ Token => "a str
133135
* ```
134136
* */
135137
typedef struct {
136138
rbs_string_t string;
137-
int start_pos; /* The character position that defines the start of the input */
138-
int end_pos; /* The character position that defines the end of the input */
139-
rbs_position_t current; /* The current position */
140-
rbs_position_t start; /* The start position of the current token */
139+
int start_pos; /* The character position that defines the start of the input */
140+
int end_pos; /* The character position that defines the end of the input */
141+
rbs_position_t current; /* The current position: just before the current_character */
142+
rbs_position_t start; /* The start position of the current token */
143+
144+
unsigned int current_code_point; /* Current character code point */
145+
size_t current_character_bytes; /* Current character byte length (0 or 1~4) */
146+
141147
bool first_token_of_line; /* This flag is used for tLINECOMMENT */
142-
unsigned int last_char; /* Last peeked character */
148+
143149
const rbs_encoding_t *encoding;
144150
} rbs_lexer_t;
145151

@@ -159,15 +165,23 @@ int rbs_token_bytes(rbs_token_t tok);
159165
const char *rbs_token_type_str(enum RBSTokenType type);
160166

161167
/**
162-
* Read next character.
168+
* Returns the next character.
163169
* */
164170
unsigned int rbs_peek(rbs_lexer_t *lexer);
165171

166172
/**
167-
* Skip one character.
173+
* Advances the current position by one character.
168174
* */
169175
void rbs_skip(rbs_lexer_t *lexer);
170176

177+
/**
178+
* Reads the next character and stores its code point and byte length through the given pointers.
179+
*
180+
* This doesn't update the lexer state.
181+
* Returns `true` on success, or `false` if the end of the input has been reached.
182+
* */
183+
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *bytes);
184+
171185
/**
172186
* Skip n characters.
173187
* */
@@ -187,4 +201,6 @@ rbs_token_t rbs_lexer_next_token(rbs_lexer_t *lexer);
187201

188202
void rbs_print_token(rbs_token_t tok);
189203

204+
void rbs_print_lexer(rbs_lexer_t *lexer);
205+
190206
#endif

src/lexstate.c

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#include "rbs/defines.h"
12
#include "rbs/lexer.h"
3+
#include "rbs/util/rbs_assert.h"
24

35
static const char *RBS_TOKENTYPE_NAMES[] = {
46
"NullType",
@@ -112,17 +114,60 @@ int rbs_token_bytes(rbs_token_t tok) {
112114
}
113115

114116
unsigned int rbs_peek(rbs_lexer_t *lexer) {
115-
if (lexer->current.char_pos == lexer->end_pos) {
116-
lexer->last_char = '\0';
117-
return 0;
117+
return lexer->current_code_point;
118+
}
119+
120+
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) {
121+
if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) {
122+
return false;
123+
}
124+
125+
const char *start = lexer->string.start + lexer->current.byte_pos;
126+
127+
// Fast path for ASCII (single-byte) characters
128+
if ((unsigned int) *start < 128) {
129+
*codepoint = (unsigned int) *start;
130+
*byte_len = 1;
131+
return true;
132+
}
133+
134+
*byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
135+
136+
if (*byte_len == 1) {
137+
*codepoint = (unsigned int) *start;
118138
} else {
119-
rbs_string_t str = rbs_string_new(
120-
lexer->string.start + lexer->current.byte_pos,
121-
lexer->string.end
122-
);
123-
unsigned int c = rbs_utf8_string_to_codepoint(str);
124-
lexer->last_char = c;
125-
return c;
139+
*codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode
140+
}
141+
142+
return true;
143+
}
144+
145+
void rbs_skip(rbs_lexer_t *lexer) {
146+
rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0");
147+
148+
if (RBS_UNLIKELY(lexer->current_code_point == '\0')) {
149+
return;
150+
}
151+
152+
unsigned int codepoint;
153+
size_t byte_len;
154+
155+
lexer->current.byte_pos += lexer->current_character_bytes;
156+
lexer->current.char_pos += 1;
157+
if (lexer->current_code_point == '\n') {
158+
lexer->current.line += 1;
159+
lexer->current.column = 0;
160+
lexer->first_token_of_line = true;
161+
} else {
162+
lexer->current.column += 1;
163+
}
164+
165+
if (rbs_next_char(lexer, &codepoint, &byte_len)) {
166+
lexer->current_code_point = codepoint;
167+
lexer->current_character_bytes = byte_len;
168+
} else {
169+
lexer->current_character_bytes = 1;
170+
lexer->current_code_point = '\0';
126171
}
127172
}
128173

@@ -156,35 +201,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) {
156201
}
157202
}
158203

159-
void rbs_skip(rbs_lexer_t *lexer) {
160-
if (!lexer->last_char) {
161-
rbs_peek(lexer);
162-
}
163-
164-
size_t byte_len;
165-
166-
if (lexer->last_char == '\0') {
167-
byte_len = 1;
168-
} else {
169-
const char *start = lexer->string.start + lexer->current.byte_pos;
170-
byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
171-
}
172-
173-
lexer->current.char_pos += 1;
174-
lexer->current.byte_pos += byte_len;
175-
176-
if (lexer->last_char == '\n') {
177-
lexer->current.line += 1;
178-
lexer->current.column = 0;
179-
lexer->first_token_of_line = true;
180-
} else {
181-
lexer->current.column += 1;
182-
}
183-
}
184-
185204
void rbs_skipn(rbs_lexer_t *lexer, size_t size) {
186205
for (size_t i = 0; i < size; i++) {
187-
rbs_peek(lexer);
188206
rbs_skip(lexer);
189207
}
190208
}

src/parser.c

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <string.h>
88

99
#include "rbs/defines.h"
10+
#include "rbs/lexer.h"
1011
#include "rbs/string.h"
1112
#include "rbs/util/rbs_unescape.h"
1213
#include "rbs/util/rbs_buffer.h"
@@ -3456,6 +3457,14 @@ void rbs_print_token(rbs_token_t tok) {
34563457
);
34573458
}
34583459

3460+
void rbs_print_lexer(rbs_lexer_t *lexer) {
3461+
printf("Lexer: (range = %d...%d, encoding = %s\n", lexer->start_pos, lexer->end_pos, lexer->encoding->name);
3462+
printf(" start = { char_pos = %d, byte_pos = %d }\n", lexer->start.char_pos, lexer->start.byte_pos);
3463+
printf(" current = { char_pos = %d, byte_pos = %d }\n", lexer->current.char_pos, lexer->current.byte_pos);
3464+
printf(" character = { code_point = %d (%c), bytes = %zu }\n", lexer->current_code_point, lexer->current_code_point < 256 ? lexer->current_code_point : '?', lexer->current_character_bytes);
3465+
printf(" first_token_of_line = %s\n", lexer->first_token_of_line ? "true" : "false");
3466+
}
3467+
34593468
rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line) {
34603469
int comment_line = subject_line - 1;
34613470

@@ -3484,14 +3493,28 @@ rbs_lexer_t *rbs_lexer_new(rbs_allocator_t *allocator, rbs_string_t string, cons
34843493
.end_pos = end_pos,
34853494
.current = start_position,
34863495
.start = { 0 },
3487-
.first_token_of_line = false,
3488-
.last_char = 0,
3496+
.first_token_of_line = true,
3497+
.current_character_bytes = 0,
3498+
.current_code_point = '\0',
34893499
.encoding = encoding,
34903500
};
34913501

3492-
rbs_skipn(lexer, start_pos);
3502+
unsigned int codepoint;
3503+
size_t bytes;
3504+
3505+
if (rbs_next_char(lexer, &codepoint, &bytes)) {
3506+
lexer->current_code_point = codepoint;
3507+
lexer->current_character_bytes = bytes;
3508+
} else {
3509+
lexer->current_code_point = '\0';
3510+
lexer->current_character_bytes = 1;
3511+
}
3512+
3513+
if (start_pos > 0) {
3514+
rbs_skipn(lexer, start_pos);
3515+
}
3516+
34933517
lexer->start = lexer->current;
3494-
lexer->first_token_of_line = lexer->current.column == 0;
34953518

34963519
return lexer;
34973520
}

src/string.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "rbs/string.h"
2+
#include "rbs/defines.h"
23

34
#include <stdlib.h>
45
#include <string.h>
@@ -14,7 +15,7 @@ unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
1415

1516
if (s >= end) return 0; // End of string
1617

17-
if ((*s & 0x80) == 0) {
18+
if (RBS_LIKELY((*s & 0x80) == 0)) {
1819
// Single byte character (0xxxxxxx)
1920
return *s;
2021
} else if ((*s & 0xE0) == 0xC0) {

0 commit comments

Comments
 (0)