Skip to content

Commit 636d57b

Browse files
byroot authored and hsbt committed
[ruby/strscan] Micro optimize encoding checks
(ruby/strscan#117)

Profiling shows a lot of time spent in various encoding check functions. I'm working on optimizing them on the Ruby side, but if we assume most strings are one of the simple 3 encodings, we can skip a lot of overhead.

```ruby
require 'strscan'
require 'benchmark/ips'

source = 10_000.times.map { rand(9999999).to_s }.join(",").force_encoding(Encoding::UTF_8).freeze

def scan_to_i(source)
  scanner = StringScanner.new(source)
  while number = scanner.scan(/\d+/)
    number.to_i
    scanner.skip(",")
  end
end

def scan_integer(source)
  scanner = StringScanner.new(source)
  while scanner.scan_integer
    scanner.skip(",")
  end
end

Benchmark.ips do |x|
  x.report("scan.to_i") { scan_to_i(source) }
  x.report("scan_integer") { scan_integer(source) }
  x.compare!
end
```

Before:

```
ruby 3.3.4 (2024-07-09 revision ruby/strscan@be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
scan.to_i 93.000 i/100ms
scan_integer 232.000 i/100ms
Calculating -------------------------------------
scan.to_i 933.191 (± 0.2%) i/s (1.07 ms/i) - 4.743k in 5.082597s
scan_integer 2.326k (± 0.8%) i/s (429.99 μs/i) - 11.832k in 5.087974s

Comparison:
scan_integer: 2325.6 i/s
scan.to_i: 933.2 i/s - 2.49x slower
```

After:

```
ruby 3.3.4 (2024-07-09 revision ruby/strscan@be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
scan.to_i 96.000 i/100ms
scan_integer 274.000 i/100ms
Calculating -------------------------------------
scan.to_i 969.489 (± 0.2%) i/s (1.03 ms/i) - 4.896k in 5.050114s
scan_integer 2.756k (± 0.1%) i/s (362.88 μs/i) - 13.974k in 5.070837s

Comparison:
scan_integer: 2755.8 i/s
scan.to_i: 969.5 i/s - 2.84x slower
```

ruby/strscan@c02b1ce684
1 parent 79cc3d2 commit 636d57b

File tree

1 file changed

+38
-3
lines changed

1 file changed

+38
-3
lines changed

ext/strscan/strscan.c

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ static VALUE StringScanner;
3232
static VALUE ScanError;
3333
static ID id_byteslice;
3434

35+
/* Encoding indexes for US-ASCII, UTF-8 and binary (ASCII-8BIT); assigned in
 * Init_strscan() and compared against in strscan_ascii_compat_fastpath(). */
static int usascii_encindex, utf8_encindex, binary_encindex;
36+
3537
struct strscanner
3638
{
3739
/* multi-purpose flags */
@@ -683,6 +685,14 @@ strscan_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_pt
683685
ONIG_OPTION_NONE);
684686
}
685687

688+
static void
689+
strscan_enc_check(VALUE str1, VALUE str2)
690+
{
691+
if (RB_ENCODING_GET(str1) != RB_ENCODING_GET(str2)) {
692+
rb_enc_check(str1, str2);
693+
}
694+
}
695+
686696
static VALUE
687697
strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly)
688698
{
@@ -710,18 +720,21 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
710720
}
711721
else {
712722
StringValue(pattern);
713-
rb_encoding *enc = rb_enc_check(p->str, pattern);
714723
if (S_RESTLEN(p) < RSTRING_LEN(pattern)) {
724+
strscan_enc_check(p->str, pattern);
715725
return Qnil;
716726
}
717727

718728
if (headonly) {
729+
strscan_enc_check(p->str, pattern);
730+
719731
if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
720732
return Qnil;
721733
}
722734
set_registers(p, RSTRING_LEN(pattern));
723735
}
724736
else {
737+
rb_encoding *enc = rb_enc_check(p->str, pattern);
725738
long pos = rb_memsearch(RSTRING_PTR(pattern), RSTRING_LEN(pattern),
726739
CURPTR(p), S_RESTLEN(p), enc);
727740
if (pos == -1) {
@@ -1282,6 +1295,24 @@ strscan_parse_integer(struct strscanner *p, int base, long len)
12821295
return integer;
12831296
}
12841297

1298+
static inline bool
1299+
strscan_ascii_compat_fastpath(VALUE str) {
1300+
int encindex = ENCODING_GET_INLINED(str);
1301+
// The overwhelming majority of strings are in one of these 3 encodings.
1302+
return encindex == utf8_encindex || encindex == binary_encindex || encindex == usascii_encindex;
1303+
}
1304+
1305+
static inline void
1306+
strscan_must_ascii_compat(VALUE str)
1307+
{
1308+
// The overwhelming majority of strings are in one of these 3 encodings.
1309+
if (RB_LIKELY(strscan_ascii_compat_fastpath(str))) {
1310+
return;
1311+
}
1312+
1313+
rb_must_asciicompat(str);
1314+
}
1315+
12851316
static VALUE
12861317
strscan_scan_base10_integer(VALUE self)
12871318
{
@@ -1292,7 +1323,7 @@ strscan_scan_base10_integer(VALUE self)
12921323
GET_SCANNER(self, p);
12931324
CLEAR_MATCH_STATUS(p);
12941325

1295-
rb_must_asciicompat(p->str);
1326+
strscan_must_ascii_compat(p->str);
12961327

12971328
ptr = CURPTR(p);
12981329

@@ -1330,7 +1361,7 @@ strscan_scan_base16_integer(VALUE self)
13301361
GET_SCANNER(self, p);
13311362
CLEAR_MATCH_STATUS(p);
13321363

1333-
rb_must_asciicompat(p->str);
1364+
strscan_must_ascii_compat(p->str);
13341365

13351366
ptr = CURPTR(p);
13361367

@@ -2251,6 +2282,10 @@ Init_strscan(void)
22512282

22522283
id_byteslice = rb_intern("byteslice");
22532284

2285+
usascii_encindex = rb_usascii_encindex();
2286+
utf8_encindex = rb_utf8_encindex();
2287+
binary_encindex = rb_ascii8bit_encindex();
2288+
22542289
StringScanner = rb_define_class("StringScanner", rb_cObject);
22552290
ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError);
22562291
if (!rb_const_defined(rb_cObject, id_scanerr)) {

0 commit comments

Comments
 (0)