Skip to content

Commit f32d507

Browse files
committed
Elide string allocation when using String#gsub in MAP mode
If the provided Hash doesn't have a default proc, we know for sure that we'll never call into user-provided code, hence the string we allocate to access the Hash can't possibly escape.

So we don't actually have to allocate it, we can use a fake_str, AKA a stack-allocated string.

```
compare-ruby: ruby 3.5.0dev (2025-02-10T13:47:44Z master 3fb455a) +PRISM [arm64-darwin23]
built-ruby: ruby 3.5.0dev (2025-02-10T17:09:52Z opt-gsub-alloc ea5c28958f) +PRISM [arm64-darwin23]
warming up....

|                 |compare-ruby|built-ruby|
|:----------------|-----------:|---------:|
|escape           |      3.374k|    3.722k|
|                 |           -|     1.10x|
|escape_bin       |      5.469k|    6.587k|
|                 |           -|     1.20x|
|escape_utf8      |      3.465k|    3.734k|
|                 |           -|     1.08x|
|escape_utf8_bin  |      5.752k|    7.283k|
|                 |           -|     1.27x|
```
1 parent b8db606 commit f32d507

File tree

5 files changed

+63
-3
lines changed

5 files changed

+63
-3
lines changed

benchmark/string_gsub.yml

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
prelude: |
2+
# frozen_string_literal: true
3+
STR = ((("a" * 31) + "<") * 1000).freeze
4+
STR_UNICODE = ((("a" * 30) + "\u2028") * 1000).freeze
5+
ESCAPED_CHARS_BINARY = {
6+
"\u2028".b => '\u2028'.b,
7+
"\u2029".b => '\u2029'.b,
8+
">".b => '\u003e'.b.freeze,
9+
"<".b => '\u003c'.b.freeze,
10+
"&".b => '\u0026'.b.freeze,
11+
}
12+
BINARY_PATTERN = Regexp.union(ESCAPED_CHARS_BINARY.keys)
13+
14+
ESCAPED_CHARS = {
15+
"\u2028" => '\u2028',
16+
"\u2029" => '\u2029',
17+
">" => '\u003e',
18+
"<" => '\u003c',
19+
"&" => '\u0026',
20+
}
21+
ESCAPE_PATTERN = Regexp.union(ESCAPED_CHARS.keys)
22+
23+
24+
benchmark:
25+
escape: |
26+
str = STR.dup
27+
str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
28+
str
29+
30+
escape_bin: |
31+
str = STR.b
32+
str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
33+
str.force_encoding(Encoding::UTF_8)
34+
35+
escape_utf8: |
36+
str = STR_UNICODE.dup
37+
str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
38+
str
39+
40+
escape_utf8_bin: |
41+
str = STR_UNICODE.b
42+
str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
43+
str.force_encoding(Encoding::UTF_8)

common.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17878,6 +17878,7 @@ string.$(OBJEXT): $(top_srcdir)/internal/encoding.h
1787817878
string.$(OBJEXT): $(top_srcdir)/internal/error.h
1787917879
string.$(OBJEXT): $(top_srcdir)/internal/fixnum.h
1788017880
string.$(OBJEXT): $(top_srcdir)/internal/gc.h
17881+
string.$(OBJEXT): $(top_srcdir)/internal/hash.h
1788117882
string.$(OBJEXT): $(top_srcdir)/internal/imemo.h
1788217883
string.$(OBJEXT): $(top_srcdir)/internal/numeric.h
1788317884
string.$(OBJEXT): $(top_srcdir)/internal/object.h

hash.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2037,7 +2037,7 @@ call_default_proc(VALUE proc, VALUE hash, VALUE key)
20372037
return rb_proc_call_with_block(proc, 2, args, Qnil);
20382038
}
20392039

2040-
static bool
2040+
bool
20412041
rb_hash_default_unredefined(VALUE hash)
20422042
{
20432043
VALUE klass = RBASIC_CLASS(hash);

internal/hash.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ VALUE rb_hash_set_pair(VALUE hash, VALUE pair);
8686
int rb_hash_stlike_delete(VALUE hash, st_data_t *pkey, st_data_t *pval);
8787
int rb_hash_stlike_foreach_with_replace(VALUE hash, st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg);
8888
int rb_hash_stlike_update(VALUE hash, st_data_t key, st_update_callback_func *func, st_data_t arg);
89+
bool rb_hash_default_unredefined(VALUE hash);
8990
VALUE rb_ident_hash_new_with_size(st_index_t size);
9091
void rb_hash_free(VALUE hash);
9192
RUBY_EXTERN VALUE rb_cHash_empty_frozen;

string.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "internal/encoding.h"
3232
#include "internal/error.h"
3333
#include "internal/gc.h"
34+
#include "internal/hash.h"
3435
#include "internal/numeric.h"
3536
#include "internal/object.h"
3637
#include "internal/proc.h"
@@ -6295,7 +6296,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
62956296
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
62966297
long beg, beg0, end0;
62976298
long offset, blen, slen, len, last;
6298-
enum {STR, ITER, MAP} mode = STR;
6299+
enum {STR, ITER, FAST_MAP, MAP} mode = STR;
62996300
char *sp, *cp;
63006301
int need_backref = -1;
63016302
rb_encoding *str_enc;
@@ -6311,6 +6312,9 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
63116312
if (NIL_P(hash)) {
63126313
StringValue(repl);
63136314
}
6315+
else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6316+
mode = FAST_MAP;
6317+
}
63146318
else {
63156319
mode = MAP;
63166320
}
@@ -6355,7 +6359,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
63556359
val = rb_obj_as_string(rb_yield(match0));
63566360
}
63576361
else {
6358-
val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6362+
struct RString fake_str;
6363+
VALUE key;
6364+
if (mode == FAST_MAP) {
6365+
// It is safe to use a fake_str here because we established that it won't escape,
6366+
// as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6367+
// default proc.
6368+
key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6369+
}
6370+
else {
6371+
key = rb_str_subseq(str, beg0, end0 - beg0);
6372+
}
6373+
val = rb_hash_aref(hash, key);
63596374
val = rb_obj_as_string(val);
63606375
}
63616376
str_mod_check(str, sp, slen);

0 commit comments

Comments
 (0)