From 6e1323107c60abbce9dbd98e741078f11aebd756 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Mon, 4 Aug 2025 16:40:57 -0400 Subject: [PATCH 01/12] Fix autoloading of encodings when transcoding (ex: String#encode) Use a managed `fast_transcoder_table` to cache results of src/dest encoding lookup. Without the cache, it could take a while because `transcode_search_path` does a BFS to get from src to dest encodings due to base encodings, aliases, etc. Since this managed table is a single-level table instead of 2 levels like `transcoder_table`, we don't need VM lock held and can use RCU. Make sure VM lock is NOT held when calling `load_transcoder_entry`, as that could call `rb_require_internal_silent`. --- depend | 2 + test/ruby/test_transcode.rb | 22 +++++++ transcode.c | 128 ++++++++++++++++++++---------------- 3 files changed, 96 insertions(+), 56 deletions(-) diff --git a/depend b/depend index ec8c2771c92104..9a0612562cc167 100644 --- a/depend +++ b/depend @@ -17730,6 +17730,7 @@ transcode.$(OBJEXT): $(top_srcdir)/internal/transcode.h transcode.$(OBJEXT): $(top_srcdir)/internal/variable.h transcode.$(OBJEXT): $(top_srcdir)/internal/warnings.h transcode.$(OBJEXT): {$(VPATH)}assert.h +transcode.$(OBJEXT): {$(VPATH)}atomic.h transcode.$(OBJEXT): {$(VPATH)}backward/2/assume.h transcode.$(OBJEXT): {$(VPATH)}backward/2/attributes.h transcode.$(OBJEXT): {$(VPATH)}backward/2/bool.h @@ -17901,6 +17902,7 @@ transcode.$(OBJEXT): {$(VPATH)}internal/xmalloc.h transcode.$(OBJEXT): {$(VPATH)}missing.h transcode.$(OBJEXT): {$(VPATH)}onigmo.h transcode.$(OBJEXT): {$(VPATH)}oniguruma.h +transcode.$(OBJEXT): {$(VPATH)}ruby_atomic.h transcode.$(OBJEXT): {$(VPATH)}shape.h transcode.$(OBJEXT): {$(VPATH)}st.h transcode.$(OBJEXT): {$(VPATH)}subst.h diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 63d37f4ba4ff90..2ace6d0e63a58f 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -2320,6 +2320,28 @@ def test_newline_options assert_equal("A\nB\nC", s.encode(usascii, newline: :lf)) end + def test_ractor_lazy_load_encoding + assert_ractor("#{<<~"begin;"}\n#{<<~'end;'}") + begin; + rs = [] + autoload_encodings = Encoding.list.select { |e| e.inspect.include?("(autoload)") }.freeze + 7.times do + rs << Ractor.new(autoload_encodings) do |encodings| + str = "\u0300" + encodings.each do |enc| + str.encode(enc) rescue Encoding::UndefinedConversionError + end + end + end + + while rs.any? + r, _obj = Ractor.select(*rs) + rs.delete(r) + end + assert rs.empty? + end; + end + private def assert_conversion_both_ways_utf8(utf8, raw, encoding) diff --git a/transcode.c b/transcode.c index d8cd90e56d5cf7..688f78c3d8fc48 100644 --- a/transcode.c +++ b/transcode.c @@ -21,8 +21,10 @@ #include "internal/transcode.h" #include "ruby/encoding.h" #include "vm_sync.h" +#include "ruby_atomic.h" #include "transcode_data.h" +#include "id_table.h" #include "id.h" #define ENABLE_ECONV_NEWLINE_OPTION 1 @@ -62,6 +64,8 @@ static VALUE sym_finished; static VALUE sym_after_output; static VALUE sym_incomplete_input; +static VALUE fast_transcoder_table; + static unsigned char * allocate_converted_string(const char *sname, const char *dname, const unsigned char *str, size_t len, @@ -340,7 +344,7 @@ transcode_search_path(const char *sname, const char *dname, bfs.queue_last_ptr = &q->next; bfs.queue = q; - bfs.visited = st_init_strcasetable(); + bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL); RB_VM_LOCKING() { @@ -351,14 +355,14 @@ transcode_search_path(const char *sname, const char *dname, bfs.queue_last_ptr = &bfs.queue; } - lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); + lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2 if (!lookup_res) { xfree(q); continue; } table2 = (st_table *)val; - if (st_lookup(table2, (st_data_t)dname, &val)) { + if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); xfree(q); found = true; @@ -411,8 +415,7 @@ int rb_require_internal_silent(VALUE fname); static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry) { - // changes result of entry->transcoder depending on if it's required or not, so needs lock - ASSERT_vm_locking(); + ASSERT_vm_unlocking(); if (entry->transcoder) return entry->transcoder; @@ -427,7 +430,7 @@ load_transcoder_entry(transcoder_entry_t *entry) memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len); rb_str_set_len(fn, total_len); OBJ_FREEZE(fn); - rb_require_internal_silent(fn); + rb_require_internal_silent(fn); // Sets entry->transcoder } if (entry->transcoder) @@ -977,19 +980,19 @@ rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i) } static rb_econv_t * -rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries) +rb_econv_open_by_transcoder_entries(transcoder_entry_t **entries) { rb_econv_t *ec; int i, ret; - ASSERT_vm_locking(); - for (i = 0; i < n; i++) { + for (i = 0; entries && entries[i]; i++) { const rb_transcoder *tr; tr = load_transcoder_entry(entries[i]); if (!tr) return NULL; } + int n = i; ec = rb_econv_alloc(n); for (i = 0; i < n; i++) { @@ -1015,7 +1018,9 @@ trans_open_i(const char *sname, const char *dname, int depth, void *arg) struct trans_open_t *toarg = arg; if (!toarg->entries) { - toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional); + size_t num = depth+1+toarg->num_additional+1; + toarg->entries = ALLOC_N(transcoder_entry_t *, num); + memset(toarg->entries + num - 1, 0, sizeof(transcoder_entry_t*)); // last entry is 0 } toarg->entries[depth] = get_transcoder_entry(sname, dname); } @@ -1026,12 +1031,9 @@ rb_econv_open0(const char *sname, const char *dname, int ecflags) transcoder_entry_t **entries = NULL; int num_trans; rb_econv_t *ec; - ASSERT_vm_locking(); - /* Just check if sname and dname are defined */ - /* (This check is needed?) */ - if (*sname) rb_enc_find_index(sname); - if (*dname) rb_enc_find_index(dname); + if (*sname) rb_enc_find_index(sname); // loads encoding if not already loaded + if (*dname) rb_enc_find_index(dname); // loads encoding if not already loaded if (*sname == '\0' && *dname == '\0') { num_trans = 0; @@ -1042,16 +1044,36 @@ rb_econv_open0(const char *sname, const char *dname, int ecflags) struct trans_open_t toarg; toarg.entries = NULL; toarg.num_additional = 0; - num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); - entries = toarg.entries; - if (num_trans < 0) { - xfree(entries); - return NULL; + char buf[128] = { 0 }; // encoding namelen max is currently 63 bytes, so this is enough + char *p = buf; + size_t slen = strlen(sname); + memcpy(p, sname, slen); + p += slen; + memcpy(p, ":", 1); + p += 1; + memcpy(p, dname, strlen(dname)); + ID src_to_dest_id = rb_intern(buf); + VALUE managed_val; + VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_table); + if (rb_managed_id_table_lookup(tbl, src_to_dest_id, &managed_val)) { + entries = (transcoder_entry_t **)managed_val; + } else { + num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); + entries = toarg.entries; + if (num_trans < 0) { + xfree(entries); + return NULL; + } + // No need for CAS loop if it's not most recent `fast_transcoder_table`, some values + // can be lost. It will just go through the slow path next time for the lost src/dst encoding + // pairs + VALUE new_transcoder_tbl = rb_managed_id_table_dup(tbl); + rb_managed_id_table_insert(new_transcoder_tbl, src_to_dest_id, (VALUE)entries); + RUBY_ATOMIC_VALUE_SET(fast_transcoder_table, new_transcoder_tbl); } } - ec = rb_econv_open_by_transcoder_entries(num_trans, entries); - xfree(entries); + ec = rb_econv_open_by_transcoder_entries(entries); if (!ec) return NULL; @@ -1117,15 +1139,13 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) if (num_decorators == -1) return NULL; - RB_VM_LOCKING() { - ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); - if (ec) { - for (i = 0; i < num_decorators; i++) { - if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { - rb_econv_close(ec); - ec = NULL; - break; - } + ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); + if (ec) { + for (i = 0; i < num_decorators; i++) { + if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { + rb_econv_close(ec); + ec = NULL; + break; } } } @@ -1960,14 +1980,12 @@ rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int if (ec->started != 0) return -1; - RB_VM_LOCKING() { - entry = get_transcoder_entry(sname, dname); - if (entry) { - tr = load_transcoder_entry(entry); - } - + entry = get_transcoder_entry(sname, dname); + if (entry) { + tr = load_transcoder_entry(entry); } + return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1; } @@ -2681,21 +2699,19 @@ rb_econv_open_opts(const char *source_encoding, const char *destination_encoding replacement = rb_hash_aref(opthash, sym_replace); } - RB_VM_LOCKING() { - ec = rb_econv_open(source_encoding, destination_encoding, ecflags); - if (ec) { - if (!NIL_P(replacement)) { - int ret; - rb_encoding *enc = rb_enc_get(replacement); - - ret = rb_econv_set_replacement(ec, - (const unsigned char *)RSTRING_PTR(replacement), - RSTRING_LEN(replacement), - rb_enc_name(enc)); - if (ret == -1) { - rb_econv_close(ec); - ec = NULL; - } + ec = rb_econv_open(source_encoding, destination_encoding, ecflags); + if (ec) { + if (!NIL_P(replacement)) { + int ret; + rb_encoding *enc = rb_enc_get(replacement); + + ret = rb_econv_set_replacement(ec, + (const unsigned char *)RSTRING_PTR(replacement), + RSTRING_LEN(replacement), + rb_enc_name(enc)); + if (ret == -1) { + rb_econv_close(ec); + ec = NULL; } } } @@ -3132,10 +3148,8 @@ decorate_convpath(VALUE convpath, int ecflags) const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1))); transcoder_entry_t *entry; const rb_transcoder *tr; - RB_VM_LOCKING() { - entry = get_transcoder_entry(sname, dname); - tr = load_transcoder_entry(entry); - } + entry = get_transcoder_entry(sname, dname); + tr = load_transcoder_entry(entry); if (!tr) return -1; if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && @@ -4486,6 +4500,8 @@ void Init_transcode(void) { transcoder_table = st_init_strcasetable(); + fast_transcoder_table = rb_managed_id_table_new(8); // NOTE: size is arbitrarily chosen + rb_gc_register_address(&fast_transcoder_table); id_destination_encoding = rb_intern_const("destination_encoding"); id_destination_encoding_name = rb_intern_const("destination_encoding_name"); From 8de47d2fa897addf4a4aabeec0e9c3c953bbcc5f Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Wed, 6 Aug 2025 13:39:42 -0400 Subject: [PATCH 02/12] encoding.c: Make sure autoloading encoding is done without VM lock held --- encoding.c | 46 ++++++++++++++++++++++--------------- test/ruby/test_transcode.rb | 18 +++++++++++++++ 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/encoding.c b/encoding.c index 7bca8d1b2b3d38..aa211d84709ba3 100644 --- a/encoding.c +++ b/encoding.c @@ -258,6 +258,7 @@ must_encindex(int index) int rb_to_encoding_index(VALUE enc) { + ASSERT_vm_unlocking(); // can load encoding, so must not hold VM lock int idx; const char *name; @@ -668,9 +669,11 @@ rb_enc_alias(const char *alias, const char *orig) { int idx, r; + idx = rb_enc_find_index(orig); + GLOBAL_ENC_TABLE_LOCKING(enc_table) { enc_check_addable(enc_table, alias); - if ((idx = rb_enc_find_index(orig)) < 0) { + if (idx < 0) { r = -1; } else { @@ -742,6 +745,7 @@ int rb_require_internal_silent(VALUE fname); static int load_encoding(const char *name) { + ASSERT_vm_unlocking(); VALUE enclib = rb_sprintf("enc/%s.so", name); VALUE debug = ruby_debug; VALUE errinfo; @@ -757,7 +761,7 @@ load_encoding(const char *name) enclib = rb_fstring(enclib); ruby_debug = Qfalse; errinfo = rb_errinfo(); - loaded = rb_require_internal_silent(enclib); + loaded = rb_require_internal_silent(enclib); // must run without VM_LOCK ruby_debug = debug; rb_set_errinfo(errinfo); @@ -781,6 +785,7 @@ enc_autoload_body(rb_encoding *enc) { rb_encoding *base; int i = 0; + ASSERT_vm_unlocking(); GLOBAL_ENC_TABLE_LOCKING(enc_table) { base = enc_table->list[ENC_TO_ENCINDEX(enc)].base; @@ -792,30 +797,32 @@ enc_autoload_body(rb_encoding *enc) } } while (enc_table->list[i].enc != base && (++i, 1)); } + } + - if (i != -1) { - if (base) { - bool do_register = true; - if (rb_enc_autoload_p(base)) { - if (rb_enc_autoload(base) < 0) { - do_register = false; - i = -1; - } + if (i != -1) { + if (base) { + bool do_register = true; + if (rb_enc_autoload_p(base)) { + if (rb_enc_autoload(base) < 0) { + do_register = false; + i = -1; } + } - i = enc->ruby_encoding_index; - if (do_register) { + if (do_register) { + GLOBAL_ENC_TABLE_LOCKING(enc_table) { + i = enc->ruby_encoding_index; enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base); ((rb_raw_encoding *)enc)->ruby_encoding_index = i; } - - i &= ENC_INDEX_MASK; - } - else { - i = -2; } - } + i &= ENC_INDEX_MASK; + } + else { + i = -2; + } } return i; @@ -824,6 +831,7 @@ enc_autoload_body(rb_encoding *enc) int rb_enc_autoload(rb_encoding *enc) { + ASSERT_vm_unlocking(); int i = enc_autoload_body(enc); if (i == -2) { i = load_encoding(rb_enc_name(enc)); @@ -843,6 +851,7 @@ rb_enc_autoload_p(rb_encoding *enc) int rb_enc_find_index(const char *name) { + ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary int i; GLOBAL_ENC_TABLE_LOCKING(enc_table) { i = enc_registered(enc_table, name); @@ -1019,7 +1028,6 @@ rb_enc_associate_index(VALUE obj, int idx) rb_encoding *enc; int oldidx, oldtermlen, termlen; -/* enc_check_capable(obj);*/ rb_check_frozen(obj); oldidx = rb_enc_get_index(obj); if (oldidx == idx) diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 2ace6d0e63a58f..2b6f8234ced64f 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -2342,6 +2342,24 @@ def test_ractor_lazy_load_encoding end; end + def test_ractor_lazy_load_encoding_random + assert_ractor("#{<<~"begin;"}\n#{<<~'end;'}") + begin; + rs = [] + 100.times do + rs << Ractor.new do + "\u0300".encode(Encoding.list.sample) rescue Encoding::UndefinedConversionError + end + end + + while rs.any? + r, _obj = Ractor.select(*rs) + rs.delete(r) + end + assert rs.empty? + end; + end + private def assert_conversion_both_ways_utf8(utf8, raw, encoding) From 69f7770827c336d5fef6038c165a9dfaa79c53f9 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Mon, 4 Aug 2025 20:06:35 -0400 Subject: [PATCH 03/12] rb_enc_find_index shouldn't lock for main 3 encodings --- encoding.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/encoding.c b/encoding.c index aa211d84709ba3..125e5d407cd6ea 100644 --- a/encoding.c +++ b/encoding.c @@ -73,6 +73,10 @@ static struct enc_table { st_table *names; } global_enc_table; +static const char *string_UTF_8; +static const char *string_US_ASCII; +static const char *string_ASCII_8BIT; + static int enc_names_free_i(st_data_t name, st_data_t idx, st_data_t args) { @@ -710,7 +714,8 @@ rb_enc_init(struct enc_table *enc_table) enc_table->names = st_init_strcasetable_with_size(ENCODING_LIST_CAPA); } #define OnigEncodingASCII_8BIT OnigEncodingASCII -#define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) +#define ENC_REGISTER(enc) string_##enc = rb_enc_name(&OnigEncoding##enc); \ + enc_register_at(enc_table, ENCINDEX_##enc, string_##enc, &OnigEncoding##enc) ENC_REGISTER(ASCII_8BIT); ENC_REGISTER(UTF_8); ENC_REGISTER(US_ASCII); @@ -851,6 +856,23 @@ rb_enc_autoload_p(rb_encoding *enc) int rb_enc_find_index(const char *name) { + size_t input_len = strlen(name); + switch(input_len) { + case 5: + if (strncmp(name, string_UTF_8, 5) == 0) { + return ENCINDEX_UTF_8; + } + case 8: + if (strncmp(name, string_US_ASCII, 8) == 0) { + return ENCINDEX_US_ASCII; + } + case 10: + if (strncmp(name, string_ASCII_8BIT, 10) == 0) { + return ENCINDEX_ASCII_8BIT; + } + default: + break; + } ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary int i; GLOBAL_ENC_TABLE_LOCKING(enc_table) { From e95b85ef1cc8af368c6530a903094ff204fddee4 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Mon, 4 Aug 2025 20:14:28 -0400 Subject: [PATCH 04/12] For setting default external/internal encoding, loading should be done first Loading needs to be outside of VM lock. --- encoding.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/encoding.c b/encoding.c index 125e5d407cd6ea..ab763c87fb43ce 100644 --- a/encoding.c +++ b/encoding.c @@ -1614,6 +1614,10 @@ enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const cha /* Already set */ overridden = TRUE; + if (!NIL_P(encoding)) { + enc_check_encoding(encoding); // loads it if necessary. Needs to be done outside of VM lock. + } + GLOBAL_ENC_TABLE_LOCKING(enc_table) { if (NIL_P(encoding)) { def->index = -1; From fdfc51e83f7d4a53f53ed14ed493e1850cb3f253 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Tue, 5 Aug 2025 11:55:05 -0400 Subject: [PATCH 05/12] make methods on ENV safe if they load an encoding Most of the time this should work fine, but if a native extension calls `setlocale(SOME_LOCALE)`, and that encoding is not loaded, it will load next time `rb_locale_encoding()` is called. This is called for methods on ENV like ENV#[]. Since these ENV methods acquire the VM lock, we need to make sure the encoding is loaded prior to locking it. Calling `setlocale()` like this is supported. The documentation for `rb_locale_encoding()` states: ``` * This is dynamic. If you change the process' locale by e.g. calling * `setlocale(3)`, that should also change the return value of this function. * * There is no official way for Ruby scripts to manipulate locales, though. rb_encoding *rb_locale_encoding(void); ``` --- encoding.c | 5 ++++- hash.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/encoding.c b/encoding.c index ab763c87fb43ce..fc1bf3bffb2cc3 100644 --- a/encoding.c +++ b/encoding.c @@ -856,6 +856,7 @@ rb_enc_autoload_p(rb_encoding *enc) int rb_enc_find_index(const char *name) { + ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary size_t input_len = strlen(name); switch(input_len) { case 5: @@ -873,7 +874,6 @@ rb_enc_find_index(const char *name) default: break; } - ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary int i; GLOBAL_ENC_TABLE_LOCKING(enc_table) { i = enc_registered(enc_table, name); @@ -1556,6 +1556,9 @@ int rb_locale_charmap_index(void); int rb_locale_encindex(void) { + // `rb_locale_charmap_index` can call `enc_find_index`, which can + // load an encoding. This needs to be done without VM lock held. + ASSERT_vm_unlocking(); int idx = rb_locale_charmap_index(); if (idx < 0) idx = ENCINDEX_UTF_8; diff --git a/hash.c b/hash.c index 7ce1b768e0e831..de9bc97ea69cdf 100644 --- a/hash.c +++ b/hash.c @@ -5192,25 +5192,26 @@ env_enc_str_new(const char *ptr, long len, rb_encoding *enc) } static VALUE -env_str_new(const char *ptr, long len) +env_str_new(const char *ptr, long len, rb_encoding *enc) { - return env_enc_str_new(ptr, len, env_encoding()); + return env_enc_str_new(ptr, len, enc); } static VALUE -env_str_new2(const char *ptr) +env_str_new2(const char *ptr, rb_encoding *enc) { if (!ptr) return Qnil; - return env_str_new(ptr, strlen(ptr)); + return env_str_new(ptr, strlen(ptr), enc); } static VALUE getenv_with_lock(const char *name) { VALUE ret; + rb_encoding *enc = env_encoding(); ENV_LOCKING() { const char *val = getenv(name); - ret = env_str_new2(val); + ret = env_str_new2(val, enc); } return ret; } @@ -5773,13 +5774,14 @@ env_values(void) { VALUE ary = rb_ary_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_ary_push(ary, env_str_new2(s+1)); + rb_ary_push(ary, env_str_new2(s+1, enc)); } env++; } @@ -5865,14 +5867,15 @@ env_each_pair(VALUE ehash) VALUE ary = rb_ary_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_ary_push(ary, env_str_new(*env, s-*env)); - rb_ary_push(ary, env_str_new2(s+1)); + rb_ary_push(ary, env_str_new(*env, s-*env, enc)); + rb_ary_push(ary, env_str_new2(s+1, enc)); } env++; } @@ -6255,13 +6258,14 @@ env_to_a(VALUE _) { VALUE ary = rb_ary_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env), - env_str_new2(s+1))); + rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env, enc), + env_str_new2(s+1, enc))); } env++; } @@ -6509,6 +6513,7 @@ env_key(VALUE dmy, VALUE value) StringValue(value); VALUE str = Qnil; + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { @@ -6516,7 +6521,7 @@ env_key(VALUE dmy, VALUE value) if (s++) { long len = strlen(s); if (RSTRING_LEN(value) == len && strncmp(s, RSTRING_PTR(value), len) == 0) { - str = env_str_new(*env, s-*env-1); + str = env_str_new(*env, s-*env-1, enc); break; } } @@ -6533,13 +6538,14 @@ env_to_hash(void) { VALUE hash = rb_hash_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_hash_aset(hash, env_str_new(*env, s-*env), - env_str_new2(s+1)); + rb_hash_aset(hash, env_str_new(*env, s-*env, enc), + env_str_new2(s+1, enc)); } env++; } @@ -6684,14 +6690,15 @@ env_shift(VALUE _) VALUE result = Qnil; VALUE key = Qnil; + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); if (*env) { const char *p = *env; char *s = strchr(p, '='); if (s) { - key = env_str_new(p, s-p); - VALUE val = env_str_new2(getenv(RSTRING_PTR(key))); + key = env_str_new(p, s-p, enc); + VALUE val = env_str_new2(getenv(RSTRING_PTR(key)), enc); result = rb_assoc_new(key, val); } } From 4e4f4e12bc187c9d373394b266629b3797a7d1cf Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Tue, 5 Aug 2025 15:34:36 -0400 Subject: [PATCH 06/12] Don't attempt to load encoding if given encoding object during transcode This avoids taking the VM lock when doing "str".encode(Encoding::UTF_32), for example. --- transcode.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 7 deletions(-) diff --git a/transcode.c b/transcode.c index 688f78c3d8fc48..a8f4bb8d653f3e 100644 --- a/transcode.c +++ b/transcode.c @@ -1026,14 +1026,14 @@ trans_open_i(const char *sname, const char *dname, int depth, void *arg) } static rb_econv_t * -rb_econv_open0(const char *sname, const char *dname, int ecflags) +rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const char *dname, int ecflags) { transcoder_entry_t **entries = NULL; int num_trans; rb_econv_t *ec; - if (*sname) rb_enc_find_index(sname); // loads encoding if not already loaded - if (*dname) rb_enc_find_index(dname); // loads encoding if not already loaded + if (*sname && (!senc || !senc->max_enc_len)) rb_enc_find_index(sname); // loads encoding if not already loaded + if (*dname && (!denc || !denc->max_enc_len)) rb_enc_find_index(dname); // loads encoding if not already loaded if (*sname == '\0' && *dname == '\0') { num_trans = 0; @@ -1127,6 +1127,35 @@ decorator_names(int ecflags, const char **decorators_ret) return num_decorators; } +static rb_econv_t * +rb_econv_open_enc(rb_encoding *senc, const char *sname, rb_encoding *denc, const char *dname, int ecflags) +{ + rb_econv_t *ec; + int num_decorators; + const char *decorators[MAX_ECFLAGS_DECORATORS]; + int i; + + num_decorators = decorator_names(ecflags, decorators); + if (num_decorators == -1) + return NULL; + + ec = rb_econv_open0(senc, sname, denc, dname, ecflags & ECONV_ERROR_HANDLER_MASK); + if (ec) { + for (i = 0; i < num_decorators; i++) { + if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { + rb_econv_close(ec); + ec = NULL; + break; + } + } + } + + if (ec) { + ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; + } + return ec; // can be NULL +} + rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags) { @@ -1139,7 +1168,7 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) if (num_decorators == -1) return NULL; - ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); + ec = rb_econv_open0(NULL, sname, NULL, dname, ecflags & ECONV_ERROR_HANDLER_MASK); if (ec) { for (i = 0; i < num_decorators; i++) { if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { @@ -2360,11 +2389,16 @@ aref_fallback(VALUE fallback, VALUE c) return rb_funcallv_public(fallback, idAREF, 1, &c); } +static rb_econv_t * +rb_econv_open_opts_enc(rb_encoding *senc, const char *source_encoding, rb_encoding *denc, const char *destination_encoding, int ecflags, VALUE opthash); + static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), + rb_encoding *senc, + rb_encoding *denc, const char *src_encoding, const char *dst_encoding, int ecflags, @@ -2379,7 +2413,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, VALUE fallback = Qnil; VALUE (*fallback_func)(VALUE, VALUE) = 0; - ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); + ec = rb_econv_open_opts_enc(senc, src_encoding, denc, dst_encoding, ecflags, ecopts); if (!ec) rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); @@ -2684,6 +2718,40 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) return rb_econv_prepare_options(opthash, opts, 0); } +static rb_econv_t * +rb_econv_open_opts_enc(rb_encoding *senc, const char *source_encoding, rb_encoding *denc, const char *destination_encoding, int ecflags, VALUE opthash) +{ + rb_econv_t *ec; + VALUE replacement; + + if (NIL_P(opthash)) { + replacement = Qnil; + } + else { + if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash)) + rb_bug("rb_econv_open_opts called with invalid opthash"); + replacement = rb_hash_aref(opthash, sym_replace); + } + + ec = rb_econv_open_enc(senc, source_encoding, denc, destination_encoding, ecflags); + if (ec) { + if (!NIL_P(replacement)) { + int ret; + rb_encoding *enc = rb_enc_get(replacement); + + ret = rb_econv_set_replacement(ec, + (const unsigned char *)RSTRING_PTR(replacement), + RSTRING_LEN(replacement), + rb_enc_name(enc)); + if (ret == -1) { + rb_econv_close(ec); + ec = NULL; + } + } + } + return ec; // can be NULL +} + rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) { @@ -2778,7 +2846,7 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) long blen, slen; unsigned char *buf, *bp, *sp; const unsigned char *fromp; - rb_encoding *senc, *denc; + rb_encoding *senc = NULL, *denc = NULL; const char *sname, *dname; int dencidx; int explicitly_invalid_replace = TRUE; @@ -2847,7 +2915,7 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) dest = rb_str_tmp_new(blen); bp = (unsigned char *)RSTRING_PTR(dest); - transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts); + transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, senc, denc, sname, dname, ecflags, ecopts); if (fromp != sp+slen) { rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); } From 87b41498ffe592dcf7db26ebf347e3daccb92d0b Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Tue, 5 Aug 2025 17:34:59 -0400 Subject: [PATCH 07/12] Add fast_transcoder_entry_table, rename other transcoder managed tbl We want to avoid taking the VM lock at all on transcoding operations like `String#encode`, so now we have another managed table so we can avoid going through a two-level table that requires a VM lock. I think this is justified because looking up transcoder entries is done fairly often. Ractors are now quite fast when using `String#encode`. --- transcode.c | 128 ++++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 73 deletions(-) diff --git a/transcode.c b/transcode.c index a8f4bb8d653f3e..d7d809d6854df5 100644 --- a/transcode.c +++ b/transcode.c @@ -28,6 +28,7 @@ #include "id.h" #define ENABLE_ECONV_NEWLINE_OPTION 1 +#define SRC_ENC_TO_DST_ENC_KEY_SIZE 128 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ static VALUE rb_eUndefinedConversionError; @@ -64,7 +65,8 @@ static VALUE sym_finished; static VALUE sym_after_output; static VALUE sym_incomplete_input; -static VALUE fast_transcoder_table; +static VALUE fast_transcoder_path_table; +static VALUE fast_transcoder_entry_table; static unsigned char * allocate_converted_string(const char *sname, const char *dname, @@ -208,11 +210,26 @@ rb_free_transcoder_table(void) st_free_table(transcoder_table); } +static void +gen_src_to_dst_encodings_key(const char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE], const char *sname, const char *dname) +{ + char *p = (char*)key_buf; + size_t slen = strlen(sname); + memcpy(p, sname, slen); + p += slen; + memcpy(p, ":", 1); + p += 1; + size_t dlen = strlen(dname); + RUBY_ASSERT(slen + dlen < SRC_ENC_TO_DST_ENC_KEY_SIZE); + memcpy(p, dname, dlen); +} + static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname) { st_data_t val; st_table *table2; + transcoder_entry_t *entry = NULL; RB_VM_LOCKING() { if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { @@ -221,16 +238,24 @@ make_transcoder_entry(const char *sname, const char *dname) } table2 = (st_table *)val; if (!st_lookup(table2, (st_data_t)dname, &val)) { - transcoder_entry_t *entry = ALLOC(transcoder_entry_t); + entry = ALLOC(transcoder_entry_t); entry->sname = sname; entry->dname = dname; entry->lib = NULL; entry->transcoder = NULL; val = (st_data_t)entry; st_add_direct(table2, (st_data_t)dname, val); + } else { + entry = (transcoder_entry_t*)val; } } - return (transcoder_entry_t *)val; + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + VALUE tbl = fast_transcoder_entry_table; + VALUE new_tbl = rb_managed_id_table_dup(tbl); + rb_managed_id_table_insert(new_tbl, rb_intern(key_buf), (VALUE)entry); + RUBY_ATOMIC_VALUE_SET(fast_transcoder_entry_table, new_tbl); // TODO: use CAS + return entry; } static transcoder_entry_t * @@ -238,6 +263,14 @@ get_transcoder_entry(const char *sname, const char *dname) { st_data_t val = 0; st_table *table2; + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + VALUE entry_val; + // TODO: once we CAS in `make_transcoder_entry`, we no longer need to check regular transcoder_table after + if (rb_managed_id_table_lookup(fast_transcoder_entry_table, rb_intern(key_buf), &entry_val)) { + return (transcoder_entry_t*)entry_val; + } + RB_VM_LOCKING() { if (st_lookup(transcoder_table, (st_data_t)sname, &val)) { table2 = (st_table *)val; @@ -1032,8 +1065,12 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch int num_trans; rb_econv_t *ec; - if (*sname && (!senc || !senc->max_enc_len)) rb_enc_find_index(sname); // loads encoding if not already loaded - if (*dname && (!denc || !denc->max_enc_len)) rb_enc_find_index(dname); // loads encoding if not already loaded + if (*sname && (!senc || !senc->max_enc_len)) { + rb_enc_find_index(sname); // loads encoding + } + if (*dname && (!denc || !denc->max_enc_len)) { + rb_enc_find_index(dname); // loads encoding + } if (*sname == '\0' && *dname == '\0') { num_trans = 0; @@ -1044,17 +1081,11 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch struct trans_open_t toarg; toarg.entries = NULL; toarg.num_additional = 0; - char buf[128] = { 0 }; // encoding namelen max is currently 63 bytes, so this is enough - char *p = buf; - size_t slen = strlen(sname); - memcpy(p, sname, slen); - p += slen; - memcpy(p, ":", 1); - p += 1; - memcpy(p, dname, strlen(dname)); - ID src_to_dest_id = rb_intern(buf); + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + ID src_to_dest_id = rb_intern(key_buf); VALUE managed_val; - VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_table); + VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_path_table); if (rb_managed_id_table_lookup(tbl, src_to_dest_id, &managed_val)) { entries = (transcoder_entry_t **)managed_val; } else { @@ -1067,9 +1098,9 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch // No need for CAS loop if it's not most recent `fast_transcoder_table`, some values // can be lost. It will just go through the slow path next time for the lost src/dst encoding // pairs - VALUE new_transcoder_tbl = rb_managed_id_table_dup(tbl); - rb_managed_id_table_insert(new_transcoder_tbl, src_to_dest_id, (VALUE)entries); - RUBY_ATOMIC_VALUE_SET(fast_transcoder_table, new_transcoder_tbl); + VALUE new_tbl = rb_managed_id_table_dup(tbl); + rb_managed_id_table_insert(new_tbl, src_to_dest_id, (VALUE)entries); + RUBY_ATOMIC_VALUE_SET(fast_transcoder_path_table, new_tbl); } } @@ -1159,30 +1190,7 @@ rb_econv_open_enc(rb_encoding *senc, const char *sname, rb_encoding *denc, const rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags) { - rb_econv_t *ec; - int num_decorators; - const char *decorators[MAX_ECFLAGS_DECORATORS]; - int i; - - num_decorators = decorator_names(ecflags, decorators); - if (num_decorators == -1) - return NULL; - - ec = rb_econv_open0(NULL, sname, NULL, dname, ecflags & ECONV_ERROR_HANDLER_MASK); - if (ec) { - for (i = 0; i < num_decorators; i++) { - if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { - rb_econv_close(ec); - ec = NULL; - break; - } - } - } - - if (ec) { - ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; - } - return ec; // can be NULL + return rb_econv_open_enc(NULL, sname, NULL, dname, ecflags); } static int @@ -2755,35 +2763,7 @@ rb_econv_open_opts_enc(rb_encoding *senc, const char *source_encoding, rb_encodi rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) { - rb_econv_t *ec; - VALUE replacement; - - if (NIL_P(opthash)) { - replacement = Qnil; - } - else { - if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash)) - rb_bug("rb_econv_open_opts called with invalid opthash"); - replacement = rb_hash_aref(opthash, sym_replace); - } - - ec = rb_econv_open(source_encoding, destination_encoding, ecflags); - if (ec) { - if (!NIL_P(replacement)) { - int ret; - rb_encoding *enc = rb_enc_get(replacement); - - ret = rb_econv_set_replacement(ec, - (const unsigned char *)RSTRING_PTR(replacement), - RSTRING_LEN(replacement), - rb_enc_name(enc)); - if (ret == -1) { - rb_econv_close(ec); - ec = NULL; - } - } - } - return ec; // can be NULL + return rb_econv_open_opts_enc(NULL, source_encoding, NULL, destination_encoding, ecflags, opthash); } static int @@ -4568,8 +4548,10 @@ void Init_transcode(void) { transcoder_table = st_init_strcasetable(); - fast_transcoder_table = rb_managed_id_table_new(8); // NOTE: size is arbitrarily chosen - rb_gc_register_address(&fast_transcoder_table); + fast_transcoder_path_table = rb_managed_id_table_new(8); // NOTE: size is arbitrarily chosen + rb_gc_register_address(&fast_transcoder_path_table); + fast_transcoder_entry_table = rb_managed_id_table_new(8); + rb_gc_register_address(&fast_transcoder_entry_table); id_destination_encoding = rb_intern_const("destination_encoding"); id_destination_encoding_name = rb_intern_const("destination_encoding_name"); From 2424348bae09571b8773f939505cafa07efa2c8d Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Tue, 5 Aug 2025 18:19:16 -0400 Subject: [PATCH 08/12] use CAS for fast_transcoder_entry_table --- include/ruby/internal/encoding/encoding.h | 1 + transcode.c | 29 ++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/ruby/internal/encoding/encoding.h b/include/ruby/internal/encoding/encoding.h index a58f9f2b1524b5..53412bd379bbda 100644 --- a/include/ruby/internal/encoding/encoding.h +++ b/include/ruby/internal/encoding/encoding.h @@ -67,6 +67,7 @@ enum ruby_encoding_consts { #define ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX /**< @old{RUBY_ENCODING_INLINE_MAX} */ #define ENCODING_SHIFT RUBY_ENCODING_SHIFT /**< @old{RUBY_ENCODING_SHIFT} */ #define ENCODING_MASK RUBY_ENCODING_MASK /**< @old{RUBY_ENCODING_MASK} */ +#define ENCODING_NAMELEN_MAX 63 /** * Destructively assigns the passed encoding to the passed object. The object diff --git a/transcode.c b/transcode.c index d7d809d6854df5..06aa8eec729cea 100644 --- a/transcode.c +++ b/transcode.c @@ -29,6 +29,7 @@ #define ENABLE_ECONV_NEWLINE_OPTION 1 #define SRC_ENC_TO_DST_ENC_KEY_SIZE 128 +STATIC_ASSERT(encoding_namelen, SRC_ENC_TO_DST_ENC_KEY_SIZE > (ENCODING_NAMELEN_MAX * 2 + 1)); /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ static VALUE rb_eUndefinedConversionError; @@ -211,7 +212,7 @@ rb_free_transcoder_table(void) } static void -gen_src_to_dst_encodings_key(const char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE], const char *sname, const char *dname) +gen_src_to_dst_encodings_key(const char *key_buf, const char *sname, const char *dname) { char *p = (char*)key_buf; size_t slen = strlen(sname); @@ -231,6 +232,7 @@ make_transcoder_entry(const char *sname, const char *dname) st_table *table2; transcoder_entry_t *entry = NULL; + // TODO: we should be able to remove this table soon RB_VM_LOCKING() { if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { val = (st_data_t)st_init_strcasetable(); @@ -251,10 +253,20 @@ make_transcoder_entry(const char *sname, const char *dname) } char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; gen_src_to_dst_encodings_key(key_buf, sname, dname); - VALUE tbl = fast_transcoder_entry_table; - VALUE new_tbl = rb_managed_id_table_dup(tbl); - rb_managed_id_table_insert(new_tbl, rb_intern(key_buf), (VALUE)entry); - RUBY_ATOMIC_VALUE_SET(fast_transcoder_entry_table, new_tbl); // TODO: use CAS + ID key = rb_intern(key_buf); + while (1) { + VALUE tbl = fast_transcoder_entry_table; + VALUE entry_got; + if (rb_managed_id_table_lookup(tbl, key, &entry_got)) { + break; + } else { + VALUE new_tbl = rb_managed_id_table_dup(tbl); + rb_managed_id_table_insert(new_tbl, key, (VALUE)entry); + if (RUBY_ATOMIC_VALUE_CAS(fast_transcoder_entry_table, tbl, new_tbl) == tbl) { + break; + } + } + } return entry; } @@ -266,11 +278,11 @@ get_transcoder_entry(const char *sname, const char *dname) char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; gen_src_to_dst_encodings_key(key_buf, sname, dname); VALUE entry_val; - // TODO: once we CAS in `make_transcoder_entry`, we no longer need to check regular transcoder_table after if (rb_managed_id_table_lookup(fast_transcoder_entry_table, rb_intern(key_buf), &entry_val)) { return (transcoder_entry_t*)entry_val; } + // TODO: we should be able to remove this table soon RB_VM_LOCKING() { if (st_lookup(transcoder_table, (st_data_t)sname, &val)) { table2 = (st_table *)val; @@ -1065,11 +1077,12 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch int num_trans; rb_econv_t *ec; + // load encodings, if necessary if (*sname && (!senc || !senc->max_enc_len)) { - rb_enc_find_index(sname); // loads encoding + rb_enc_find_index(sname); } if (*dname && (!denc || !denc->max_enc_len)) { - rb_enc_find_index(dname); // loads encoding + rb_enc_find_index(dname); } if (*sname == '\0' && *dname == '\0') { From 34f0766d92e0d1c8844f2b8dec46155637638274 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Wed, 6 Aug 2025 11:07:07 -0400 Subject: [PATCH 09/12] Use STRCASECMP in rb_enc_find_index --- encoding.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/encoding.c b/encoding.c index fc1bf3bffb2cc3..a403b70038bc53 100644 --- a/encoding.c +++ b/encoding.c @@ -860,15 +860,15 @@ rb_enc_find_index(const char *name) size_t input_len = strlen(name); switch(input_len) { case 5: - if (strncmp(name, string_UTF_8, 5) == 0) { + if (STRCASECMP(name, string_UTF_8) == 0) { return ENCINDEX_UTF_8; } case 8: - if (strncmp(name, string_US_ASCII, 8) == 0) { + if (STRCASECMP(name, string_US_ASCII) == 0) { return ENCINDEX_US_ASCII; } case 10: - if (strncmp(name, string_ASCII_8BIT, 10) == 0) { + if (STRCASECMP(name, string_ASCII_8BIT) == 0) { return ENCINDEX_ASCII_8BIT; } default: From 48ddbab12816adefe144da8a05f93e0e89bd735b Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Wed, 6 Aug 2025 12:34:30 -0400 Subject: [PATCH 10/12] Use st_table for managed table instead of id_table Add managed_st_table functions and use them in transcode.c. --- include/ruby/st.h | 8 ++++ st.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++ transcode.c | 77 ++++++++++++++++------------------ 3 files changed, 145 insertions(+), 42 deletions(-) diff --git a/include/ruby/st.h b/include/ruby/st.h index f35ab436037237..08ee3779c3cc75 100644 --- a/include/ruby/st.h +++ b/include/ruby/st.h @@ -187,6 +187,14 @@ CONSTFUNC(st_index_t rb_st_hash_start(st_index_t h)); void rb_hash_bulk_insert_into_st_table(long, const VALUE *, VALUE); +VALUE rb_managed_st_table_create_numtable(size_t capa); +VALUE rb_managed_st_table_create_strtable(size_t capa); +VALUE rb_managed_st_table_create_strcasetable(size_t capa); +int rb_managed_st_table_lookup(VALUE tbl, st_data_t key, st_data_t *value); +int rb_managed_st_table_insert(VALUE tbl, st_data_t key, st_data_t value); +void rb_managed_st_table_add_direct(VALUE tbl, st_data_t key, st_data_t value); +VALUE rb_managed_st_table_dup(VALUE old_table); + RUBY_SYMBOL_EXPORT_END #if defined(__cplusplus) diff --git a/st.c b/st.c index 195a16b8ad73e7..a7853df2d1760e 100644 --- a/st.c +++ b/st.c @@ -3206,4 +3206,106 @@ set_compact_table(set_table *tab) } } +static void +managed_st_table_free(void *data) +{ + st_table *tbl = (st_table *)data; + free(tbl->bins); + free(tbl->entries); +} + +static size_t +managed_st_table_memsize(const void *data) +{ + st_table *tbl = (st_table*)data; + return st_memsize(tbl) - sizeof(st_table); +} + +const rb_data_type_t rb_managed_st_table_type = { + .wrap_struct_name = "VM/managed_st_table", + .function = { + .dmark = NULL, // Nothing to mark + .dfree = (RUBY_DATA_FUNC)managed_st_table_free, + .dsize = managed_st_table_memsize, + }, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, +}; + +static inline st_table * +managed_st_table_ptr(VALUE obj) +{ + RUBY_ASSERT(RB_TYPE_P(obj, T_DATA)); + RUBY_ASSERT(rb_typeddata_inherited_p(RTYPEDDATA_TYPE(obj), &rb_managed_st_table_type)); + + return RTYPEDDATA_GET_DATA(obj); +} + +static VALUE +rb_managed_st_table_create_type(const rb_data_type_t *type, const struct st_hash_type *table_type, size_t capa) +{ + struct st_table *tbl; + VALUE obj = TypedData_Make_Struct(0, struct st_table, type, tbl); + st_init_existing_table_with_size(tbl, table_type, capa); + return obj; +} + +VALUE +rb_managed_st_table_create_numtable(size_t capa) +{ + return rb_managed_st_table_create_type(&rb_managed_st_table_type, &type_numhash, capa); +} + +VALUE +rb_managed_st_table_create_strtable(size_t capa) +{ + return rb_managed_st_table_create_type(&rb_managed_st_table_type, &type_strhash, capa); +} + +VALUE +rb_managed_st_table_create_strcasetable(size_t capa) +{ + return rb_managed_st_table_create_type(&rb_managed_st_table_type, &type_strcasehash, capa); +} + +int +rb_managed_st_table_lookup(VALUE tbl, st_data_t key, VALUE *value) +{ + st_table *st = managed_st_table_ptr(tbl); + st_data_t *val = (st_data_t*)value; + return st_lookup(st, key, val); +} + +int +rb_managed_st_table_insert(VALUE tbl, st_data_t key, VALUE value) +{ + st_table *st = managed_st_table_ptr(tbl); + return st_insert(st, key, (st_data_t)value); +} + +void +rb_managed_st_table_add_direct(VALUE tbl, st_data_t key, st_data_t value) +{ + st_table *st = managed_st_table_ptr(tbl); + return st_add_direct(st, key, value); +} + +static int +managed_st_table_dup_i(st_data_t key, st_data_t val, st_data_t data) { + st_table *tbl = (st_table *)data; + st_insert(tbl, key, val); + return ST_CONTINUE; +} + +VALUE +rb_managed_st_table_dup(VALUE old_table) +{ + struct st_table *new_tbl; + VALUE obj = TypedData_Make_Struct(0, struct st_table, RTYPEDDATA_TYPE(old_table), new_tbl); + struct st_table *old_tbl = managed_st_table_ptr(old_table); + st_init_existing_table_with_size(new_tbl, old_tbl->type, old_tbl->num_entries+1); + st_foreach(old_tbl, managed_st_table_dup_i, (st_data_t)new_tbl); + + return obj; +} + #endif diff --git a/transcode.c b/transcode.c index 06aa8eec729cea..79961a5aa7b75b 100644 --- a/transcode.c +++ b/transcode.c @@ -232,6 +232,32 @@ make_transcoder_entry(const char *sname, const char *dname) st_table *table2; transcoder_entry_t *entry = NULL; + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + char *key = NULL; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + while (1) { + VALUE tbl = fast_transcoder_entry_table; + VALUE entry_got; + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &entry_got)) { + entry = (transcoder_entry_t*)entry_got; + break; + } else { + if (!entry) { + entry = ALLOC(transcoder_entry_t); + entry->sname = sname; + entry->dname = dname; + entry->lib = NULL; + entry->transcoder = NULL; + } + VALUE new_tbl = rb_managed_st_table_dup(tbl); + if (!key) key = strdup(key_buf); + rb_managed_st_table_insert(new_tbl, (st_data_t)key, (VALUE)entry); + if (RUBY_ATOMIC_VALUE_CAS(fast_transcoder_entry_table, tbl, new_tbl) == tbl) { + break; + } + } + } + // TODO: we should be able to remove this table soon RB_VM_LOCKING() { if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { @@ -240,58 +266,25 @@ make_transcoder_entry(const char *sname, const char *dname) } table2 = (st_table *)val; if (!st_lookup(table2, (st_data_t)dname, &val)) { - entry = ALLOC(transcoder_entry_t); - entry->sname = sname; - entry->dname = dname; - entry->lib = NULL; - entry->transcoder = NULL; val = (st_data_t)entry; st_add_direct(table2, (st_data_t)dname, val); } else { entry = (transcoder_entry_t*)val; } } - char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; - gen_src_to_dst_encodings_key(key_buf, sname, dname); - ID key = rb_intern(key_buf); - while (1) { - VALUE tbl = fast_transcoder_entry_table; - VALUE entry_got; - if (rb_managed_id_table_lookup(tbl, key, &entry_got)) { - break; - } else { - VALUE new_tbl = rb_managed_id_table_dup(tbl); - rb_managed_id_table_insert(new_tbl, key, (VALUE)entry); - if (RUBY_ATOMIC_VALUE_CAS(fast_transcoder_entry_table, tbl, new_tbl) == tbl) { - break; - } - } - } return entry; } static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname) { - st_data_t val = 0; - st_table *table2; char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; gen_src_to_dst_encodings_key(key_buf, sname, dname); VALUE entry_val; - if (rb_managed_id_table_lookup(fast_transcoder_entry_table, rb_intern(key_buf), &entry_val)) { + if (rb_managed_st_table_lookup(fast_transcoder_entry_table, (st_data_t)key_buf, &entry_val)) { return (transcoder_entry_t*)entry_val; } - - // TODO: we should be able to remove this table soon - RB_VM_LOCKING() { - if (st_lookup(transcoder_table, (st_data_t)sname, &val)) { - table2 = (st_table *)val; - if (!st_lookup(table2, (st_data_t)dname, &val)) { - val = 0; - } - } - } - return (transcoder_entry_t *)val; + return NULL; } void @@ -1095,11 +1088,11 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch toarg.entries = NULL; toarg.num_additional = 0; char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + char *key; gen_src_to_dst_encodings_key(key_buf, sname, dname); - ID src_to_dest_id = rb_intern(key_buf); VALUE managed_val; VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_path_table); - if (rb_managed_id_table_lookup(tbl, src_to_dest_id, &managed_val)) { + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &managed_val)) { entries = (transcoder_entry_t **)managed_val; } else { num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); @@ -1111,8 +1104,9 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch // No need for CAS loop if it's not most recent `fast_transcoder_table`, some values // can be lost. It will just go through the slow path next time for the lost src/dst encoding // pairs - VALUE new_tbl = rb_managed_id_table_dup(tbl); - rb_managed_id_table_insert(new_tbl, src_to_dest_id, (VALUE)entries); + VALUE new_tbl = rb_managed_st_table_dup(tbl); + key = strdup(key_buf); + rb_managed_st_table_insert(new_tbl, (st_data_t)key, (VALUE)entries); RUBY_ATOMIC_VALUE_SET(fast_transcoder_path_table, new_tbl); } } @@ -1919,7 +1913,6 @@ rb_econv_asciicompat_encoding(const char *ascii_incompat_name) } } - } return data.ascii_compat_name; // can be NULL @@ -4561,9 +4554,9 @@ void Init_transcode(void) { transcoder_table = st_init_strcasetable(); - fast_transcoder_path_table = rb_managed_id_table_new(8); // NOTE: size is arbitrarily chosen + fast_transcoder_path_table = rb_managed_st_table_create_strcasetable(8); // NOTE: size is arbitrarily chosen rb_gc_register_address(&fast_transcoder_path_table); - fast_transcoder_entry_table = rb_managed_id_table_new(8); + fast_transcoder_entry_table = rb_managed_st_table_create_strcasetable(8); rb_gc_register_address(&fast_transcoder_entry_table); id_destination_encoding = rb_intern_const("destination_encoding"); From 8c3d8d5666e1dc90c7d9544c5eeeb12e1c44d250 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Wed, 6 Aug 2025 12:50:38 -0400 Subject: [PATCH 11/12] Use CAS in rb_econv_open0 --- transcode.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/transcode.c b/transcode.c index 79961a5aa7b75b..bdb90e4bef6610 100644 --- a/transcode.c +++ b/transcode.c @@ -28,8 +28,7 @@ #include "id.h" #define ENABLE_ECONV_NEWLINE_OPTION 1 -#define SRC_ENC_TO_DST_ENC_KEY_SIZE 128 -STATIC_ASSERT(encoding_namelen, SRC_ENC_TO_DST_ENC_KEY_SIZE > (ENCODING_NAMELEN_MAX * 2 + 1)); +#define SRC_ENC_TO_DST_ENC_KEY_SIZE (ENCODING_NAMELEN_MAX * 2 + 2) /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ static VALUE rb_eUndefinedConversionError; @@ -1088,26 +1087,30 @@ rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const ch toarg.entries = NULL; toarg.num_additional = 0; char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; - char *key; + char *key = NULL; gen_src_to_dst_encodings_key(key_buf, sname, dname); VALUE managed_val; - VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_path_table); - if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &managed_val)) { - entries = (transcoder_entry_t **)managed_val; - } else { - num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); - entries = toarg.entries; - if (num_trans < 0) { - xfree(entries); - return NULL; + while (1) { + VALUE tbl = fast_transcoder_path_table; + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &managed_val)) { + entries = (transcoder_entry_t **)managed_val; + break; + } else { + if (!entries) { + num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); + entries = toarg.entries; + if (num_trans < 0) { + xfree(entries); + return NULL; + } + } + VALUE new_tbl = rb_managed_st_table_dup(tbl); + if (!key) key = strdup(key_buf); + rb_managed_st_table_insert(new_tbl, (st_data_t)key, (VALUE)entries); + if (RUBY_ATOMIC_VALUE_CAS(fast_transcoder_path_table, tbl, new_tbl) == tbl) { + break; + } } - // No need for CAS loop if it's not most recent `fast_transcoder_table`, some values - // can be lost. It will just go through the slow path next time for the lost src/dst encoding - // pairs - VALUE new_tbl = rb_managed_st_table_dup(tbl); - key = strdup(key_buf); - rb_managed_st_table_insert(new_tbl, (st_data_t)key, (VALUE)entries); - RUBY_ATOMIC_VALUE_SET(fast_transcoder_path_table, new_tbl); } } From ab982cebb2f26f70b36a80613880dcfcd2c126f6 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Wed, 6 Aug 2025 13:35:17 -0400 Subject: [PATCH 12/12] small fixups --- transcode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/transcode.c b/transcode.c index bdb90e4bef6610..93b449ec703c6e 100644 --- a/transcode.c +++ b/transcode.c @@ -220,7 +220,7 @@ gen_src_to_dst_encodings_key(const char *key_buf, const char *sname, const char memcpy(p, ":", 1); p += 1; size_t dlen = strlen(dname); - RUBY_ASSERT(slen + dlen < SRC_ENC_TO_DST_ENC_KEY_SIZE); + RUBY_ASSERT(slen + dlen + 1 < SRC_ENC_TO_DST_ENC_KEY_SIZE); memcpy(p, dname, dlen); } @@ -280,7 +280,8 @@ get_transcoder_entry(const char *sname, const char *dname) char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; gen_src_to_dst_encodings_key(key_buf, sname, dname); VALUE entry_val; - if (rb_managed_st_table_lookup(fast_transcoder_entry_table, (st_data_t)key_buf, &entry_val)) { + VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_entry_table); + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &entry_val)) { return (transcoder_entry_t*)entry_val; } return NULL; @@ -1057,7 +1058,7 @@ trans_open_i(const char *sname, const char *dname, int depth, void *arg) if (!toarg->entries) { size_t num = depth+1+toarg->num_additional+1; toarg->entries = ALLOC_N(transcoder_entry_t *, num); - memset(toarg->entries + num - 1, 0, sizeof(transcoder_entry_t*)); // last entry is 0 + memset(toarg->entries + num - 1, 0, sizeof(transcoder_entry_t*)); // last entry is 0 so we can loop over it } toarg->entries[depth] = get_transcoder_entry(sname, dname); }