diff --git a/depend b/depend index ea2486e9e8da1d..a08a7b178880b1 100644 --- a/depend +++ b/depend @@ -17738,6 +17738,7 @@ transcode.$(OBJEXT): $(top_srcdir)/internal/transcode.h transcode.$(OBJEXT): $(top_srcdir)/internal/variable.h transcode.$(OBJEXT): $(top_srcdir)/internal/warnings.h transcode.$(OBJEXT): {$(VPATH)}assert.h +transcode.$(OBJEXT): {$(VPATH)}atomic.h transcode.$(OBJEXT): {$(VPATH)}backward/2/assume.h transcode.$(OBJEXT): {$(VPATH)}backward/2/attributes.h transcode.$(OBJEXT): {$(VPATH)}backward/2/bool.h @@ -17909,6 +17910,7 @@ transcode.$(OBJEXT): {$(VPATH)}internal/xmalloc.h transcode.$(OBJEXT): {$(VPATH)}missing.h transcode.$(OBJEXT): {$(VPATH)}onigmo.h transcode.$(OBJEXT): {$(VPATH)}oniguruma.h +transcode.$(OBJEXT): {$(VPATH)}ruby_atomic.h transcode.$(OBJEXT): {$(VPATH)}shape.h transcode.$(OBJEXT): {$(VPATH)}st.h transcode.$(OBJEXT): {$(VPATH)}subst.h diff --git a/encoding.c b/encoding.c index 7bca8d1b2b3d38..a403b70038bc53 100644 --- a/encoding.c +++ b/encoding.c @@ -73,6 +73,10 @@ static struct enc_table { st_table *names; } global_enc_table; +static const char *string_UTF_8; +static const char *string_US_ASCII; +static const char *string_ASCII_8BIT; + static int enc_names_free_i(st_data_t name, st_data_t idx, st_data_t args) { @@ -258,6 +262,7 @@ must_encindex(int index) int rb_to_encoding_index(VALUE enc) { + ASSERT_vm_unlocking(); // can load encoding, so must not hold VM lock int idx; const char *name; @@ -668,9 +673,11 @@ rb_enc_alias(const char *alias, const char *orig) { int idx, r; + idx = rb_enc_find_index(orig); + GLOBAL_ENC_TABLE_LOCKING(enc_table) { enc_check_addable(enc_table, alias); - if ((idx = rb_enc_find_index(orig)) < 0) { + if (idx < 0) { r = -1; } else { @@ -707,7 +714,8 @@ rb_enc_init(struct enc_table *enc_table) enc_table->names = st_init_strcasetable_with_size(ENCODING_LIST_CAPA); } #define OnigEncodingASCII_8BIT OnigEncodingASCII -#define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) +#define ENC_REGISTER(enc) string_##enc = rb_enc_name(&OnigEncoding##enc); \ + enc_register_at(enc_table, ENCINDEX_##enc, string_##enc, &OnigEncoding##enc) ENC_REGISTER(ASCII_8BIT); ENC_REGISTER(UTF_8); ENC_REGISTER(US_ASCII); @@ -742,6 +750,7 @@ int rb_require_internal_silent(VALUE fname); static int load_encoding(const char *name) { + ASSERT_vm_unlocking(); VALUE enclib = rb_sprintf("enc/%s.so", name); VALUE debug = ruby_debug; VALUE errinfo; @@ -757,7 +766,7 @@ load_encoding(const char *name) enclib = rb_fstring(enclib); ruby_debug = Qfalse; errinfo = rb_errinfo(); - loaded = rb_require_internal_silent(enclib); + loaded = rb_require_internal_silent(enclib); // must run without VM_LOCK ruby_debug = debug; rb_set_errinfo(errinfo); @@ -781,6 +790,7 @@ enc_autoload_body(rb_encoding *enc) { rb_encoding *base; int i = 0; + ASSERT_vm_unlocking(); GLOBAL_ENC_TABLE_LOCKING(enc_table) { base = enc_table->list[ENC_TO_ENCINDEX(enc)].base; @@ -792,30 +802,32 @@ enc_autoload_body(rb_encoding *enc) } } while (enc_table->list[i].enc != base && (++i, 1)); } + } + - if (i != -1) { - if (base) { - bool do_register = true; - if (rb_enc_autoload_p(base)) { - if (rb_enc_autoload(base) < 0) { - do_register = false; - i = -1; - } + if (i != -1) { + if (base) { + bool do_register = true; + if (rb_enc_autoload_p(base)) { + if (rb_enc_autoload(base) < 0) { + do_register = false; + i = -1; } + } - i = enc->ruby_encoding_index; - if (do_register) { + if (do_register) { + GLOBAL_ENC_TABLE_LOCKING(enc_table) { + i = enc->ruby_encoding_index; enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base); ((rb_raw_encoding *)enc)->ruby_encoding_index = i; } - - i &= ENC_INDEX_MASK; - } - else { - i = -2; } - } + i &= ENC_INDEX_MASK; + } + else { + i = -2; + } } return i; @@ -824,6 +836,7 @@ enc_autoload_body(rb_encoding *enc) int rb_enc_autoload(rb_encoding *enc) { + ASSERT_vm_unlocking(); int i = enc_autoload_body(enc); if (i == -2) { i = load_encoding(rb_enc_name(enc)); @@ -843,6 +856,24 @@ rb_enc_autoload_p(rb_encoding *enc) int rb_enc_find_index(const char *name) { + ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary + size_t input_len = strlen(name); + switch(input_len) { + case 5: + if (STRCASECMP(name, string_UTF_8) == 0) { + return ENCINDEX_UTF_8; + } + case 8: + if (STRCASECMP(name, string_US_ASCII) == 0) { + return ENCINDEX_US_ASCII; + } + case 10: + if (STRCASECMP(name, string_ASCII_8BIT) == 0) { + return ENCINDEX_ASCII_8BIT; + } + default: + break; + } int i; GLOBAL_ENC_TABLE_LOCKING(enc_table) { i = enc_registered(enc_table, name); @@ -1019,7 +1050,6 @@ rb_enc_associate_index(VALUE obj, int idx) rb_encoding *enc; int oldidx, oldtermlen, termlen; -/* enc_check_capable(obj);*/ rb_check_frozen(obj); oldidx = rb_enc_get_index(obj); if (oldidx == idx) @@ -1526,6 +1556,9 @@ int rb_locale_charmap_index(void); int rb_locale_encindex(void) { + // `rb_locale_charmap_index` can call `enc_find_index`, which can + // load an encoding. This needs to be done without VM lock held. + ASSERT_vm_unlocking(); int idx = rb_locale_charmap_index(); if (idx < 0) idx = ENCINDEX_UTF_8; @@ -1584,6 +1617,10 @@ enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const cha /* Already set */ overridden = TRUE; + if (!NIL_P(encoding)) { + enc_check_encoding(encoding); // loads it if necessary. Needs to be done outside of VM lock. + } + GLOBAL_ENC_TABLE_LOCKING(enc_table) { if (NIL_P(encoding)) { def->index = -1; diff --git a/hash.c b/hash.c index 7ce1b768e0e831..de9bc97ea69cdf 100644 --- a/hash.c +++ b/hash.c @@ -5192,25 +5192,26 @@ env_enc_str_new(const char *ptr, long len, rb_encoding *enc) } static VALUE -env_str_new(const char *ptr, long len) +env_str_new(const char *ptr, long len, rb_encoding *enc) { - return env_enc_str_new(ptr, len, env_encoding()); + return env_enc_str_new(ptr, len, enc); } static VALUE -env_str_new2(const char *ptr) +env_str_new2(const char *ptr, rb_encoding *enc) { if (!ptr) return Qnil; - return env_str_new(ptr, strlen(ptr)); + return env_str_new(ptr, strlen(ptr), enc); } static VALUE getenv_with_lock(const char *name) { VALUE ret; + rb_encoding *enc = env_encoding(); ENV_LOCKING() { const char *val = getenv(name); - ret = env_str_new2(val); + ret = env_str_new2(val, enc); } return ret; } @@ -5773,13 +5774,14 @@ env_values(void) { VALUE ary = rb_ary_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_ary_push(ary, env_str_new2(s+1)); + rb_ary_push(ary, env_str_new2(s+1, enc)); } env++; } @@ -5865,14 +5867,15 @@ env_each_pair(VALUE ehash) VALUE ary = rb_ary_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_ary_push(ary, env_str_new(*env, s-*env)); - rb_ary_push(ary, env_str_new2(s+1)); + rb_ary_push(ary, env_str_new(*env, s-*env, enc)); + rb_ary_push(ary, env_str_new2(s+1, enc)); } env++; } @@ -6255,13 +6258,14 @@ env_to_a(VALUE _) { VALUE ary = rb_ary_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env), - env_str_new2(s+1))); + rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env, enc), + env_str_new2(s+1, enc))); } env++; } @@ -6509,6 +6513,7 @@ env_key(VALUE dmy, VALUE value) StringValue(value); VALUE str = Qnil; + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { @@ -6516,7 +6521,7 @@ env_key(VALUE dmy, VALUE value) if (s++) { long len = strlen(s); if (RSTRING_LEN(value) == len && strncmp(s, RSTRING_PTR(value), len) == 0) { - str = env_str_new(*env, s-*env-1); + str = env_str_new(*env, s-*env-1, enc); break; } } @@ -6533,13 +6538,14 @@ env_to_hash(void) { VALUE hash = rb_hash_new(); + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); while (*env) { char *s = strchr(*env, '='); if (s) { - rb_hash_aset(hash, env_str_new(*env, s-*env), - env_str_new2(s+1)); + rb_hash_aset(hash, env_str_new(*env, s-*env, enc), + env_str_new2(s+1, enc)); } env++; } @@ -6684,14 +6690,15 @@ env_shift(VALUE _) VALUE result = Qnil; VALUE key = Qnil; + rb_encoding *enc = env_encoding(); ENV_LOCKING() { char **env = GET_ENVIRON(environ); if (*env) { const char *p = *env; char *s = strchr(p, '='); if (s) { - key = env_str_new(p, s-p); - VALUE val = env_str_new2(getenv(RSTRING_PTR(key))); + key = env_str_new(p, s-p, enc); + VALUE val = env_str_new2(getenv(RSTRING_PTR(key)), enc); result = rb_assoc_new(key, val); } } diff --git a/include/ruby/internal/encoding/encoding.h b/include/ruby/internal/encoding/encoding.h index a58f9f2b1524b5..53412bd379bbda 100644 --- a/include/ruby/internal/encoding/encoding.h +++ b/include/ruby/internal/encoding/encoding.h @@ -67,6 +67,7 @@ enum ruby_encoding_consts { #define ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX /**< @old{RUBY_ENCODING_INLINE_MAX} */ #define ENCODING_SHIFT RUBY_ENCODING_SHIFT /**< @old{RUBY_ENCODING_SHIFT} */ #define ENCODING_MASK RUBY_ENCODING_MASK /**< @old{RUBY_ENCODING_MASK} */ +#define ENCODING_NAMELEN_MAX 63 /** * Destructively assigns the passed encoding to the passed object. The object diff --git a/include/ruby/st.h b/include/ruby/st.h index f35ab436037237..08ee3779c3cc75 100644 --- a/include/ruby/st.h +++ b/include/ruby/st.h @@ -187,6 +187,14 @@ CONSTFUNC(st_index_t rb_st_hash_start(st_index_t h)); void rb_hash_bulk_insert_into_st_table(long, const VALUE *, VALUE); +VALUE rb_managed_st_table_create_numtable(size_t capa); +VALUE rb_managed_st_table_create_strtable(size_t capa); +VALUE rb_managed_st_table_create_strcasetable(size_t capa); +int rb_managed_st_table_lookup(VALUE tbl, st_data_t key, st_data_t *value); +int rb_managed_st_table_insert(VALUE tbl, st_data_t key, st_data_t value); +void rb_managed_st_table_add_direct(VALUE tbl, st_data_t key, st_data_t value); +VALUE rb_managed_st_table_dup(VALUE old_table); + RUBY_SYMBOL_EXPORT_END #if defined(__cplusplus) diff --git a/st.c b/st.c index 195a16b8ad73e7..a7853df2d1760e 100644 --- a/st.c +++ b/st.c @@ -3206,4 +3206,106 @@ set_compact_table(set_table *tab) } } +static void +managed_st_table_free(void *data) +{ + st_table *tbl = (st_table *)data; + free(tbl->bins); + free(tbl->entries); +} + +static size_t +managed_st_table_memsize(const void *data) +{ + st_table *tbl = (st_table*)data; + return st_memsize(tbl) - sizeof(st_table); +} + +const rb_data_type_t rb_managed_st_table_type = { + .wrap_struct_name = "VM/managed_st_table", + .function = { + .dmark = NULL, // Nothing to mark + .dfree = (RUBY_DATA_FUNC)managed_st_table_free, + .dsize = managed_st_table_memsize, + }, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE, +}; + +static inline st_table * +managed_st_table_ptr(VALUE obj) +{ + RUBY_ASSERT(RB_TYPE_P(obj, T_DATA)); + RUBY_ASSERT(rb_typeddata_inherited_p(RTYPEDDATA_TYPE(obj), &rb_managed_st_table_type)); + + return RTYPEDDATA_GET_DATA(obj); +} + +static VALUE +rb_managed_st_table_create_type(const rb_data_type_t *type, const struct st_hash_type *table_type, size_t capa) +{ + struct st_table *tbl; + VALUE obj = TypedData_Make_Struct(0, struct st_table, type, tbl); + st_init_existing_table_with_size(tbl, table_type, capa); + return obj; +} + +VALUE +rb_managed_st_table_create_numtable(size_t capa) +{ + return rb_managed_st_table_create_type(&rb_managed_st_table_type, &type_numhash, capa); +} + +VALUE +rb_managed_st_table_create_strtable(size_t capa) +{ + return rb_managed_st_table_create_type(&rb_managed_st_table_type, &type_strhash, capa); +} + +VALUE +rb_managed_st_table_create_strcasetable(size_t capa) +{ + return rb_managed_st_table_create_type(&rb_managed_st_table_type, &type_strcasehash, capa); +} + +int +rb_managed_st_table_lookup(VALUE tbl, st_data_t key, VALUE *value) +{ + st_table *st = managed_st_table_ptr(tbl); + st_data_t *val = (st_data_t*)value; + return st_lookup(st, key, val); +} + +int +rb_managed_st_table_insert(VALUE tbl, st_data_t key, VALUE value) +{ + st_table *st = managed_st_table_ptr(tbl); + return st_insert(st, key, (st_data_t)value); +} + +void +rb_managed_st_table_add_direct(VALUE tbl, st_data_t key, st_data_t value) +{ + st_table *st = managed_st_table_ptr(tbl); + return st_add_direct(st, key, value); +} + +static int +managed_st_table_dup_i(st_data_t key, st_data_t val, st_data_t data) { + st_table *tbl = (st_table *)data; + st_insert(tbl, key, val); + return ST_CONTINUE; +} + +VALUE +rb_managed_st_table_dup(VALUE old_table) +{ + struct st_table *new_tbl; + VALUE obj = TypedData_Make_Struct(0, struct st_table, RTYPEDDATA_TYPE(old_table), new_tbl); + struct st_table *old_tbl = managed_st_table_ptr(old_table); + st_init_existing_table_with_size(new_tbl, old_tbl->type, old_tbl->num_entries+1); + st_foreach(old_tbl, managed_st_table_dup_i, (st_data_t)new_tbl); + + return obj; +} + #endif diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 63d37f4ba4ff90..2b6f8234ced64f 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -2320,6 +2320,46 @@ def test_newline_options assert_equal("A\nB\nC", s.encode(usascii, newline: :lf)) end + def test_ractor_lazy_load_encoding + assert_ractor("#{<<~"begin;"}\n#{<<~'end;'}") + begin; + rs = [] + autoload_encodings = Encoding.list.select { |e| e.inspect.include?("(autoload)") }.freeze + 7.times do + rs << Ractor.new(autoload_encodings) do |encodings| + str = "\u0300" + encodings.each do |enc| + str.encode(enc) rescue Encoding::UndefinedConversionError + end + end + end + + while rs.any? + r, _obj = Ractor.select(*rs) + rs.delete(r) + end + assert rs.empty? + end; + end + + def test_ractor_lazy_load_encoding_random + assert_ractor("#{<<~"begin;"}\n#{<<~'end;'}") + begin; + rs = [] + 100.times do + rs << Ractor.new do + "\u0300".encode(Encoding.list.sample) rescue Encoding::UndefinedConversionError + end + end + + while rs.any? + r, _obj = Ractor.select(*rs) + rs.delete(r) + end + assert rs.empty? + end; + end + private def assert_conversion_both_ways_utf8(utf8, raw, encoding) diff --git a/transcode.c b/transcode.c index d8cd90e56d5cf7..93b449ec703c6e 100644 --- a/transcode.c +++ b/transcode.c @@ -21,11 +21,14 @@ #include "internal/transcode.h" #include "ruby/encoding.h" #include "vm_sync.h" +#include "ruby_atomic.h" #include "transcode_data.h" +#include "id_table.h" #include "id.h" #define ENABLE_ECONV_NEWLINE_OPTION 1 +#define SRC_ENC_TO_DST_ENC_KEY_SIZE (ENCODING_NAMELEN_MAX * 2 + 2) /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ static VALUE rb_eUndefinedConversionError; @@ -62,6 +65,9 @@ static VALUE sym_finished; static VALUE sym_after_output; static VALUE sym_incomplete_input; +static VALUE fast_transcoder_path_table; +static VALUE fast_transcoder_entry_table; + static unsigned char * allocate_converted_string(const char *sname, const char *dname, const unsigned char *str, size_t len, @@ -204,12 +210,54 @@ rb_free_transcoder_table(void) st_free_table(transcoder_table); } +static void +gen_src_to_dst_encodings_key(const char *key_buf, const char *sname, const char *dname) +{ + char *p = (char*)key_buf; + size_t slen = strlen(sname); + memcpy(p, sname, slen); + p += slen; + memcpy(p, ":", 1); + p += 1; + size_t dlen = strlen(dname); + RUBY_ASSERT(slen + dlen + 1 < SRC_ENC_TO_DST_ENC_KEY_SIZE); + memcpy(p, dname, dlen); +} + static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname) { st_data_t val; st_table *table2; + transcoder_entry_t *entry = NULL; + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + char *key = NULL; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + while (1) { + VALUE tbl = fast_transcoder_entry_table; + VALUE entry_got; + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &entry_got)) { + entry = (transcoder_entry_t*)entry_got; + break; + } else { + if (!entry) { + entry = ALLOC(transcoder_entry_t); + entry->sname = sname; + entry->dname = dname; + entry->lib = NULL; + entry->transcoder = NULL; + } + VALUE new_tbl = rb_managed_st_table_dup(tbl); + if (!key) key = strdup(key_buf); + rb_managed_st_table_insert(new_tbl, (st_data_t)key, (VALUE)entry); + if (RUBY_ATOMIC_VALUE_CAS(fast_transcoder_entry_table, tbl, new_tbl) == tbl) { + break; + } + } + } + + // TODO: we should be able to remove this table soon RB_VM_LOCKING() { if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { val = (st_data_t)st_init_strcasetable(); @@ -217,32 +265,26 @@ make_transcoder_entry(const char *sname, const char *dname) } table2 = (st_table *)val; if (!st_lookup(table2, (st_data_t)dname, &val)) { - transcoder_entry_t *entry = ALLOC(transcoder_entry_t); - entry->sname = sname; - entry->dname = dname; - entry->lib = NULL; - entry->transcoder = NULL; val = (st_data_t)entry; st_add_direct(table2, (st_data_t)dname, val); + } else { + entry = (transcoder_entry_t*)val; } } - return (transcoder_entry_t *)val; + return entry; } static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname) { - st_data_t val = 0; - st_table *table2; - RB_VM_LOCKING() { - if (st_lookup(transcoder_table, (st_data_t)sname, &val)) { - table2 = (st_table *)val; - if (!st_lookup(table2, (st_data_t)dname, &val)) { - val = 0; - } - } + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + VALUE entry_val; + VALUE tbl = RUBY_ATOMIC_VALUE_LOAD(fast_transcoder_entry_table); + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &entry_val)) { + return (transcoder_entry_t*)entry_val; } - return (transcoder_entry_t *)val; + return NULL; } void @@ -340,7 +382,7 @@ transcode_search_path(const char *sname, const char *dname, bfs.queue_last_ptr = &q->next; bfs.queue = q; - bfs.visited = st_init_strcasetable(); + bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL); RB_VM_LOCKING() { @@ -351,14 +393,14 @@ transcode_search_path(const char *sname, const char *dname, bfs.queue_last_ptr = &bfs.queue; } - lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); + lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2 if (!lookup_res) { xfree(q); continue; } table2 = (st_table *)val; - if (st_lookup(table2, (st_data_t)dname, &val)) { + if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); xfree(q); found = true; @@ -411,8 +453,7 @@ int rb_require_internal_silent(VALUE fname); static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry) { - // changes result of entry->transcoder depending on if it's required or not, so needs lock - ASSERT_vm_locking(); + ASSERT_vm_unlocking(); if (entry->transcoder) return entry->transcoder; @@ -427,7 +468,7 @@ load_transcoder_entry(transcoder_entry_t *entry) memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len); rb_str_set_len(fn, total_len); OBJ_FREEZE(fn); - rb_require_internal_silent(fn); + rb_require_internal_silent(fn); // Sets entry->transcoder } if (entry->transcoder) @@ -977,19 +1018,19 @@ rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i) } static rb_econv_t * -rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries) +rb_econv_open_by_transcoder_entries(transcoder_entry_t **entries) { rb_econv_t *ec; int i, ret; - ASSERT_vm_locking(); - for (i = 0; i < n; i++) { + for (i = 0; entries && entries[i]; i++) { const rb_transcoder *tr; tr = load_transcoder_entry(entries[i]); if (!tr) return NULL; } + int n = i; ec = rb_econv_alloc(n); for (i = 0; i < n; i++) { @@ -1015,23 +1056,27 @@ trans_open_i(const char *sname, const char *dname, int depth, void *arg) struct trans_open_t *toarg = arg; if (!toarg->entries) { - toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional); + size_t num = depth+1+toarg->num_additional+1; + toarg->entries = ALLOC_N(transcoder_entry_t *, num); + memset(toarg->entries + num - 1, 0, sizeof(transcoder_entry_t*)); // last entry is 0 so we can loop over it } toarg->entries[depth] = get_transcoder_entry(sname, dname); } static rb_econv_t * -rb_econv_open0(const char *sname, const char *dname, int ecflags) +rb_econv_open0(rb_encoding *senc, const char *sname, rb_encoding *denc, const char *dname, int ecflags) { transcoder_entry_t **entries = NULL; int num_trans; rb_econv_t *ec; - ASSERT_vm_locking(); - /* Just check if sname and dname are defined */ - /* (This check is needed?) */ - if (*sname) rb_enc_find_index(sname); - if (*dname) rb_enc_find_index(dname); + // load encodings, if necessary + if (*sname && (!senc || !senc->max_enc_len)) { + rb_enc_find_index(sname); + } + if (*dname && (!denc || !denc->max_enc_len)) { + rb_enc_find_index(dname); + } if (*sname == '\0' && *dname == '\0') { num_trans = 0; @@ -1042,16 +1087,35 @@ rb_econv_open0(const char *sname, const char *dname, int ecflags) struct trans_open_t toarg; toarg.entries = NULL; toarg.num_additional = 0; - num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); - entries = toarg.entries; - if (num_trans < 0) { - xfree(entries); - return NULL; + char key_buf[SRC_ENC_TO_DST_ENC_KEY_SIZE] = { 0 }; + char *key = NULL; + gen_src_to_dst_encodings_key(key_buf, sname, dname); + VALUE managed_val; + while (1) { + VALUE tbl = fast_transcoder_path_table; + if (rb_managed_st_table_lookup(tbl, (st_data_t)key_buf, &managed_val)) { + entries = (transcoder_entry_t **)managed_val; + break; + } else { + if (!entries) { + num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg); + entries = toarg.entries; + if (num_trans < 0) { + xfree(entries); + return NULL; + } + } + VALUE new_tbl = rb_managed_st_table_dup(tbl); + if (!key) key = strdup(key_buf); + rb_managed_st_table_insert(new_tbl, (st_data_t)key, (VALUE)entries); + if (RUBY_ATOMIC_VALUE_CAS(fast_transcoder_path_table, tbl, new_tbl) == tbl) { + break; + } + } } } - ec = rb_econv_open_by_transcoder_entries(num_trans, entries); - xfree(entries); + ec = rb_econv_open_by_transcoder_entries(entries); if (!ec) return NULL; @@ -1105,8 +1169,8 @@ decorator_names(int ecflags, const char **decorators_ret) return num_decorators; } -rb_econv_t * -rb_econv_open(const char *sname, const char *dname, int ecflags) +static rb_econv_t * +rb_econv_open_enc(rb_encoding *senc, const char *sname, rb_encoding *denc, const char *dname, int ecflags) { rb_econv_t *ec; int num_decorators; @@ -1117,15 +1181,13 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) if (num_decorators == -1) return NULL; - RB_VM_LOCKING() { - ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); - if (ec) { - for (i = 0; i < num_decorators; i++) { - if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { - rb_econv_close(ec); - ec = NULL; - break; - } + ec = rb_econv_open0(senc, sname, denc, dname, ecflags & ECONV_ERROR_HANDLER_MASK); + if (ec) { + for (i = 0; i < num_decorators; i++) { + if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { + rb_econv_close(ec); + ec = NULL; + break; } } } @@ -1136,6 +1198,12 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) return ec; // can be NULL } +rb_econv_t * +rb_econv_open(const char *sname, const char *dname, int ecflags) +{ + return rb_econv_open_enc(NULL, sname, NULL, dname, ecflags); +} + static int trans_sweep(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, @@ -1849,7 +1917,6 @@ rb_econv_asciicompat_encoding(const char *ascii_incompat_name) } } - } return data.ascii_compat_name; // can be NULL @@ -1960,14 +2027,12 @@ rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int if (ec->started != 0) return -1; - RB_VM_LOCKING() { - entry = get_transcoder_entry(sname, dname); - if (entry) { - tr = load_transcoder_entry(entry); - } - + entry = get_transcoder_entry(sname, dname); + if (entry) { + tr = load_transcoder_entry(entry); } + return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1; } @@ -2342,11 +2407,16 @@ aref_fallback(VALUE fallback, VALUE c) return rb_funcallv_public(fallback, idAREF, 1, &c); } +static rb_econv_t * +rb_econv_open_opts_enc(rb_encoding *senc, const char *source_encoding, rb_encoding *denc, const char *destination_encoding, int ecflags, VALUE opthash); + static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), + rb_encoding *senc, + rb_encoding *denc, const char *src_encoding, const char *dst_encoding, int ecflags, @@ -2361,7 +2431,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, VALUE fallback = Qnil; VALUE (*fallback_func)(VALUE, VALUE) = 0; - ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); + ec = rb_econv_open_opts_enc(senc, src_encoding, denc, dst_encoding, ecflags, ecopts); if (!ec) rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); @@ -2666,8 +2736,8 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) return rb_econv_prepare_options(opthash, opts, 0); } -rb_econv_t * -rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) +static rb_econv_t * +rb_econv_open_opts_enc(rb_encoding *senc, const char *source_encoding, rb_encoding *denc, const char *destination_encoding, int ecflags, VALUE opthash) { rb_econv_t *ec; VALUE replacement; @@ -2681,27 +2751,31 @@ rb_econv_open_opts(const char *source_encoding, const char *destination_encoding replacement = rb_hash_aref(opthash, sym_replace); } - RB_VM_LOCKING() { - ec = rb_econv_open(source_encoding, destination_encoding, ecflags); - if (ec) { - if (!NIL_P(replacement)) { - int ret; - rb_encoding *enc = rb_enc_get(replacement); - - ret = rb_econv_set_replacement(ec, - (const unsigned char *)RSTRING_PTR(replacement), - RSTRING_LEN(replacement), - rb_enc_name(enc)); - if (ret == -1) { - rb_econv_close(ec); - ec = NULL; - } + ec = rb_econv_open_enc(senc, source_encoding, denc, destination_encoding, ecflags); + if (ec) { + if (!NIL_P(replacement)) { + int ret; + rb_encoding *enc = rb_enc_get(replacement); + + ret = rb_econv_set_replacement(ec, + (const unsigned char *)RSTRING_PTR(replacement), + RSTRING_LEN(replacement), + rb_enc_name(enc)); + if (ret == -1) { + rb_econv_close(ec); + ec = NULL; } } } return ec; // can be NULL } +rb_econv_t * +rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) +{ + return rb_econv_open_opts_enc(NULL, source_encoding, NULL, destination_encoding, ecflags, opthash); +} + static int enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p) { @@ -2762,7 +2836,7 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) long blen, slen; unsigned char *buf, *bp, *sp; const unsigned char *fromp; - rb_encoding *senc, *denc; + rb_encoding *senc = NULL, *denc = NULL; const char *sname, *dname; int dencidx; int explicitly_invalid_replace = TRUE; @@ -2831,7 +2905,7 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) dest = rb_str_tmp_new(blen); bp = (unsigned char *)RSTRING_PTR(dest); - transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts); + transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, senc, denc, sname, dname, ecflags, ecopts); if (fromp != sp+slen) { rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); } @@ -3132,10 +3206,8 @@ decorate_convpath(VALUE convpath, int ecflags) const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1))); transcoder_entry_t *entry; const rb_transcoder *tr; - RB_VM_LOCKING() { - entry = get_transcoder_entry(sname, dname); - tr = load_transcoder_entry(entry); - } + entry = get_transcoder_entry(sname, dname); + tr = load_transcoder_entry(entry); if (!tr) return -1; if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && @@ -4486,6 +4558,10 @@ void Init_transcode(void) { transcoder_table = st_init_strcasetable(); + fast_transcoder_path_table = rb_managed_st_table_create_strcasetable(8); // NOTE: size is arbitrarily chosen + rb_gc_register_address(&fast_transcoder_path_table); + fast_transcoder_entry_table = rb_managed_st_table_create_strcasetable(8); + rb_gc_register_address(&fast_transcoder_entry_table); id_destination_encoding = rb_intern_const("destination_encoding"); id_destination_encoding_name = rb_intern_const("destination_encoding_name");