Skip to content

Fix str encode in ractors #674

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 65 additions & 36 deletions encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "ruby/util.h"
#include "ruby_assert.h"
#include "vm_sync.h"
#include "ruby_atomic.h"

#ifndef ENC_DEBUG
#define ENC_DEBUG 0
Expand Down Expand Up @@ -144,10 +145,14 @@ enc_list_update(int index, rb_raw_encoding *encoding)
{
RUBY_ASSERT(index < ENCODING_LIST_CAPA);

VALUE list = rb_encoding_list;
VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);

if (list && NIL_P(rb_ary_entry(list, index))) {
VALUE new_list = rb_ary_dup(list);
RBASIC_CLEAR_CLASS(new_list);
/* initialize encoding data */
rb_ary_store(list, index, enc_new(encoding));
rb_ary_store(new_list, index, enc_new(encoding));
RUBY_ATOMIC_VALUE_SET(rb_encoding_list, new_list);
}
}

Expand All @@ -157,7 +162,7 @@ enc_list_lookup(int idx)
VALUE list, enc = Qnil;

if (idx < ENCODING_LIST_CAPA) {
list = rb_encoding_list;
list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
RUBY_ASSERT(list);
enc = rb_ary_entry(list, idx);
}
Expand Down Expand Up @@ -258,6 +263,7 @@ must_encindex(int index)
int
rb_to_encoding_index(VALUE enc)
{
ASSERT_vm_unlocking(); // can load encoding, so must not hold VM lock
int idx;
const char *name;

Expand Down Expand Up @@ -667,15 +673,15 @@ int
rb_enc_alias(const char *alias, const char *orig)
{
int idx, r;
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
enc_check_addable(enc_table, alias); // can raise
}

idx = rb_enc_find_index(orig);
if (idx < 0) return -1;

GLOBAL_ENC_TABLE_LOCKING(enc_table) {
enc_check_addable(enc_table, alias);
if ((idx = rb_enc_find_index(orig)) < 0) {
r = -1;
}
else {
r = enc_alias(enc_table, alias, idx);
}
r = enc_alias(enc_table, alias, idx);
}

return r;
Expand Down Expand Up @@ -742,6 +748,7 @@ int rb_require_internal_silent(VALUE fname);
static int
load_encoding(const char *name)
{
ASSERT_vm_unlocking();
VALUE enclib = rb_sprintf("enc/%s.so", name);
VALUE debug = ruby_debug;
VALUE errinfo;
Expand All @@ -757,7 +764,7 @@ load_encoding(const char *name)
enclib = rb_fstring(enclib);
ruby_debug = Qfalse;
errinfo = rb_errinfo();
loaded = rb_require_internal_silent(enclib);
loaded = rb_require_internal_silent(enclib); // must run without VM_LOCK
ruby_debug = debug;
rb_set_errinfo(errinfo);

Expand All @@ -781,6 +788,7 @@ enc_autoload_body(rb_encoding *enc)
{
rb_encoding *base;
int i = 0;
ASSERT_vm_unlocking();

GLOBAL_ENC_TABLE_LOCKING(enc_table) {
base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
Expand All @@ -792,30 +800,32 @@ enc_autoload_body(rb_encoding *enc)
}
} while (enc_table->list[i].enc != base && (++i, 1));
}
}


if (i != -1) {
if (base) {
bool do_register = true;
if (rb_enc_autoload_p(base)) {
if (rb_enc_autoload(base) < 0) {
do_register = false;
i = -1;
}
if (i != -1) {
if (base) {
bool do_register = true;
if (rb_enc_autoload_p(base)) {
if (rb_enc_autoload(base) < 0) {
do_register = false;
i = -1;
}
}

i = enc->ruby_encoding_index;
if (do_register) {
if (do_register) {
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
i = enc->ruby_encoding_index;
enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base);
((rb_raw_encoding *)enc)->ruby_encoding_index = i;
}

i &= ENC_INDEX_MASK;
}
else {
i = -2;
}
}

i &= ENC_INDEX_MASK;
}
else {
i = -2;
}
}

return i;
Expand All @@ -824,6 +834,7 @@ enc_autoload_body(rb_encoding *enc)
int
rb_enc_autoload(rb_encoding *enc)
{
ASSERT_vm_unlocking();
int i = enc_autoload_body(enc);
if (i == -2) {
i = load_encoding(rb_enc_name(enc));
Expand All @@ -844,6 +855,7 @@ int
rb_enc_find_index(const char *name)
{
int i;
ASSERT_vm_unlocking(); // it needs to be unlocked so it can call `load_encoding` if necessary
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
i = enc_registered(enc_table, name);
}
Expand Down Expand Up @@ -1019,7 +1031,6 @@ rb_enc_associate_index(VALUE obj, int idx)
rb_encoding *enc;
int oldidx, oldtermlen, termlen;

/* enc_check_capable(obj);*/
rb_check_frozen(obj);
oldidx = rb_enc_get_index(obj);
if (oldidx == idx)
Expand Down Expand Up @@ -1355,7 +1366,10 @@ enc_names(VALUE self)

args[0] = (VALUE)rb_to_encoding_index(self);
args[1] = rb_ary_new2(0);
st_foreach(global_enc_table.names, enc_names_i, (st_data_t)args);

GLOBAL_ENC_TABLE_LOCKING(enc_table) {
st_foreach(enc_table->names, enc_names_i, (st_data_t)args);
}
return args[1];
}

Expand All @@ -1380,8 +1394,9 @@ enc_names(VALUE self)
static VALUE
enc_list(VALUE klass)
{
VALUE ary = rb_ary_new2(0);
rb_ary_replace(ary, rb_encoding_list);
VALUE ary = rb_ary_new2(ENCODING_LIST_CAPA);
VALUE list = RUBY_ATOMIC_VALUE_LOAD(rb_encoding_list);
rb_ary_replace(ary, list);
return ary;
}

Expand Down Expand Up @@ -1526,6 +1541,9 @@ int rb_locale_charmap_index(void);
int
rb_locale_encindex(void)
{
// `rb_locale_charmap_index` can call `enc_find_index`, which can
// load an encoding. This needs to be done without VM lock held.
ASSERT_vm_unlocking();
int idx = rb_locale_charmap_index();

if (idx < 0) idx = ENCINDEX_UTF_8;
Expand Down Expand Up @@ -1584,6 +1602,10 @@ enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const cha
/* Already set */
overridden = TRUE;

if (!NIL_P(encoding)) {
enc_check_encoding(encoding); // loads it if necessary. Needs to be done outside of VM lock.
}

GLOBAL_ENC_TABLE_LOCKING(enc_table) {
if (NIL_P(encoding)) {
def->index = -1;
Expand Down Expand Up @@ -1854,8 +1876,11 @@ rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
static VALUE
rb_enc_name_list(VALUE klass)
{
VALUE ary = rb_ary_new2(global_enc_table.names->num_entries);
st_foreach(global_enc_table.names, rb_enc_name_list_i, (st_data_t)ary);
VALUE ary;
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
ary = rb_ary_new2(enc_table->names->num_entries);
st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
}
return ary;
}

Expand Down Expand Up @@ -1901,7 +1926,9 @@ rb_enc_aliases(VALUE klass)
aliases[0] = rb_hash_new();
aliases[1] = rb_ary_new();

st_foreach(global_enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases);
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases);
}

return aliases[0];
}
Expand Down Expand Up @@ -1969,9 +1996,9 @@ Init_Encoding(void)

struct enc_table *enc_table = &global_enc_table;

rb_gc_register_address(&rb_encoding_list);
list = rb_encoding_list = rb_ary_new2(ENCODING_LIST_CAPA);
RBASIC_CLEAR_CLASS(list);
rb_vm_register_global_object(list);

for (i = 0; i < enc_table->count; ++i) {
rb_ary_push(list, enc_new(enc_table->list[i].enc));
Expand Down Expand Up @@ -2003,5 +2030,7 @@ Init_encodings(void)
void
rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
{
st_foreach(global_enc_table.names, func, arg);
GLOBAL_ENC_TABLE_LOCKING(enc_table) {
st_foreach(enc_table->names, func, arg);
}
}
37 changes: 22 additions & 15 deletions hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -5192,25 +5192,26 @@ env_enc_str_new(const char *ptr, long len, rb_encoding *enc)
}

static VALUE
env_str_new(const char *ptr, long len)
env_str_new(const char *ptr, long len, rb_encoding *enc)
{
return env_enc_str_new(ptr, len, env_encoding());
return env_enc_str_new(ptr, len, enc);
}

static VALUE
env_str_new2(const char *ptr)
env_str_new2(const char *ptr, rb_encoding *enc)
{
if (!ptr) return Qnil;
return env_str_new(ptr, strlen(ptr));
return env_str_new(ptr, strlen(ptr), enc);
}

static VALUE
getenv_with_lock(const char *name)
{
VALUE ret;
rb_encoding *enc = env_encoding();
ENV_LOCKING() {
const char *val = getenv(name);
ret = env_str_new2(val);
ret = env_str_new2(val, enc);
}
return ret;
}
Expand Down Expand Up @@ -5773,13 +5774,14 @@ env_values(void)
{
VALUE ary = rb_ary_new();

rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);

while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_ary_push(ary, env_str_new2(s+1));
rb_ary_push(ary, env_str_new2(s+1, enc));
}
env++;
}
Expand Down Expand Up @@ -5865,14 +5867,15 @@ env_each_pair(VALUE ehash)

VALUE ary = rb_ary_new();

rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);

while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_ary_push(ary, env_str_new(*env, s-*env));
rb_ary_push(ary, env_str_new2(s+1));
rb_ary_push(ary, env_str_new(*env, s-*env, enc));
rb_ary_push(ary, env_str_new2(s+1, enc));
}
env++;
}
Expand Down Expand Up @@ -6255,13 +6258,14 @@ env_to_a(VALUE _)
{
VALUE ary = rb_ary_new();

rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env),
env_str_new2(s+1)));
rb_ary_push(ary, rb_assoc_new(env_str_new(*env, s-*env, enc),
env_str_new2(s+1, enc)));
}
env++;
}
Expand Down Expand Up @@ -6509,14 +6513,15 @@ env_key(VALUE dmy, VALUE value)
StringValue(value);
VALUE str = Qnil;

rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s++) {
long len = strlen(s);
if (RSTRING_LEN(value) == len && strncmp(s, RSTRING_PTR(value), len) == 0) {
str = env_str_new(*env, s-*env-1);
str = env_str_new(*env, s-*env-1, enc);
break;
}
}
Expand All @@ -6533,13 +6538,14 @@ env_to_hash(void)
{
VALUE hash = rb_hash_new();

rb_encoding *enc = env_encoding();
ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
while (*env) {
char *s = strchr(*env, '=');
if (s) {
rb_hash_aset(hash, env_str_new(*env, s-*env),
env_str_new2(s+1));
rb_hash_aset(hash, env_str_new(*env, s-*env, enc),
env_str_new2(s+1, enc));
}
env++;
}
Expand Down Expand Up @@ -6684,14 +6690,15 @@ env_shift(VALUE _)
VALUE result = Qnil;
VALUE key = Qnil;

rb_encoding *enc = env_encoding();
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So env_encoding calls rb_locale_encoding which calls rb_locale_encindex(), which acquire the VM lock, so this isn't helping much.

I'd start by making rb_locale_encindex() lock-free in most case. A simple atomic check to see if the locale encoding was initialized should suffice.

Copy link

@luke-gruber luke-gruber Aug 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was more that env_encoding could load the encoding, and that needed to be done outside the VM lock or else it deadlocks but I'll look into making it atomic 👍 It shouldn't load the encoding often, but native exts could call setlocale() in which case it could.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see. That makes sense.

Perhaps it doesn't matter, as this is only used when accessing ENV, which hopefulyl wouldn't be in a hotspot. But yeah, the amount of work done every time is surprising.

I'll look into making it atomic

No need. Let's make if safe. We'll see about optimizing later.

ENV_LOCKING() {
char **env = GET_ENVIRON(environ);
if (*env) {
const char *p = *env;
char *s = strchr(p, '=');
if (s) {
key = env_str_new(p, s-p);
VALUE val = env_str_new2(getenv(RSTRING_PTR(key)));
key = env_str_new(p, s-p, enc);
VALUE val = env_str_new2(getenv(RSTRING_PTR(key)), enc);
result = rb_assoc_new(key, val);
}
}
Expand Down
Loading
Loading