Skip to content

Commit 00bf182

Browse files
author
Charlie Somerville
committed
Merge branch 'github-encoding-compatibility' into github-2.0.0
2 parents 8bbcbef + 9ec1662 commit 00bf182

File tree

6 files changed

+71
-10
lines changed

6 files changed

+71
-10
lines changed

enc/trans/single_byte.trans

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
<%
44
us_ascii_map = [["{00-7f}", :nomap]]
5+
binary_map = [["{00-ff}", :nomap]]
56

67
transcode_tblgen "US-ASCII", "UTF-8", us_ascii_map
78
transcode_tblgen "UTF-8", "US-ASCII", us_ascii_map
89
transcode_tblgen "ASCII-8BIT", "UTF-8", us_ascii_map
910
transcode_tblgen "UTF-8", "ASCII-8BIT", us_ascii_map
11+
transcode_tblgen "UTF-8-COMPAT", "ASCII-8BIT", binary_map, '{00-ff}'
1012

1113
CONTROL1_TO_UCS_TBL = (0x80..0x9f).map {|c| ["%02X" % c, c] }
1214

@@ -84,8 +86,14 @@
8486

8587
<%= transcode_generated_code %>
8688

89+
/* Global compat-mode flag; defined as `int` in encoding.c and set to 1 by the
 * "encoding-compatibility" command-line option in ruby.c. */
extern int rb_encoding_compat;
90+
8791
/*
 * Initialiser for the single_byte transcoder table.
 *
 * When rb_encoding_compat is non-zero, the conv_tree_start of both
 * rb_from_ASCII_8BIT and rb_to_ASCII_8BIT is overwritten with the
 * from_UTF_8_COMPAT_to_ASCII_8BIT table; this happens before the generated
 * registration code below is reached.
 */
TRANS_INIT(single_byte)
8892
{
93+
if (rb_encoding_compat) {
94+
/* NOTE(review): the explicit casts suggest these transcoder objects are
 * declared const; confirm that writing through the cast is safe. */
((struct rb_transcoder *)&rb_from_ASCII_8BIT)->conv_tree_start = from_UTF_8_COMPAT_to_ASCII_8BIT;
95+
((struct rb_transcoder *) &rb_to_ASCII_8BIT)->conv_tree_start = from_UTF_8_COMPAT_to_ASCII_8BIT;
96+
}
8997
<%= transcode_register_code %>
9098
}
9199

encoding.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ void rb_encdb_set_unicode(int index);
3636
#pragma GCC visibility pop
3737
#endif
3838

39+
int rb_encoding_compat;
40+
3941
static ID id_encoding;
4042
VALUE rb_cEncoding;
4143
static VALUE rb_encoding_list;
@@ -843,6 +845,14 @@ rb_enc_compatible(VALUE str1, VALUE str2)
843845
if (cr2 == ENC_CODERANGE_7BIT) {
844846
return enc1;
845847
}
848+
if (rb_encoding_compat) {
849+
if (idx1 == ENCINDEX_UTF_8 && idx2 == ENCINDEX_ASCII) {
850+
return enc2;
851+
}
852+
else if (idx1 == ENCINDEX_ASCII && idx2 == ENCINDEX_UTF_8) {
853+
return enc1;
854+
}
855+
}
846856
}
847857
if (cr1 == ENC_CODERANGE_7BIT)
848858
return enc2;
@@ -935,7 +945,10 @@ rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
935945
rb_raise(rb_eArgError, "empty string");
936946
r = rb_enc_precise_mbclen(p, e, enc);
937947
if (!MBCLEN_CHARFOUND_P(r)) {
938-
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
948+
if (rb_encoding_compat && enc == rb_utf8_encoding() && MBCLEN_CHARFOUND_P(r = rb_enc_precise_mbclen(p, e, rb_ascii8bit_encoding())))
949+
enc = rb_ascii8bit_encoding();
950+
else
951+
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
939952
}
940953
if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
941954
return rb_enc_mbc_to_codepoint(p, e, enc);
@@ -1420,6 +1433,12 @@ get_default_internal(VALUE klass)
14201433
return rb_enc_default_internal();
14211434
}
14221435

1436+
/*
 * Implementation of the Encoding singleton method "compat_mode_enabled?"
 * (registered with zero arguments in Init_Encoding).
 *
 * klass is the receiver and is not used.
 * Returns Qtrue when the global rb_encoding_compat flag is non-zero,
 * Qfalse otherwise.
 */
static VALUE
1437+
rb_enc_compat_mode_enabled_p(VALUE klass)
1438+
{
1439+
return rb_encoding_compat ? Qtrue : Qfalse;
1440+
}
1441+
14231442
void
14241443
rb_enc_set_default_internal(VALUE encoding)
14251444
{
@@ -1872,6 +1891,9 @@ Init_Encoding(void)
18721891
for (i = 0; i < enc_table.count; ++i) {
18731892
rb_ary_push(list, enc_new(enc_table.list[i].enc));
18741893
}
1894+
1895+
rb_const_set(rb_cEncoding, rb_intern_const("COMPAT_MODE_AVAILABLE"), Qtrue);
1896+
rb_define_singleton_method(rb_cEncoding, "compat_mode_enabled?", rb_enc_compat_mode_enabled_p, 0);
18751897
}
18761898

18771899
/* locale insensitive ctype functions */

include/ruby/encoding.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ extern "C" {
7777
ENC_CODERANGE_SET(rb_encoding_coderange_obj, (cr)); \
7878
} while (0)
7979

80+
extern int rb_encoding_compat;
8081
typedef OnigEncodingType rb_encoding;
8182

8283
int rb_char_to_option_kcode(int c, int *option, int *kcode);

re.c

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1247,15 +1247,16 @@ static rb_encoding*
12471247
rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
12481248
{
12491249
rb_encoding *enc = 0;
1250+
enc = rb_enc_get(str);
12501251

12511252
if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
1252-
rb_raise(rb_eArgError,
1253-
"invalid byte sequence in %s",
1254-
rb_enc_name(rb_enc_get(str)));
1253+
if (!(rb_encoding_compat && enc == rb_utf8_encoding()))
1254+
rb_raise(rb_eArgError,
1255+
"invalid byte sequence in %s",
1256+
rb_enc_name(rb_enc_get(str)));
12551257
}
12561258

12571259
rb_reg_check(re);
1258-
enc = rb_enc_get(str);
12591260
if (!rb_enc_str_asciicompat_p(str)) {
12601261
if (RREGEXP(re)->ptr->enc != enc) {
12611262
reg_enc_error(re, str);
@@ -1265,15 +1266,21 @@ rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
12651266
if (RREGEXP(re)->ptr->enc != enc &&
12661267
(!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
12671268
rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
1269+
if (rb_encoding_compat &&
1270+
((RREGEXP(re)->ptr->enc == rb_ascii8bit_encoding() && enc == rb_utf8_encoding()) ||
1271+
(enc == rb_ascii8bit_encoding() && RREGEXP(re)->ptr->enc == rb_utf8_encoding()))) {
1272+
return rb_ascii8bit_encoding();
1273+
}
12681274
reg_enc_error(re, str);
12691275
}
12701276
enc = RREGEXP(re)->ptr->enc;
12711277
}
12721278
if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
12731279
enc != rb_ascii8bit_encoding() &&
12741280
rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1275-
rb_warn("regexp match /.../n against to %s string",
1276-
rb_enc_name(enc));
1281+
if (!(rb_encoding_compat && enc == rb_utf8_encoding()))
1282+
rb_warn("regexp match /.../n against to %s string",
1283+
rb_enc_name(enc));
12771284
}
12781285
return enc;
12791286
}

ruby.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,9 @@ proc_options(long argc, char **argv, struct cmdline_options *opt, int envopt)
10981098
set_source_encoding_once(opt, s, 0);
10991099
}
11001100
#endif
1101+
else if (strcmp("encoding-compatibility", s) == 0) {
1102+
rb_encoding_compat = 1;
1103+
}
11011104
else if (strcmp("version", s) == 0) {
11021105
if (envopt) goto noenvopt_long;
11031106
opt->dump |= DUMP_BIT(version);

string.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2019,7 +2019,12 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
20192019
if (ptr_cr_ret)
20202020
*ptr_cr_ret = ptr_cr;
20212021

2022-
if (str_encindex != ptr_encindex &&
2022+
if (rb_encoding_compat &&
2023+
((str_encindex == rb_utf8_encindex() && ptr_encindex == rb_ascii8bit_encindex()) ||
2024+
(str_encindex == rb_ascii8bit_encindex() && ptr_encindex == rb_utf8_encindex()))) {
2025+
/* fall through to conditional below */
2026+
}
2027+
else if (str_encindex != ptr_encindex &&
20232028
str_cr != ENC_CODERANGE_7BIT &&
20242029
ptr_cr != ENC_CODERANGE_7BIT) {
20252030
incompatible:
@@ -2028,7 +2033,14 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
20282033
rb_enc_name(rb_enc_from_index(ptr_encindex)));
20292034
}
20302035

2031-
if (str_cr == ENC_CODERANGE_UNKNOWN) {
2036+
if (rb_encoding_compat &&
2037+
str_encindex != ptr_encindex &&
2038+
str_cr != ENC_CODERANGE_7BIT && ptr_cr != ENC_CODERANGE_7BIT) {
2039+
/* from fall through above */
2040+
res_encindex = rb_ascii8bit_encindex();
2041+
res_cr = ENC_CODERANGE_VALID;
2042+
}
2043+
else if (str_cr == ENC_CODERANGE_UNKNOWN) {
20322044
res_encindex = str_encindex;
20332045
res_cr = ENC_CODERANGE_UNKNOWN;
20342046
}
@@ -2240,6 +2252,8 @@ rb_str_hash(VALUE str)
22402252
if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
22412253
e = 0;
22422254
}
2255+
/* Compat mode only: zero the encoding index for UTF-8 and ASCII-8BIT so it
 * does not perturb the hash. Parenthesised because `&&` binds tighter than
 * `||`; without the grouping the ASCII-8BIT test ignored rb_encoding_compat. */
if (rb_encoding_compat && (e == rb_utf8_encindex() || e == rb_ascii8bit_encindex()))
2256+
e = 0;
22432257
return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
22442258
}
22452259

@@ -2294,6 +2308,11 @@ rb_str_comparable(VALUE str1, VALUE str2)
22942308
if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
22952309
return TRUE;
22962310
}
2311+
if (rb_encoding_compat &&
2312+
((idx1 == rb_utf8_encindex() && idx2 == rb_ascii8bit_encindex()) ||
2313+
(idx1 == rb_ascii8bit_encindex() && idx2 == rb_utf8_encindex()))) {
2314+
return TRUE;
2315+
}
22972316
return FALSE;
22982317
}
22992318

@@ -6034,7 +6053,8 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
60346053
long slen = RSTRING_LEN(spat);
60356054

60366055
if (is_broken_string(str)) {
6037-
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
6056+
if (!(rb_encoding_compat && STR_ENC_GET(str) == rb_utf8_encoding()))
6057+
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
60386058
}
60396059
if (is_broken_string(spat)) {
60406060
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));

0 commit comments

Comments
 (0)