Skip to content

Commit d25384c

Browse files
committed
merge revision(s) 44604,44605,44606: [Backport ruby#9415]
test_m17n.rb: split tests for inspect * test/ruby/test_m17n.rb (test_utf_16_32_inspect): split tests for each encoding. * string.c (get_actual_encoding): get actual encoding according to the BOM, if one exists. * string.c (rb_str_inspect): use the actual encoding instead of the pseudo encodings UTF-{16,32}. [ruby-core:59757] [Bug ruby#8940] * string.c (get_encoding): respect BOM on pseudo encodings. [ruby-dev:47895] [Bug ruby#9415] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_1@45074 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
1 parent 42f1c52 commit d25384c

File tree

5 files changed

+88
-37
lines changed

5 files changed

+88
-37
lines changed

ChangeLog

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
Fri Feb 21 16:47:20 2014 Nobuyoshi Nakada <[email protected]>
2+
3+
* string.c (get_encoding): respect BOM on pseudo encodings.
4+
[ruby-dev:47895] [Bug #9415]
5+
6+
Fri Feb 21 16:47:20 2014 Nobuyoshi Nakada <[email protected]>
7+
8+
* string.c (get_actual_encoding): get actual encoding according to
9+
the BOM, if one exists.
10+
11+
* string.c (rb_str_inspect): use the actual encoding instead of the
12+
pseudo encodings UTF-{16,32}. [ruby-core:59757] [Bug #8940]
13+
114
Fri Feb 21 13:39:21 2014 Charlie Somerville <[email protected]>
215

316
* compile.c (iseq_build_from_ary_body): Use :blockptr instead of :block

encoding.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,12 @@ rb_enc_from_index(int index)
598598
return enc_table.list[index].enc;
599599
}
600600

601+
/*
 * Return the encoding registered under `index`, obtained through
 * must_encindex().  The function has external linkage so that other
 * translation units can call it (string.c declares its own prototype).
 * NOTE(review): must_encindex() is not visible here; presumably it
 * validates the index and loads the encoding on demand -- confirm.
 */
rb_encoding *
602+
rb_enc_get_from_index(int index)
603+
{
604+
return must_encindex(index);
605+
}
606+
601607
int
602608
rb_enc_registered(const char *name)
603609
{

string.c

Lines changed: 45 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,45 @@ VALUE rb_cSymbol;
121121
#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
122122
#define STR_HEAP_SIZE(str) (RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
123123

124-
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
124+
#define STR_ENC_GET(str) get_encoding(str)
125+
126+
rb_encoding *rb_enc_get_from_index(int index);
127+
128+
/*
 * Resolve the encoding that should actually be used for `str`, whose
 * encoding index is `encidx`.
 *
 * For the pseudo encodings UTF-16 and UTF-32 the leading bytes of the
 * string are inspected for a byte order mark (BOM):
 *   - a big-endian BOM selects the matching *BE encoding,
 *   - a little-endian BOM selects the matching *LE encoding,
 *   - no BOM (but enough bytes to hold one) yields ASCII-8BIT.
 * If the string is too short to contain a BOM, or `encidx` is any other
 * encoding, the encoding registered for `encidx` is returned unchanged.
 */
static rb_encoding *
129+
get_actual_encoding(const int encidx, VALUE str)
130+
{
131+
const unsigned char *q;
132+
133+
switch (encidx) {
134+
case ENCINDEX_UTF_16:
135+
/* fewer than 2 bytes cannot hold a UTF-16 BOM: use the generic lookup below */
if (RSTRING_LEN(str) < 2) break;
136+
q = (const unsigned char *)RSTRING_PTR(str);
137+
/* FE FF: big-endian BOM */
if (q[0] == 0xFE && q[1] == 0xFF) {
138+
return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
139+
}
140+
/* FF FE: little-endian BOM */
if (q[0] == 0xFF && q[1] == 0xFE) {
141+
return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
142+
}
143+
/* no BOM: byte order unknown, so fall back to ASCII-8BIT */
return rb_ascii8bit_encoding();
144+
case ENCINDEX_UTF_32:
145+
/* fewer than 4 bytes cannot hold a UTF-32 BOM: use the generic lookup below */
if (RSTRING_LEN(str) < 4) break;
146+
q = (const unsigned char *)RSTRING_PTR(str);
147+
/* 00 00 FE FF: big-endian BOM */
if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
148+
return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
149+
}
150+
/* FF FE 00 00: little-endian BOM */
if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
151+
return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
152+
}
153+
/* no BOM: byte order unknown, so fall back to ASCII-8BIT */
return rb_ascii8bit_encoding();
154+
}
155+
/* any other encoding, or a UTF-16/32 string too short for a BOM */
return rb_enc_from_index(encidx);
156+
}
157+
158+
/*
 * Return the effective encoding of `str`: reads the string's encoding
 * index with ENCODING_GET() and lets get_actual_encoding() resolve the
 * UTF-16/UTF-32 pseudo encodings from the BOM.  This is the function the
 * STR_ENC_GET() macro expands to.
 */
static rb_encoding *
159+
get_encoding(VALUE str)
160+
{
161+
return get_actual_encoding(ENCODING_GET(str), str);
162+
}
125163

126164
static int fstring_cmp(VALUE a, VALUE b);
127165

@@ -4750,8 +4788,8 @@ rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
47504788
VALUE
47514789
rb_str_inspect(VALUE str)
47524790
{
4753-
rb_encoding *enc = STR_ENC_GET(str);
4754-
int encidx = rb_enc_to_index(enc);
4791+
int encidx = ENCODING_GET(str);
4792+
rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
47554793
const char *p, *pend, *prev;
47564794
char buf[CHAR_ESC_LEN + 1];
47574795
VALUE result = rb_str_buf_new(0);
@@ -4766,27 +4804,10 @@ rb_str_inspect(VALUE str)
47664804

47674805
p = RSTRING_PTR(str); pend = RSTRING_END(str);
47684806
prev = p;
4769-
if (encidx == ENCINDEX_UTF_16 && p + 2 <= pend) {
4770-
const unsigned char *q = (const unsigned char *)p;
4771-
if (q[0] == 0xFE && q[1] == 0xFF)
4772-
enc = rb_enc_from_index(ENCINDEX_UTF_16BE);
4773-
else if (q[0] == 0xFF && q[1] == 0xFE)
4774-
enc = rb_enc_from_index(ENCINDEX_UTF_16LE);
4775-
else {
4776-
enc = rb_ascii8bit_encoding();
4777-
unicode_p = 0;
4778-
}
4779-
}
4780-
else if (encidx == ENCINDEX_UTF_32 && p + 4 <= pend) {
4781-
const unsigned char *q = (const unsigned char *)p;
4782-
if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
4783-
enc = rb_enc_from_index(ENCINDEX_UTF_32BE);
4784-
else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
4785-
enc = rb_enc_from_index(ENCINDEX_UTF_32LE);
4786-
else {
4787-
enc = rb_ascii8bit_encoding();
4788-
unicode_p = 0;
4789-
}
4807+
actenc = get_actual_encoding(encidx, str);
4808+
if (actenc != enc) {
4809+
enc = actenc;
4810+
if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
47904811
}
47914812
while (p < pend) {
47924813
unsigned int c, cc;

test/ruby/test_m17n.rb

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -226,24 +226,35 @@ def test_string_inspect_encoding
226226
end
227227
end
228228

229-
def test_utf_16_32_inspect
230-
str = "\u3042"
231-
%w/UTF-16 UTF-32/.each do |enc|
232-
%w/BE LE/.each do |endian|
233-
s = str.encode(enc + endian)
229+
STR_WITHOUT_BOM = "\u3042".freeze
230+
STR_WITH_BOM = "\uFEFF\u3042".freeze
231+
bug8940 = '[ruby-core:59757] [Bug #8940]'
232+
bug9415 = '[ruby-dev:47895] [Bug #9415]'
233+
%w/UTF-16 UTF-32/.each do |enc|
234+
%w/BE LE/.each do |endian|
235+
bom = "\uFEFF".encode("#{enc}#{endian}").force_encoding(enc)
236+
237+
define_method("test_utf_16_32_inspect(#{enc}#{endian})") do
238+
s = STR_WITHOUT_BOM.encode(enc + endian)
234239
# When a UTF-16/32 string doesn't have a BOM,
235240
# inspect as a dummy encoding string.
236241
assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect,
237242
s.dup.force_encoding(enc).inspect)
243+
assert_normal_exit("#{bom.b.dump}.force_encoding('#{enc}').inspect", bug8940)
238244
end
239-
end
240245

241-
str = "\uFEFF\u3042"
242-
%w/UTF-16 UTF-32/.each do |enc|
243-
%w/BE LE/.each do |endian|
244-
s = str.encode(enc + endian)
245-
# When a UTF-16/32 string doesn't have a BOM,
246-
# inspect as a dummy encoding string.
246+
define_method("test_utf_16_32_codepoints(#{enc}#{endian})") do
247+
assert_equal([0xFEFF], bom.codepoints, bug9415)
248+
end
249+
250+
define_method("test_utf_16_32_ord(#{enc}#{endian})") do
251+
assert_equal(0xFEFF, bom.ord, bug9415)
252+
end
253+
254+
define_method("test_utf_16_32_inspect(#{enc}#{endian}-BOM)") do
255+
s = STR_WITH_BOM.encode(enc + endian)
256+
# When a UTF-16/32 string has a BOM,
257+
# inspect as a particular encoding string.
247258
assert_equal(s.inspect,
248259
s.dup.force_encoding(enc).inspect)
249260
end

version.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#define RUBY_VERSION "2.1.1"
22
#define RUBY_RELEASE_DATE "2014-02-21"
3-
#define RUBY_PATCHLEVEL 40
3+
#define RUBY_PATCHLEVEL 41
44

55
#define RUBY_RELEASE_YEAR 2014
66
#define RUBY_RELEASE_MONTH 2

0 commit comments

Comments
 (0)