Skip to content

Commit 939379f

Browse files
committed
[GR-41361] Backports for 22.3 batch 2
PullRequest: truffleruby/3509
2 parents 84b886d + c83ae6d commit 939379f

File tree

9 files changed

+91
-52
lines changed

9 files changed

+91
-52
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Bug fixes:
1818
* Fix `String#split` missing a value in its return array when called with a pattern of `" "` and a _limit_ value > 0 on a string with trailing whitespace where the limit hasn't been met (@nirvdrum).
1919
* Fix `Kernel#sleep` and `Mutex#sleep` for durations smaller than 1 millisecond (#2716, @eregon).
2020
* Fix `IO#{wait,wait_readable,wait_writable}` with a timeout > INT_MAX seconds (@eregon).
21+
* Use the compatible encoding for `String#{sub,gsub,index,rindex}` (#2749, @eregon).
2122

2223
Compatibility:
2324

spec/ruby/core/string/gsub_spec.rb

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,6 @@ def replacement.to_str() "hello_replacement" end
210210
end
211211
end
212212

213-
# Note: $~ cannot be tested because mspec messes with it
214-
215213
it "sets $~ to MatchData of last match and nil when there's none" do
216214
'hello.'.gsub('hello', 'x')
217215
$~[0].should == 'hello'
@@ -225,6 +223,18 @@ def replacement.to_str() "hello_replacement" end
225223
'hello.'.gsub(/not/, 'x')
226224
$~.should == nil
227225
end
226+
227+
it "handles a pattern in a superset encoding" do
228+
result = 'abc'.force_encoding(Encoding::US_ASCII).gsub('é', 'è')
229+
result.should == 'abc'
230+
result.encoding.should == Encoding::US_ASCII
231+
end
232+
233+
it "handles a pattern in a subset encoding" do
234+
result = 'été'.gsub('t'.force_encoding(Encoding::US_ASCII), 'u')
235+
result.should == 'éué'
236+
result.encoding.should == Encoding::UTF_8
237+
end
228238
end
229239

230240
describe "String#gsub with pattern and Hash" do

spec/ruby/core/string/index_spec.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,14 @@
159159
"あれ".index char
160160
end.should raise_error(Encoding::CompatibilityError)
161161
end
162+
163+
it "handles a substring in a superset encoding" do
164+
'abc'.force_encoding(Encoding::US_ASCII).index('é').should == nil
165+
end
166+
167+
it "handles a substring in a subset encoding" do
168+
'été'.index('t'.force_encoding(Encoding::US_ASCII)).should == 1
169+
end
162170
end
163171

164172
describe "String#index with Regexp" do

spec/ruby/core/string/rindex_spec.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ def obj.method_missing(*args) 5 end
196196
it "raises a TypeError when given offset is nil" do
197197
-> { "str".rindex("st", nil) }.should raise_error(TypeError)
198198
end
199+
200+
it "handles a substring in a superset encoding" do
201+
'abc'.force_encoding(Encoding::US_ASCII).rindex('é').should == nil
202+
end
203+
204+
it "handles a substring in a subset encoding" do
205+
'été'.rindex('t'.force_encoding(Encoding::US_ASCII)).should == 1
206+
end
199207
end
200208

201209
describe "String#rindex with Regexp" do

spec/ruby/core/string/sub_spec.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,17 @@
214214
"ababa".sub(/(b)/, '\\\\\1').should == "a\\baba"
215215
end
216216

217+
it "handles a pattern in a superset encoding" do
218+
result = 'abc'.force_encoding(Encoding::US_ASCII).sub('é', 'è')
219+
result.should == 'abc'
220+
result.encoding.should == Encoding::US_ASCII
221+
end
222+
223+
it "handles a pattern in a subset encoding" do
224+
result = 'été'.sub('t'.force_encoding(Encoding::US_ASCII), 'u')
225+
result.should == 'éué'
226+
result.encoding.should == Encoding::UTF_8
227+
end
217228
end
218229

219230
describe "String#sub with pattern and block" do

src/main/java/org/truffleruby/core/string/StringNodes.java

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3774,7 +3774,7 @@ protected Object findStringByteIndex(Object rubyString, Object rubyPattern, int
37743774

37753775
}
37763776

3777-
@Primitive(name = "string_byte_character_index", lowerFixnum = 1)
3777+
@Primitive(name = "byte_index_to_character_index", lowerFixnum = 1)
37783778
public abstract static class StringByteCharacterIndexNode extends PrimitiveArrayArgumentsNode {
37793779
@Specialization
37803780
protected int byteIndexToCodePointIndex(Object string, int byteIndex,
@@ -3786,16 +3786,29 @@ protected int byteIndexToCodePointIndex(Object string, int byteIndex,
37863786
}
37873787
}
37883788

3789+
// Named 'string_byte_index' in Rubinius.
3790+
@Primitive(name = "character_index_to_byte_index", lowerFixnum = 1)
3791+
public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode {
3792+
@Specialization
3793+
protected Object byteIndexFromCharIndex(Object string, int characterIndex,
3794+
@Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode,
3795+
@Cached RubyStringLibrary libString) {
3796+
return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex,
3797+
libString.getTEncoding(string));
3798+
}
3799+
}
3800+
37893801
/** Search pattern in string starting after offset characters, and return a character index or nil */
3790-
@Primitive(name = "string_character_index", lowerFixnum = 2)
3802+
@Primitive(name = "string_character_index", lowerFixnum = 3)
37913803
public abstract static class StringCharacterIndexNode extends PrimitiveArrayArgumentsNode {
37923804

37933805
protected final RubyStringLibrary libString = RubyStringLibrary.create();
37943806
protected final RubyStringLibrary libPattern = RubyStringLibrary.create();
37953807
@Child SingleByteOptimizableNode singleByteOptimizableNode = SingleByteOptimizableNode.create();
37963808

37973809
@Specialization(guards = "singleByteOptimizableNode.execute(string, stringEncoding)")
3798-
protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, int codePointOffset,
3810+
protected Object singleByteOptimizable(
3811+
Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset,
37993812
@Bind("libString.getTString(rubyString)") AbstractTruffleString string,
38003813
@Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding,
38013814
@Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern,
@@ -3808,12 +3821,11 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in
38083821
// When single-byte optimizable, the byte length and the codepoint length are the same.
38093822
int stringByteLength = string.byteLength(stringEncoding.tencoding);
38103823

3811-
assert codePointOffset + pattern.byteLength(
3812-
patternEncoding.tencoding) <= stringByteLength : "already checked in the caller, String#index";
3824+
assert codePointOffset + pattern.byteLength(patternEncoding.tencoding) <= stringByteLength
3825+
: "already checked in the caller, String#index";
38133826

3814-
int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset,
3815-
stringByteLength,
3816-
stringEncoding.tencoding);
3827+
int found = byteIndexOfStringNode.execute(string, pattern, codePointOffset, stringByteLength,
3828+
compatibleEncoding.tencoding);
38173829

38183830
if (foundProfile.profile(found >= 0)) {
38193831
return found;
@@ -3823,7 +3835,8 @@ protected Object singleByteOptimizable(Object rubyString, Object rubyPattern, in
38233835
}
38243836

38253837
@Specialization(guards = "!singleByteOptimizableNode.execute(string, stringEncoding)")
3826-
protected Object multiByte(Object rubyString, Object rubyPattern, int codePointOffset,
3838+
protected Object multiByte(
3839+
Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int codePointOffset,
38273840
@Bind("libString.getTString(rubyString)") AbstractTruffleString string,
38283841
@Bind("libString.getEncoding(rubyString)") RubyEncoding stringEncoding,
38293842
@Bind("libPattern.getTString(rubyPattern)") AbstractTruffleString pattern,
@@ -3838,7 +3851,7 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO
38383851

38393852
int stringCodePointLength = codePointLengthNode.execute(string, stringEncoding.tencoding);
38403853
int found = indexOfStringNode.execute(string, pattern, codePointOffset, stringCodePointLength,
3841-
stringEncoding.tencoding);
3854+
compatibleEncoding.tencoding);
38423855

38433856
if (foundProfile.profile(found >= 0)) {
38443857
return found;
@@ -3849,11 +3862,12 @@ protected Object multiByte(Object rubyString, Object rubyPattern, int codePointO
38493862
}
38503863

38513864
/** Search pattern in string starting after offset bytes, and return a byte index or nil */
3852-
@Primitive(name = "string_byte_index", lowerFixnum = 2)
3865+
@Primitive(name = "string_byte_index", lowerFixnum = 3)
38533866
public abstract static class StringByteIndexNode extends PrimitiveArrayArgumentsNode {
38543867

38553868
@Specialization
3856-
protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byteOffset,
3869+
protected Object stringByteIndex(
3870+
Object rubyString, Object rubyPattern, RubyEncoding compatibleEncoding, int byteOffset,
38573871
@Cached RubyStringLibrary libString,
38583872
@Cached RubyStringLibrary libPattern,
38593873
@Cached TruffleString.ByteIndexOfStringNode byteIndexOfStringNode,
@@ -3862,18 +3876,17 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte
38623876
assert byteOffset >= 0;
38633877

38643878
var string = libString.getTString(rubyString);
3865-
var stringEncoding = libString.getEncoding(rubyString).tencoding;
3866-
int stringByteLength = string.byteLength(stringEncoding);
3879+
int stringByteLength = libString.byteLength(rubyString);
38673880

38683881
var pattern = libPattern.getTString(rubyPattern);
3869-
var patternEncoding = libPattern.getEncoding(rubyPattern).tencoding;
3870-
int patternByteLength = pattern.byteLength(patternEncoding);
3882+
int patternByteLength = libPattern.byteLength(rubyPattern);
38713883

38723884
if (indexOutOfBoundsProfile.profile(byteOffset + patternByteLength > stringByteLength)) {
38733885
return nil;
38743886
}
38753887

3876-
int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength, stringEncoding);
3888+
int found = byteIndexOfStringNode.execute(string, pattern, byteOffset, stringByteLength,
3889+
compatibleEncoding.tencoding);
38773890
if (foundProfile.profile(found >= 0)) {
38783891
return found;
38793892
}
@@ -3882,18 +3895,6 @@ protected Object stringByteIndex(Object rubyString, Object rubyPattern, int byte
38823895
}
38833896
}
38843897

3885-
// Named 'string_byte_index' in Rubinius.
3886-
@Primitive(name = "string_byte_index_from_char_index", lowerFixnum = 1)
3887-
public abstract static class StringByteIndexFromCharIndexNode extends PrimitiveArrayArgumentsNode {
3888-
@Specialization
3889-
protected Object byteIndexFromCharIndex(Object string, int characterIndex,
3890-
@Cached TruffleString.CodePointIndexToByteIndexNode codePointIndexToByteIndexNode,
3891-
@Cached RubyStringLibrary libString) {
3892-
return codePointIndexToByteIndexNode.execute(libString.getTString(string), 0, characterIndex,
3893-
libString.getTEncoding(string));
3894-
}
3895-
}
3896-
38973898
// Port of Rubinius's String::previous_byte_index.
38983899
//
38993900
// This method takes a byte index, finds the corresponding character the byte index belongs to, and then returns
@@ -3984,7 +3985,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff
39843985
assert byteOffset >= 0;
39853986

39863987
// Throw an exception if the encodings are not compatible.
3987-
checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern);
3988+
var compatibleEncoding = checkEncodingNode.executeCheckEncoding(rubyString, rubyPattern);
39883989

39893990
var string = libString.getTString(rubyString);
39903991
var stringEncoding = libString.getEncoding(rubyString).tencoding;
@@ -4007,7 +4008,7 @@ protected Object stringRindex(Object rubyString, Object rubyPattern, int byteOff
40074008
}
40084009

40094010
int result = lastByteIndexOfStringNode.execute(string, pattern, normalizedStart + patternByteLength, 0,
4010-
stringEncoding);
4011+
compatibleEncoding.tencoding);
40114012

40124013
if (result < 0) {
40134014
noMatchProfile.enter();

src/main/ruby/truffleruby/core/string.rb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,7 @@ def index(str, start=undefined)
10251025
if Primitive.object_kind_of?(str, Regexp)
10261026
Primitive.encoding_ensure_compatible self, str
10271027

1028-
start = Primitive.string_byte_index_from_char_index(self, start)
1028+
start = Primitive.character_index_to_byte_index(self, start)
10291029
if match = Truffle::RegexpOperations.match_from(str, self, start)
10301030
Primitive.regexp_last_match_set(Primitive.caller_special_variables, match)
10311031
return match.begin(0)
@@ -1038,11 +1038,11 @@ def index(str, start=undefined)
10381038
str = StringValue(str)
10391039
return start if str == ''
10401040

1041-
Primitive.encoding_ensure_compatible_str self, str
1041+
enc = Primitive.encoding_ensure_compatible_str self, str
10421042

10431043
return if start + str.size > size
10441044

1045-
Primitive.string_character_index(self, str, start)
1045+
Primitive.string_character_index(self, str, enc, start)
10461046
end
10471047

10481048
def initialize(other = undefined, capacity: nil, encoding: nil)
@@ -1064,7 +1064,7 @@ def rindex(sub, finish=undefined)
10641064
finish = size if finish >= size
10651065
end
10661066

1067-
byte_finish = Primitive.string_byte_index_from_char_index(self, finish)
1067+
byte_finish = Primitive.character_index_to_byte_index(self, finish)
10681068

10691069
if Primitive.object_kind_of?(sub, Regexp)
10701070
Primitive.encoding_ensure_compatible self, sub
@@ -1085,7 +1085,7 @@ def rindex(sub, finish=undefined)
10851085

10861086
Primitive.encoding_ensure_compatible_str self, needle
10871087
if byte_index = Primitive.find_string_reverse(self, needle, byte_finish)
1088-
return Primitive.string_byte_character_index(self, byte_index)
1088+
return Primitive.byte_index_to_character_index(self, byte_index)
10891089
end
10901090
end
10911091

src/main/ruby/truffleruby/core/truffle/regexp_operations.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def self.match(re, str, pos=0)
3737

3838
pos = pos < 0 ? pos + str.size : pos
3939
return nil if pos < 0 or pos > str.size
40-
pos = Primitive.string_byte_index_from_char_index(str, pos)
40+
pos = Primitive.character_index_to_byte_index(str, pos)
4141

4242
search_region(re, str, pos, str.bytesize, true, true)
4343
end
@@ -50,7 +50,7 @@ def self.match?(re, str, pos=0)
5050

5151
pos = pos < 0 ? pos + str.size : pos
5252
return false if pos < 0 or pos > str.size
53-
pos = Primitive.string_byte_index_from_char_index(str, pos)
53+
pos = Primitive.character_index_to_byte_index(str, pos)
5454

5555
search_region(re, str, pos, str.bytesize, true, false)
5656
end

src/main/ruby/truffleruby/core/truffle/string_operations.rb

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ def self.gsub_regexp_matches(global, orig, pattern)
105105
def self.gsub_string_matches(global, orig, pattern)
106106
res = []
107107
offset = 0
108-
while index = byte_index(orig, pattern, offset)
108+
enc = Primitive.encoding_ensure_compatible_str orig, pattern
109+
110+
while index = byte_index(orig, pattern, enc, offset)
109111
match = Primitive.matchdata_create_single_group(pattern, orig.dup, index, index + pattern.bytesize)
110112
res << match
111113
break unless global
@@ -285,7 +287,7 @@ def self.validate_case_mapping_options(options, downcasing)
285287
end
286288

287289
# MRI: rb_str_byteindex_m
288-
def self.byte_index(src, str, start=0)
290+
def self.byte_index(src, str, enc, start = 0)
289291
start += src.bytesize if start < 0
290292
if start < 0 or start > src.bytesize
291293
Primitive.regexp_last_match_set(Primitive.caller_special_variables, nil) if Primitive.object_kind_of?(str, Regexp)
@@ -294,9 +296,7 @@ def self.byte_index(src, str, start=0)
294296

295297
return start if str == ''
296298

297-
Primitive.encoding_ensure_compatible_str src, str
298-
299-
Primitive.string_byte_index(src, str, start)
299+
Primitive.string_byte_index(src, str, enc, start)
300300
end
301301

302302
def self.subpattern(string, pattern, capture)
@@ -320,7 +320,7 @@ def self.assign_index(string, index, count, replacement)
320320
raise IndexError, "index #{index} out of string"
321321
end
322322

323-
unless bi = Primitive.string_byte_index_from_char_index(string, index)
323+
unless bi = Primitive.character_index_to_byte_index(string, index)
324324
raise IndexError, "unable to find character at: #{index}"
325325
end
326326

@@ -335,10 +335,10 @@ def self.assign_index(string, index, count, replacement)
335335
if total >= string.size
336336
bs = string.bytesize - bi
337337
else
338-
bs = Primitive.string_byte_index_from_char_index(string, total) - bi
338+
bs = Primitive.character_index_to_byte_index(string, total) - bi
339339
end
340340
else
341-
bs = index == string.size ? 0 : Primitive.string_byte_index_from_char_index(string, index + 1) - bi
341+
bs = index == string.size ? 0 : Primitive.character_index_to_byte_index(string, index + 1) - bi
342342
end
343343

344344
replacement = StringValue replacement
@@ -364,15 +364,15 @@ def self.assign_range(string, index, replacement)
364364

365365
raise RangeError, "#{index.first} is out of range" if start < 0 or start > string.size
366366

367-
bi = Primitive.string_byte_index_from_char_index(string, start)
367+
bi = Primitive.character_index_to_byte_index(string, start)
368368
raise IndexError, "unable to find character at: #{start}" unless bi
369369

370370
if stop < start
371371
bs = 0
372372
elsif stop >= string.size
373373
bs = string.bytesize - bi
374374
else
375-
bs = Primitive.string_byte_index_from_char_index(string, stop + 1) - bi
375+
bs = Primitive.character_index_to_byte_index(string, stop + 1) - bi
376376
end
377377

378378
replacement = StringValue replacement
@@ -406,8 +406,8 @@ def self.assign_regexp(string, index, count, replacement)
406406
replacement = StringValue replacement
407407
enc = Primitive.encoding_ensure_compatible_str string, replacement
408408

409-
bi = Primitive.string_byte_index_from_char_index(string, match.begin(count))
410-
bs = Primitive.string_byte_index_from_char_index(string, match.end(count)) - bi
409+
bi = Primitive.character_index_to_byte_index(string, match.begin(count))
410+
bs = Primitive.character_index_to_byte_index(string, match.end(count)) - bi
411411

412412
Primitive.string_splice(string, replacement, bi, bs, enc)
413413
end

0 commit comments

Comments
 (0)