Skip to content

Commit a4f0600

Browse files
committed
[GR-19220] Fix String#split (#2715)
PullRequest: truffleruby/3478
2 parents 76dfee9 + 69e3c2e commit a4f0600

File tree

3 files changed

+103
-56
lines changed

3 files changed

+103
-56
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Bug fixes:
1414
* Fix `File::Stat`'s `#executable?` and `#executable_real?` predicates that unconditionally returned `true` for a superuser (#2690, @andrykonchin).
1515
* The `strip` option `--keep-section=.llvmbc` is not supported on macOS (#2697, @eregon).
1616
* Disallow the marshaling of polyglot exceptions since we can't properly reconstruct them (@nirvdrum).
17+
* Fix `String#split` missing a value in its return array when called with a pattern of `" "` and a _limit_ value > 0 on a string with trailing whitespace where the limit hasn't been met (@nirvdrum).
1718

1819
Compatibility:
1920

spec/ruby/core/string/split_spec.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,35 @@
2929
"1,2,,3,4,,".split(',').should == ["1", "2", "", "3", "4"]
3030
"1,2,,3,4,,".split(',', 0).should == ["1", "2", "", "3", "4"]
3131
" a b c\nd ".split(" ").should == ["", "a", "b", "c\nd"]
32+
" a あ c\nd ".split(" ").should == ["", "a", "あ", "c\nd"]
3233
"hai".split("hai").should == []
3334
",".split(",").should == []
3435
",".split(",", 0).should == []
36+
"あ".split("あ").should == []
37+
"あ".split("あ", 0).should == []
38+
end
39+
40+
it "does not suppress trailing empty fields when a positive limit is given" do
41+
" 1 2 ".split(" ", 2).should == ["1", "2 "]
42+
" 1 2 ".split(" ", 3).should == ["1", "2", ""]
43+
" 1 2 ".split(" ", 4).should == ["1", "2", ""]
44+
" 1 あ ".split(" ", 2).should == ["1", "あ "]
45+
" 1 あ ".split(" ", 3).should == ["1", "あ", ""]
46+
" 1 あ ".split(" ", 4).should == ["1", "あ", ""]
47+
48+
"1,2,".split(',', 2).should == ["1", "2,"]
49+
"1,2,".split(',', 3).should == ["1", "2", ""]
50+
"1,2,".split(',', 4).should == ["1", "2", ""]
51+
"1,あ,".split(',', 2).should == ["1", "あ,"]
52+
"1,あ,".split(',', 3).should == ["1", "あ", ""]
53+
"1,あ,".split(',', 4).should == ["1", "あ", ""]
54+
55+
"1 2 ".split(/ /, 2).should == ["1", "2 "]
56+
"1 2 ".split(/ /, 3).should == ["1", "2", ""]
57+
"1 2 ".split(/ /, 4).should == ["1", "2", ""]
58+
"1 あ ".split(/ /, 2).should == ["1", "あ "]
59+
"1 あ ".split(/ /, 3).should == ["1", "あ", ""]
60+
"1 あ ".split(/ /, 4).should == ["1", "あ", ""]
3561
end
3662

3763
it "returns an array with one entry if limit is 1: the original string" do

src/main/java/org/truffleruby/core/string/StringNodes.java

Lines changed: 76 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,12 @@
7171
import static org.truffleruby.core.string.StringSupport.MBCLEN_INVALID_P;
7272
import static org.truffleruby.core.string.StringSupport.MBCLEN_NEEDMORE_P;
7373

74+
import com.oracle.truffle.api.TruffleSafepoint;
7475
import com.oracle.truffle.api.dsl.Bind;
7576
import com.oracle.truffle.api.dsl.Cached.Exclusive;
7677
import com.oracle.truffle.api.dsl.Cached.Shared;
7778
import com.oracle.truffle.api.dsl.GenerateNodeFactory;
79+
import com.oracle.truffle.api.profiles.LoopConditionProfile;
7880
import com.oracle.truffle.api.strings.AbstractTruffleString;
7981
import com.oracle.truffle.api.strings.InternalByteArray;
8082
import com.oracle.truffle.api.strings.MutableTruffleString;
@@ -3138,9 +3140,10 @@ public abstract static class StringAwkSplitPrimitiveNode extends PrimitiveArrayA
31383140
@Child GetByteCodeRangeNode codeRangeNode = GetByteCodeRangeNode.create();
31393141

31403142
private static final int SUBSTRING_CREATED = -1;
3143+
private static final int DEFAULT_SPLIT_VALUES_SIZE = 10;
31413144

31423145
@Specialization(guards = "is7Bit(tstring, encoding, codeRangeNode)")
3143-
protected Object stringAwkSplitSingleByte(Object string, int limit, Object block,
3146+
protected Object stringAwkSplitAsciiOnly(Object string, int limit, Object block,
31443147
@Cached RubyStringLibrary strings,
31453148
@Cached ConditionProfile executeBlockProfile,
31463149
@Cached ConditionProfile growArrayProfile,
@@ -3149,42 +3152,52 @@ protected Object stringAwkSplitSingleByte(Object string, int limit, Object block
31493152
@Cached TruffleString.MaterializeNode materializeNode,
31503153
@Cached TruffleString.ReadByteNode readByteNode,
31513154
@Cached TruffleString.SubstringByteIndexNode substringNode,
3155+
@Cached LoopConditionProfile loopProfile,
31523156
@Bind("strings.getTString(string)") AbstractTruffleString tstring,
31533157
@Bind("strings.getEncoding(string)") RubyEncoding encoding) {
3154-
Object[] ret = new Object[10];
3158+
int retSize = limit > 0 && limit < DEFAULT_SPLIT_VALUES_SIZE ? limit : DEFAULT_SPLIT_VALUES_SIZE;
3159+
Object[] ret = new Object[retSize];
31553160
int storeIndex = 0;
31563161

31573162
int byteLength = tstring.byteLength(encoding.tencoding);
31583163
materializeNode.execute(tstring, encoding.tencoding);
31593164

31603165
int substringStart = 0;
31613166
boolean findingSubstringEnd = false;
3162-
for (int i = 0; i < byteLength; i++) {
3163-
if (StringSupport.isAsciiSpace(readByteNode.execute(tstring, i, encoding.tencoding))) {
3164-
if (findingSubstringEnd) {
3165-
findingSubstringEnd = false;
3166-
3167-
final RubyString substring = createSubString(substringNode, tstring, encoding, substringStart,
3168-
i - substringStart);
3169-
ret = addSubstring(
3170-
ret,
3171-
storeIndex++,
3172-
substring,
3173-
block,
3174-
executeBlockProfile,
3175-
growArrayProfile);
3176-
substringStart = SUBSTRING_CREATED;
3177-
}
3178-
} else {
3179-
if (!findingSubstringEnd) {
3180-
substringStart = i;
3181-
findingSubstringEnd = true;
31823167

3183-
if (storeIndex == limit - 1) {
3184-
break;
3168+
int i = 0;
3169+
try {
3170+
for (; loopProfile.inject(i < byteLength); i++) {
3171+
if (StringSupport.isAsciiSpace(readByteNode.execute(tstring, i, encoding.tencoding))) {
3172+
if (findingSubstringEnd) {
3173+
findingSubstringEnd = false;
3174+
3175+
final RubyString substring = createSubString(substringNode, tstring, encoding,
3176+
substringStart, i - substringStart);
3177+
ret = addSubstring(
3178+
ret,
3179+
storeIndex++,
3180+
substring,
3181+
block,
3182+
executeBlockProfile,
3183+
growArrayProfile);
3184+
substringStart = SUBSTRING_CREATED;
3185+
}
3186+
} else {
3187+
if (!findingSubstringEnd) {
3188+
substringStart = i;
3189+
findingSubstringEnd = true;
3190+
3191+
if (storeIndex == limit - 1) {
3192+
break;
3193+
}
31853194
}
31863195
}
3196+
3197+
TruffleSafepoint.poll(this);
31873198
}
3199+
} finally {
3200+
profileAndReportLoopCount(loopProfile, i);
31883201
}
31893202

31903203
if (trailingSubstringProfile.profile(findingSubstringEnd)) {
@@ -3193,7 +3206,7 @@ protected Object stringAwkSplitSingleByte(Object string, int limit, Object block
31933206
ret = addSubstring(ret, storeIndex++, substring, block, executeBlockProfile, growArrayProfile);
31943207
}
31953208

3196-
if (trailingEmptyStringProfile.profile(limit < 0 &&
3209+
if (trailingEmptyStringProfile.profile((limit < 0 || storeIndex < limit) &&
31973210
StringSupport.isAsciiSpace(readByteNode.execute(tstring, byteLength - 1, encoding.tencoding)))) {
31983211
final RubyString substring = createSubString(substringNode, tstring, encoding, byteLength - 1, 0);
31993212
ret = addSubstring(ret, storeIndex++, substring, block, executeBlockProfile, growArrayProfile);
@@ -3215,9 +3228,11 @@ protected Object stringAwkSplit(Object string, int limit, Object block,
32153228
@Cached CreateCodePointIteratorNode createCodePointIteratorNode,
32163229
@Cached TruffleStringIterator.NextNode nextNode,
32173230
@Cached TruffleString.SubstringByteIndexNode substringNode,
3231+
@Cached LoopConditionProfile loopProfile,
32183232
@Bind("strings.getTString(string)") AbstractTruffleString tstring,
32193233
@Bind("strings.getEncoding(string)") RubyEncoding encoding) {
3220-
Object[] ret = new Object[10];
3234+
int retSize = limit > 0 && limit < DEFAULT_SPLIT_VALUES_SIZE ? limit : DEFAULT_SPLIT_VALUES_SIZE;
3235+
Object[] ret = new Object[retSize];
32213236
int storeIndex = 0;
32223237

32233238
final boolean limitPositive = limit > 0;
@@ -3229,40 +3244,45 @@ protected Object stringAwkSplit(Object string, int limit, Object block,
32293244
var iterator = createCodePointIteratorNode.execute(tstring, tencoding, ErrorHandling.RETURN_NEGATIVE);
32303245

32313246
boolean skip = true;
3232-
int e = 0, b = 0;
3233-
while (iterator.hasNext()) {
3234-
int c = nextNode.execute(iterator);
3235-
int p = iterator.getByteIndex();
3236-
3237-
if (skip) {
3238-
if (StringSupport.isAsciiSpace(c)) {
3239-
b = p;
3240-
} else {
3241-
e = p;
3242-
skip = false;
3243-
if (limitPositive && limit <= i) {
3244-
break;
3245-
}
3246-
}
3247-
} else {
3248-
if (StringSupport.isAsciiSpace(c)) {
3249-
var substring = createSubString(substringNode, tstring, encoding, b, e - b);
3250-
ret = addSubstring(
3251-
ret,
3252-
storeIndex++,
3253-
substring,
3254-
block,
3255-
executeBlockProfile,
3256-
growArrayProfile);
3257-
skip = true;
3258-
b = p;
3259-
if (limitPositive) {
3260-
i++;
3247+
int e = 0, b = 0, iterations = 0;
3248+
try {
3249+
while (loopProfile.inject(iterator.hasNext())) {
3250+
int c = nextNode.execute(iterator);
3251+
int p = iterator.getByteIndex();
3252+
iterations++;
3253+
3254+
if (skip) {
3255+
if (StringSupport.isAsciiSpace(c)) {
3256+
b = p;
3257+
} else {
3258+
e = p;
3259+
skip = false;
3260+
if (limitPositive && limit <= i) {
3261+
break;
3262+
}
32613263
}
32623264
} else {
3263-
e = p;
3265+
if (StringSupport.isAsciiSpace(c)) {
3266+
var substring = createSubString(substringNode, tstring, encoding, b, e - b);
3267+
ret = addSubstring(
3268+
ret,
3269+
storeIndex++,
3270+
substring,
3271+
block,
3272+
executeBlockProfile,
3273+
growArrayProfile);
3274+
skip = true;
3275+
b = p;
3276+
if (limitPositive) {
3277+
i++;
3278+
}
3279+
} else {
3280+
e = p;
3281+
}
32643282
}
32653283
}
3284+
} finally {
3285+
profileAndReportLoopCount(loopProfile, iterations);
32663286
}
32673287

32683288
if (trailingSubstringProfile.profile(len > 0 && (limitPositive || len > b || limit < 0))) {

0 commit comments

Comments
 (0)