Skip to content

Commit fa612d1

Browse files
authored
retain code-range upon string extraction + align with strscan.c (#186)
Added an extra step when creating a substring from `scanner.string`: the code range is now retained when possible. Also, there was an attempt to raise `"regexp buffer overflow"` upon `Matcher.INTERRUPTED` (-2), which isn't possible because `RubyRegexp#matcherMatch/matcherSearch` [handle the interrupt return](https://github.com/jruby/jruby/blob/10.0.2.0/core/src/main/java/org/jruby/RubyRegexp.java#L236-L257). A follow-up to #185.
1 parent 7823b93 commit fa612d1

File tree

1 file changed

+37
-22
lines changed

1 file changed

+37
-22
lines changed

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,9 @@ public IRubyObject set_string(ThreadContext context, IRubyObject str) {
203203
}
204204

205205
@JRubyMethod(name = {"concat", "<<"})
206-
public IRubyObject concat(ThreadContext context, IRubyObject obj) {
206+
public IRubyObject concat(ThreadContext context, IRubyObject str) {
207207
check(context);
208-
str.append(obj.convertToString());
208+
this.str.append(RubyString.stringValue(str));
209209
return this;
210210
}
211211

@@ -261,7 +261,7 @@ private IRubyObject extractBegLen(Ruby runtime, int beg, int len) {
261261
}
262262

263263
// MRI: strscan_do_scan
264-
private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succptr, boolean getstr, boolean headonly) {
264+
private IRubyObject scan(ThreadContext context, IRubyObject pattern, boolean succptr, boolean getstr, boolean headonly) {
265265
final Ruby runtime = context.runtime;
266266
check(context);
267267
clearMatchStatus();
@@ -274,12 +274,12 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp
274274
ByteList strBL = str.getByteList();
275275
int currPtr = strBL.getBegin() + curr;
276276

277-
if (regex instanceof RubyRegexp) {
278-
pattern = ((RubyRegexp) regex).preparePattern(str);
277+
if (pattern instanceof RubyRegexp) {
278+
this.pattern = ((RubyRegexp) pattern).preparePattern(str);
279279

280280
int range = currPtr + restLen;
281281

282-
Matcher matcher = pattern.matcher(strBL.getUnsafeBytes(), matchTarget(), range);
282+
Matcher matcher = this.pattern.matcher(strBL.getUnsafeBytes(), matchTarget(), range);
283283
final int ret;
284284
if (headonly) {
285285
ret = RubyRegexp.matcherMatch(context, matcher, currPtr, range, Option.NONE);
@@ -294,26 +294,28 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp
294294
regs = matchRegion;
295295
}
296296

297-
if (ret == -2) {
298-
throw runtime.newRaiseException((RubyClass) getMetaClass().getConstant("ScanError"), "regexp buffer overflow");
297+
if (ret < 0) { // MISMATCH
298+
return context.nil;
299299
}
300-
if (ret < 0) return context.nil;
301300
} else {
302-
RubyString pattern = regex.convertToString();
303-
Encoding patternEnc = str.checkEncoding(pattern);
304-
ByteList patternBL = pattern.getByteList();
305-
int patternSize = patternBL.realSize();
301+
RubyString patternStr = RubyString.stringValue(pattern);
302+
ByteList patternBL = patternStr.getByteList();
303+
final int patternSize = patternBL.realSize();
304+
305+
if (restLen < patternSize) {
306+
str.checkEncoding(patternStr);
307+
return context.nil;
308+
}
306309

307310
if (headonly) {
308-
if (restLen < pattern.size()) {
309-
return context.nil;
310-
}
311+
str.checkEncoding(patternStr);
312+
311313
if (ByteList.memcmp(strBL.unsafeBytes(), currPtr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) {
312314
return context.nil;
313315
}
314316
setRegisters(0, patternSize);
315317
} else {
316-
int pos = StringSupport.index(strBL, patternBL, currPtr, patternEnc);
318+
int pos = StringSupport.index(strBL, patternBL, currPtr, str.checkEncoding(patternStr));
317319
if (pos == -1) {
318320
return context.nil;
319321
}
@@ -705,7 +707,7 @@ public IRubyObject op_aref(ThreadContext context, IRubyObject idx) {
705707

706708
if (idx instanceof RubySymbol || idx instanceof RubyString) {
707709
if (pattern == null) {
708-
throw runtime.newRaiseException((RubyClass) getMetaClass().getConstant("IndexError"), "undefined group name reference: " + idx);
710+
throw runtime.newRaiseException(runtime.getIndexError(), "undefined group name reference: " + idx);
709711
}
710712
}
711713

@@ -943,12 +945,25 @@ public IRubyObject values_at(ThreadContext context, IRubyObject index1, IRubyObj
943945

944946
// MRI: str_new
945947
private RubyString newString(Ruby runtime, int start, int length) {
946-
ByteList byteList = str.getByteList();
947-
int begin = byteList.begin();
948+
final ByteList strBytes = this.str.getByteList();
949+
ByteList newBytes = new ByteList(strBytes.unsafeBytes(), strBytes.begin() + start, length, true);
948950

949-
ByteList newByteList = new ByteList(byteList.unsafeBytes(), begin + start, length, byteList.getEncoding(), true);
951+
final RubyString newStr = RubyString.newString(runtime, newBytes, strBytes.getEncoding());
952+
copyCodeRangeForSubstr(newStr, this.str);
953+
return newStr;
954+
}
950955

951-
return RubyString.newString(runtime, newByteList);
956+
/**
957+
* Same as JRuby's (private) <code>RubyString#copyCodeRangeForSubstr</code>.
958+
* Isn't really necessary, but will avoid extra code-range scans for the substrings returned.
959+
*/
960+
private void copyCodeRangeForSubstr(RubyString str, RubyString from) {
961+
if (str.size() == 0) {
962+
str.setCodeRange(from.getEncoding().isAsciiCompatible() ? StringSupport.CR_7BIT : StringSupport.CR_VALID);
963+
} else {
964+
if (from.getCodeRange() == StringSupport.CR_7BIT) str.setCodeRange(StringSupport.CR_7BIT);
965+
// otherwise, leave it as CR_UNKNOWN
966+
}
952967
}
953968

954969
/**

0 commit comments

Comments
 (0)