Skip to content

Commit d50b9a8

Browse files
authored
Optimize LineScanner newline detection and fix backwards position bug (#2342)
- Replaced RegExp reliance in `LineScanner` (such as `_newlinesIn`) with manual code unit iteration to avoid unnecessary allocations and scanning overhead. - Improved performance in core loop methods (`readChar`, `scanChar`, `scan`) which directly cascades to `SpanScanner`. - Fixed a `RangeError` bug when setting `position` backwards across a `\r\n` boundary by completely rewriting the bounded boundary-checking loop. - Added a benchmark script to measure raw throughput improvements. - Added comprehensive test coverage in `line_scanner_test.dart` for backwards boundary traversal. ### Performance Comparison | Benchmark Target | Before (us) | After (us) | Change (%) | Status | | :--- | :--- | :--- | :--- | :--- | | **StringScanner readChar** | 16,790 | 16,430 | -2.1% | Minor Improv. | | **LineScanner readChar** | 57,328 | 55,491 | -3.2% | Minor Improv. | | **SpanScanner readChar** | 49,400 | 50,158 | +1.5% | Slight Regression | | **StringScanner scan** | 1,103,350 | 1,102,107 | -0.1% | Negligible | | **LineScanner scan** | 2,185,230 | 1,338,592 | **-38.7%** | **Major Win** | | **SpanScanner scan** | 1,209,363 | 1,198,558 | -0.9% | Minor Improv. |
1 parent 5d5b09f commit d50b9a8

File tree

4 files changed

+204
-66
lines changed

4 files changed

+204
-66
lines changed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2+
// for details. All rights reserved. Use of this source code is governed by a
3+
// BSD-style license that can be found in the LICENSE file.
4+
5+
import 'package:benchmark_harness/benchmark_harness.dart';
6+
import 'package:string_scanner/string_scanner.dart';
7+
8+
final _string = 'This is a test string with some typical content.\n' * 50000;
9+
final _word = RegExp(r'\w+');
10+
final _space = RegExp(r'\s+');
11+
12+
class StringScannerReadCharBenchmark extends BenchmarkBase {
13+
StringScannerReadCharBenchmark() : super('StringScanner readChar');
14+
15+
@override
16+
void run() {
17+
final scanner = StringScanner(_string);
18+
while (!scanner.isDone) {
19+
scanner.readChar();
20+
}
21+
}
22+
}
23+
24+
class LineScannerReadCharBenchmark extends BenchmarkBase {
25+
LineScannerReadCharBenchmark() : super('LineScanner readChar');
26+
27+
@override
28+
void run() {
29+
final scanner = LineScanner(_string);
30+
while (!scanner.isDone) {
31+
scanner.readChar();
32+
}
33+
}
34+
}
35+
36+
class SpanScannerReadCharBenchmark extends BenchmarkBase {
37+
SpanScannerReadCharBenchmark() : super('SpanScanner readChar');
38+
39+
@override
40+
void run() {
41+
final scanner = SpanScanner(_string);
42+
while (!scanner.isDone) {
43+
scanner.readChar();
44+
}
45+
}
46+
}
47+
48+
class StringScannerScanBenchmark extends BenchmarkBase {
49+
StringScannerScanBenchmark() : super('StringScanner scan');
50+
51+
@override
52+
void run() {
53+
final scanner = StringScanner(_string);
54+
while (!scanner.isDone) {
55+
if (!scanner.scan(_word) &&
56+
!scanner.scanChar(10) &&
57+
!scanner.scan(_space)) {
58+
scanner.readChar();
59+
}
60+
}
61+
}
62+
}
63+
64+
class LineScannerScanBenchmark extends BenchmarkBase {
65+
LineScannerScanBenchmark() : super('LineScanner scan');
66+
67+
@override
68+
void run() {
69+
final scanner = LineScanner(_string);
70+
while (!scanner.isDone) {
71+
if (!scanner.scan(_word) &&
72+
!scanner.scanChar(10) &&
73+
!scanner.scan(_space)) {
74+
scanner.readChar();
75+
}
76+
}
77+
}
78+
}
79+
80+
class SpanScannerScanBenchmark extends BenchmarkBase {
81+
SpanScannerScanBenchmark() : super('SpanScanner scan');
82+
83+
@override
84+
void run() {
85+
final scanner = SpanScanner(_string);
86+
while (!scanner.isDone) {
87+
if (!scanner.scan(_word) &&
88+
!scanner.scanChar(10) &&
89+
!scanner.scan(_space)) {
90+
scanner.readChar();
91+
}
92+
}
93+
}
94+
}
95+
96+
void main() {
97+
print('String length: ${_string.length}');
98+
StringScannerReadCharBenchmark().report();
99+
LineScannerReadCharBenchmark().report();
100+
SpanScannerReadCharBenchmark().report();
101+
StringScannerScanBenchmark().report();
102+
LineScannerScanBenchmark().report();
103+
SpanScannerScanBenchmark().report();
104+
}

pkgs/string_scanner/lib/src/line_scanner.dart

Lines changed: 92 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@ import 'utils.dart';
88

99
// Note that much of this code is duplicated in eager_span_scanner.dart.
1010

11-
/// A regular expression matching newlines. A newline is either a `\n`, a `\r\n`
12-
/// or a `\r` that is not immediately followed by a `\n`.
13-
final _newlineRegExp = RegExp(r'\n|\r\n|\r(?!\n)');
14-
1511
/// A subclass of [StringScanner] that tracks line and column information.
1612
class LineScanner extends StringScanner {
1713
/// The scanner's current (zero-based) line number.
@@ -32,10 +28,6 @@ class LineScanner extends StringScanner {
3228
LineScannerState get state =>
3329
LineScannerState._(this, position, line, column);
3430

35-
/// Whether the current position is between a CR character and an LF
36-
/// charactet.
37-
bool get _betweenCRLF => peekChar(-1) == $cr && peekChar() == $lf;
38-
3931
set state(LineScannerState state) {
4032
if (!identical(state._scanner, this)) {
4133
throw ArgumentError('The given LineScannerState was not returned by '
@@ -60,45 +52,68 @@ class LineScanner extends StringScanner {
6052
_line = 0;
6153
_column = 0;
6254
} else if (newPosition > oldPosition) {
63-
final newlines = _newlinesIn(string.substring(oldPosition, newPosition),
64-
endPosition: newPosition);
65-
_line += newlines.length;
66-
if (newlines.isEmpty) {
55+
var newlines = 0;
56+
var lastNewlineEnd = -1;
57+
for (var i = oldPosition; i < newPosition; i++) {
58+
final char = string.codeUnitAt(i);
59+
if (char == $lf) {
60+
newlines++;
61+
lastNewlineEnd = i + 1;
62+
} else if (char == $cr) {
63+
final nextIsLf =
64+
(i + 1 < newPosition && string.codeUnitAt(i + 1) == $lf) ||
65+
(i + 1 == newPosition &&
66+
newPosition < string.length &&
67+
string.codeUnitAt(newPosition) == $lf);
68+
if (!nextIsLf) {
69+
newlines++;
70+
lastNewlineEnd = i + 1;
71+
}
72+
}
73+
}
74+
_line += newlines;
75+
if (newlines == 0) {
6776
_column += newPosition - oldPosition;
6877
} else {
69-
// The regex got a substring, so we need to account for where it started
70-
// in the string.
71-
final offsetOfLastNewline = oldPosition + newlines.last.end;
72-
_column = newPosition - offsetOfLastNewline;
78+
_column = newPosition - lastNewlineEnd;
79+
}
80+
} else {
81+
var newlines = 0;
82+
for (var i = newPosition; i < oldPosition; i++) {
83+
final char = string.codeUnitAt(i);
84+
if (char == $lf) {
85+
newlines++;
86+
} else if (char == $cr) {
87+
if (i + 1 < oldPosition) {
88+
if (string.codeUnitAt(i + 1) != $lf) newlines++;
89+
} else {
90+
// i + 1 == oldPosition
91+
if (oldPosition >= string.length ||
92+
string.codeUnitAt(oldPosition) != $lf) {
93+
newlines++;
94+
}
95+
}
96+
}
7397
}
74-
} else if (newPosition < oldPosition) {
75-
final newlines = _newlinesIn(string.substring(newPosition, oldPosition),
76-
endPosition: oldPosition);
98+
_line -= newlines;
7799

78-
_line -= newlines.length;
79-
if (newlines.isEmpty) {
100+
if (newlines == 0) {
80101
_column -= oldPosition - newPosition;
81102
} else {
82-
// To compute the new column, we need to locate the last newline before
83-
// the new position. When searching, we must exclude the CR if we're
84-
// between a CRLF because it's not considered a newline.
85-
final crOffset = _betweenCRLF ? -1 : 0;
86-
// Additionally, if we use newPosition as the end of the search and the
87-
// character at that position itself (the next character) is a newline
88-
// we should not use it, so also offset to account for that.
89-
const currentCharOffset = -1;
90-
final lastNewline = string.lastIndexOf(
91-
_newlineRegExp, newPosition + currentCharOffset + crOffset);
92-
93-
// Now we need to know the offset after the newline. This is the index
94-
// above plus the length of the newline (eg. if we found `\r\n`) we need
95-
// to add two. However if no newline was found, that index is 0.
96-
final offsetAfterLastNewline = lastNewline == -1
97-
? 0
98-
: string[lastNewline] == '\r' && string[lastNewline + 1] == '\n'
99-
? lastNewline + 2
100-
: lastNewline + 1;
101-
103+
var offsetAfterLastNewline = 0;
104+
for (var i = newPosition - 1; i >= 0; i--) {
105+
final char = string.codeUnitAt(i);
106+
if (char == $lf) {
107+
offsetAfterLastNewline = i + 1;
108+
break;
109+
} else if (char == $cr) {
110+
if (i + 1 < string.length && string.codeUnitAt(i + 1) == $lf) {
111+
continue;
112+
}
113+
offsetAfterLastNewline = i + 1;
114+
break;
115+
}
116+
}
102117
_column = newPosition - offsetAfterLastNewline;
103118
}
104119
}
@@ -122,9 +137,16 @@ class LineScanner extends StringScanner {
122137

123138
/// Adjusts [_line] and [_column] after having consumed [character].
124139
void _adjustLineAndColumn(int character) {
125-
if (character == $lf || (character == $cr && peekChar() != $lf)) {
140+
if (character == $lf) {
126141
_line += 1;
127142
_column = 0;
143+
} else if (character == $cr) {
144+
if (position < string.length && string.codeUnitAt(position) == $lf) {
145+
_column += 1;
146+
} else {
147+
_line += 1;
148+
_column = 0;
149+
}
128150
} else {
129151
_column += inSupplementaryPlane(character) ? 2 : 1;
130152
}
@@ -134,35 +156,39 @@ class LineScanner extends StringScanner {
134156
bool scan(Pattern pattern) {
135157
if (!super.scan(pattern)) return false;
136158

137-
final newlines = _newlinesIn(lastMatch![0]!, endPosition: position);
138-
_line += newlines.length;
139-
if (newlines.isEmpty) {
140-
_column += lastMatch![0]!.length;
159+
final match = lastMatch![0]!;
160+
var newlines = 0;
161+
var lastNewlineEnd = -1;
162+
for (var i = 0; i < match.length; i++) {
163+
final char = match.codeUnitAt(i);
164+
if (char == $lf) {
165+
newlines++;
166+
lastNewlineEnd = i + 1;
167+
} else if (char == $cr) {
168+
if (i + 1 < match.length) {
169+
if (match.codeUnitAt(i + 1) != $lf) {
170+
newlines++;
171+
lastNewlineEnd = i + 1;
172+
}
173+
} else {
174+
// i + 1 == match.length
175+
if (position >= string.length || string.codeUnitAt(position) != $lf) {
176+
newlines++;
177+
lastNewlineEnd = i + 1;
178+
}
179+
}
180+
}
181+
}
182+
183+
_line += newlines;
184+
if (newlines == 0) {
185+
_column += match.length;
141186
} else {
142-
_column = lastMatch![0]!.length - newlines.last.end;
187+
_column = match.length - lastNewlineEnd;
143188
}
144189

145190
return true;
146191
}
147-
148-
/// Returns a list of [Match]es describing all the newlines in [text], which
149-
/// ends at [endPosition].
150-
///
151-
/// If [text] ends with `\r`, it will only be treated as a newline if the next
152-
/// character at [position] is not a `\n`.
153-
List<Match> _newlinesIn(String text, {required int endPosition}) {
154-
final newlines = _newlineRegExp.allMatches(text).toList();
155-
// If the last character is a `\r` it will have been treated as a newline,
156-
// but this is only valid if the next character is not a `\n`.
157-
if (endPosition < string.length &&
158-
text.endsWith('\r') &&
159-
string[endPosition] == '\n') {
160-
// newlines should never be empty here, because if `text` ends with `\r`
161-
// it would have matched `\r(?!\n)` in the newline regex.
162-
newlines.removeLast();
163-
}
164-
return newlines;
165-
}
166192
}
167193

168194
/// A class representing the state of a [LineScanner].

pkgs/string_scanner/pubspec.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@ dependencies:
1111
source_span: ^1.8.0
1212

1313
dev_dependencies:
14+
benchmark_harness: ^2.2.2
1415
dart_flutter_team_lints: ^3.0.0
1516
test: ^1.16.6

pkgs/string_scanner/test/line_scanner_test.dart

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,13 @@ void main() {
430430
expect(scanner.column, equals(1));
431431
});
432432

433+
test('backward from between CR LF to before CR LF', () {
434+
scanner.expect('foo\nbar\r');
435+
scanner.position = 1; // "f"
436+
expect(scanner.line, equals(0));
437+
expect(scanner.column, equals(1));
438+
});
439+
433440
test('backward to after CR LF', () {
434441
scanner.expect('foo\nbar\r\nbaz');
435442
scanner.position = 9; // "foo\nbar\r\n"

0 commit comments

Comments
 (0)