Skip to content

Commit c1094e0

Browse files
author
jneen
committed
implement #scan(Int), #skip(Int), and #check(Int)
1 parent bb8a1a5 commit c1094e0

File tree

2 files changed

+167
-19
lines changed

2 files changed

+167
-19
lines changed

spec/std/string_scanner_spec.cr

Lines changed: 108 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ describe StringScanner do
99
s.scan(' ').should eq(" ")
1010
s.scan("is ").should eq("is ")
1111
s.scan(/\w+\s/).should eq("a ")
12-
s.scan(/\w+/).should eq("string")
12+
s.scan(2).should eq("st")
13+
s.scan(/\w+/).should eq("ring")
1314
end
1415

1516
it "returns nil if it can't match from the offset" do
@@ -18,8 +19,28 @@ describe StringScanner do
1819
s.scan(/\w+/).should be_nil
1920
s.scan('s').should be_nil
2021
s.scan("string").should be_nil
21-
s.scan(/\s\w+/).should_not be_nil # => " string"
22-
s.scan(/.*/).should_not be_nil # => ""
22+
s.scan(/\s\w\w/).should_not be_nil # => " st"
23+
s.scan(4).should eq("ring")
24+
s.scan(/.*/).should_not be_nil # => ""
25+
s.scan(1).should be_nil
26+
end
27+
28+
it "errors on negative ints" do
29+
s = StringScanner.new("testy mctesterson")
30+
s.scan(10)
31+
expect_raises(ArgumentError, "Negative lookahead count: -10") { s.scan(-10) }
32+
end
33+
34+
it "works on multi-byte strings" do
35+
s = StringScanner.new("テストの文字列")
36+
s.scan(/\w\w\w/).should eq("テスト")
37+
s.scan(/[a-z]+/).should be_nil
38+
s.scan('ト').should be_nil
39+
s.scan('の').should eq("")
40+
s.scan(10).should be_nil
41+
s.scan(2).should eq("文字")
42+
s.scan("不在").should be_nil
43+
s.scan("").should eq("")
2344
end
2445
end
2546

@@ -44,27 +65,79 @@ describe StringScanner do
4465
end
4566

4667
describe "#skip" do
47-
it "advances the offset but does not returns the string matched" do
48-
s = StringScanner.new("this is a string")
68+
describe "with single byte strings" do
69+
it "advances the offset but does not return the string matched" do
70+
s = StringScanner.new("this is a string")
71+
72+
s.skip(/\w+\s/).should eq(5)
73+
s.offset.should eq(5)
74+
s[0]?.should eq("this ")
75+
76+
s.skip(/\d+/).should be_nil
77+
s.offset.should eq(5)
78+
s[0]?.should be_nil
79+
80+
s.skip('i').should eq(1)
81+
s.offset.should eq(6)
82+
s[0]?.should eq("i")
83+
84+
s.skip("s ").should eq(2)
85+
s.offset.should eq(8)
86+
s[0]?.should eq("s ")
87+
88+
s.skip(/\w+\s/).should eq(2)
89+
s.offset.should eq(10)
90+
s[0]?.should eq("a ")
91+
92+
s.skip(5).should eq(5)
93+
s.offset.should eq(15)
94+
s[0]?.should eq("strin")
95+
96+
s.skip(100).should be_nil
97+
s.skip(2).should be_nil
98+
s.skip(1).should eq(1)
99+
s.skip(1).should be_nil
100+
s.skip(0).should eq(0)
101+
end
102+
end
49103

50-
s.skip(/\w+\s/).should eq(5)
51-
s.offset.should eq(5)
52-
s[0]?.should_not be_nil
104+
describe "with multibyte strings" do
105+
it "advances the offset but does not return the string matched" do
106+
s = StringScanner.new("これは文字列である")
53107

54-
s.skip(/\d+/).should be_nil
55-
s.offset.should eq(5)
108+
s.skip(/\w\w\w/).should eq(3)
109+
s.offset.should eq(3)
110+
s[0]?.should eq("これは")
56111

57-
s.skip('i').should eq(1)
58-
s.offset.should eq(6)
112+
s.skip(/\d+/).should be_nil
113+
s.offset.should eq(3)
59114

60-
s.skip("s ").should eq(2)
61-
s.offset.should eq(8)
115+
s.skip(100).should be_nil
62116

63-
s.skip(/\w+\s/).should eq(2)
64-
s.offset.should eq(10)
117+
s.skip("文字").should eq(2)
118+
s.offset.should eq(5)
119+
s[0]?.should eq("文字")
65120

66-
s.skip(/\w+/).should eq(6)
67-
s.offset.should eq(16)
121+
s.skip('列').should eq(1)
122+
s.offset.should eq(6)
123+
s[0]?.should eq("")
124+
125+
s.skip(2).should eq(2)
126+
s.offset.should eq(8)
127+
s[0]?.should eq("であ")
128+
129+
s.skip(2).should be_nil
130+
s.skip(0).should eq(0)
131+
s[0]?.should eq("")
132+
133+
s.skip(1).should eq(1)
134+
s[0]?.should eq("")
135+
136+
s.eos?.should be_true
137+
s.skip(1).should be_nil
138+
s.skip(0).should eq(0)
139+
s[0]?.should eq("")
140+
end
68141
end
69142
end
70143

@@ -106,19 +179,35 @@ describe StringScanner do
106179

107180
s.check(/\w+\s/).should eq("is ")
108181
s.offset.should eq(5)
182+
s[0].should eq("is ")
183+
109184
s.check(/\w+\s/).should eq("is ")
110185
s.offset.should eq(5)
186+
s[0].should eq("is ")
187+
111188
s.check('i').should eq("i")
112189
s.offset.should eq(5)
113-
s.check("is ").should eq("is ")
190+
s[0].should eq("i")
191+
192+
s.check("is a str").should eq("is a str")
193+
s.offset.should eq(5)
194+
s[0].should eq("is a str")
195+
196+
s.check(4).should eq("is a")
114197
s.offset.should eq(5)
198+
s[0].should eq("is a")
199+
200+
s.check(100).should be_nil
201+
s.offset.should eq(5)
202+
s[0]?.should be_nil
115203
end
116204

117205
it "returns nil if it can't match from the offset" do
118206
s = StringScanner.new("test string")
119207
s.check(/\d+/).should be_nil
120208
s.check('0').should be_nil
121209
s.check("01").should be_nil
210+
s.check(100).should be_nil
122211
end
123212
end
124213

src/string_scanner.cr

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ class StringScanner
7272
end
7373

7474
# Sets the *position* of the scan offset.
75+
#
76+
# NOTE: Moving the scan head with this method can cause performance issues in
77+
# multibyte strings. For a more performant way to move the head, see
78+
# [`#skip(Int)`](#skip%28len%3AInt%29%3AInt32%7CNil-instance-method).
7579
def offset=(position : Int)
7680
raise IndexError.new unless position >= 0
7781
@byte_offset = @str.char_index_to_byte_index(position) || @str.bytesize
@@ -111,6 +115,25 @@ class StringScanner
111115
match(pattern, advance: true, anchored: true)
112116
end
113117

118+
# Advances the offset by *len* chars, and returns a string of that length.
119+
#
120+
# NOTE: If there are fewer than the requested number of characters
121+
# remaining in the string, this method will return nil and _not advance
122+
# the scan head_. To obtain the entire rest of the input string, use `#rest`.
123+
#
124+
# ```
125+
# require "string_scanner"
126+
#
127+
# s = StringScanner.new("あいうえお")
128+
# s.scan(3) # => "あいう"
129+
# s.scan(100) # => nil
130+
# s.scan(2) # => "えお"
131+
# s.scan(0) # => ""
132+
# ```
133+
def scan(len : Int) : String?
134+
match(len, advance: true)
135+
end
136+
114137
# Scans the string _until_ the *pattern* is matched. Returns the substring up
115138
# to and including the end of the match, the last match is saved, and
116139
# advances the scan offset. Returns `nil` if no match.
@@ -178,6 +201,24 @@ class StringScanner
178201
end
179202
end
180203

204+
private def match(len : Int, advance = true)
205+
byte_len = lookahead_byte_length(len)
206+
207+
# off the end of the string
208+
if byte_len.nil?
209+
@last_match = nil
210+
return nil
211+
end
212+
213+
result = @str.byte_slice(@byte_offset, byte_len)
214+
215+
@byte_offset += byte_len if advance
216+
217+
@last_match = StringMatchData.new(result)
218+
219+
result
220+
end
221+
181222
# Attempts to skip over the given *pattern* beginning with the scan offset.
182223
#
183224
# If there's a match, the scanner advances the scan offset, the last match is
@@ -203,6 +244,19 @@ class StringScanner
203244
match.size if match
204245
end
205246

247+
# Advances the offset by *len* chars.
248+
#
249+
# Prefer this to `scanner.offset += len`, since that can cause a full
250+
# scan of the string in the case of multibyte characters.
251+
#
252+
# NOTE: If there are fewer than the requested number of characters
253+
# remaining in the string, this method will return nil and _not advance
254+
# the scan head_. To move the scan head to the very end, use `#terminate`.
255+
def skip(len : Int) : Int32?
256+
match = scan(len)
257+
match.size if match
258+
end
259+
206260
# Attempts to skip _until_ the given *pattern* is found after the scan
207261
# offset. In other words, the pattern is not anchored to the current scan
208262
# offset.
@@ -256,6 +310,11 @@ class StringScanner
256310
match(pattern, advance: false, anchored: true)
257311
end
258312

313+
# :ditto:
314+
def check(len : Int) : String?
315+
match(len, advance: false)
316+
end
317+
259318
# Returns the value that `#scan_until` would return, without advancing the
260319
# scan offset. The last match is still saved, however.
261320
#

0 commit comments

Comments
 (0)