Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ extend-exclude = [
"lib/**",
"man/**",
"spec/compiler/semantic/did_you_mean_spec.cr",
"spec/std/string_scanner_spec.cr",
"spec/std/data/**",
"spec/std/string/grapheme_break_spec.cr",
"src/compiler/crystal/tools/playground/public/vendor/",
Expand Down
129 changes: 109 additions & 20 deletions spec/std/string_scanner_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ describe StringScanner do
s.scan(' ').should eq(" ")
s.scan("is ").should eq("is ")
s.scan(/\w+\s/).should eq("a ")
s.scan(/\w+/).should eq("string")
s.scan(2).should eq("st")
s.scan(/\w+/).should eq("ring")
end

it "returns nil if it can't match from the offset" do
Expand All @@ -18,8 +19,28 @@ describe StringScanner do
s.scan(/\w+/).should be_nil
s.scan('s').should be_nil
s.scan("string").should be_nil
s.scan(/\s\w+/).should_not be_nil # => " string"
s.scan(/.*/).should_not be_nil # => ""
s.scan(/\s\w\w/).should_not be_nil # => " string"
s.scan(4).should eq("ring")
s.scan(/.*/).should_not be_nil # => ""
s.scan(1).should be_nil
end

it "errors on negative ints" do
  scanner = StringScanner.new("testy mctesterson")

  # A non-negative length is accepted first; only the negative one must raise.
  scanner.scan(10)

  expect_raises(ArgumentError, "Negative lookahead count: -10") do
    scanner.scan(-10)
  end
end

it "works on multi-byte strings" do
  scanner = StringScanner.new("テストの文字列")

  # Regex scans consume whole characters.
  scanner.scan(/\w\w\w/).should eq("テスト")
  scanner.scan(/[a-z]+/).should be_nil

  # A Char only matches when it sits exactly at the scan head.
  scanner.scan('ト').should be_nil
  scanner.scan('の').should eq("の")

  # Only three characters remain, so asking for ten fails and the
  # following two-character scan still starts from the same position.
  scanner.scan(10).should be_nil
  scanner.scan(2).should eq("文字")

  # String scans behave the same way on the final character.
  scanner.scan("不在").should be_nil
  scanner.scan("列").should eq("列")
end
end

Expand All @@ -44,32 +65,84 @@ describe StringScanner do
end

describe "#skip" do
it "advances the offset but does not returns the string matched" do
s = StringScanner.new("this is a string")
describe "with single byte strings" do
  it "advances the offset but does not return the string matched" do
    scanner = StringScanner.new("this is a string")

    # Regex match: returns the number of characters skipped.
    scanner.skip(/\w+\s/).should eq(5)
    scanner.offset.should eq(5)
    scanner[0]?.should eq("this ")

    # Failed match: offset is untouched and the last match is cleared.
    scanner.skip(/\d+/).should be_nil
    scanner.offset.should eq(5)
    scanner[0]?.should be_nil

    # Char pattern.
    scanner.skip('i').should eq(1)
    scanner.offset.should eq(6)
    scanner[0]?.should eq("i")

    # String pattern.
    scanner.skip("s ").should eq(2)
    scanner.offset.should eq(8)
    scanner[0]?.should eq("s ")

    scanner.skip(/\w+\s/).should eq(2)
    scanner.offset.should eq(10)
    scanner[0]?.should eq("a ")

    # Integer length: skips exactly that many characters.
    scanner.skip(5).should eq(5)
    scanner.offset.should eq(15)
    scanner[0]?.should eq("strin")

    # Only one character is left, so longer skips fail without moving.
    scanner.skip(100).should be_nil
    scanner.skip(2).should be_nil
    scanner.skip(1).should eq(1)

    # At the end of the string only a zero-length skip succeeds.
    scanner.skip(1).should be_nil
    scanner.skip(0).should eq(0)
  end
end

s.skip(/\w+\s/).should eq(5)
s.offset.should eq(5)
s[0]?.should_not be_nil
describe "with multibyte strings" do
it "advances the offset but does not return the string matched" do
s = StringScanner.new("これは文字列である")

s.skip(/\d+/).should be_nil
s.offset.should eq(5)
s.skip(/\w\w\w/).should eq(3)
s.offset.should eq(3)
s[0]?.should eq("これは")

s.skip('i').should eq(1)
s.offset.should eq(6)
s.skip(/\d+/).should be_nil
s.offset.should eq(3)

s.skip("s ").should eq(2)
s.offset.should eq(8)
s.skip(100).should be_nil

s.skip(/\w+\s/).should eq(2)
s.offset.should eq(10)
s.skip("文字").should eq(2)
s.offset.should eq(5)
s[0]?.should eq("文字")

s.skip(/\w+/).should eq(6)
s.offset.should eq(16)
s.skip('列').should eq(1)
s.offset.should eq(6)
s[0]?.should eq("列")

s.skip(2).should eq(2)
s.offset.should eq(8)
s[0]?.should eq("であ")

s.skip(2).should be_nil
s.skip(0).should eq(0)
s[0]?.should eq("")

s.skip(1).should eq(1)
s[0]?.should eq("る")

s.eos?.should be_true
s.skip(1).should be_nil
s.skip(0).should eq(0)
s[0]?.should eq("")
end
end
end

describe "#skip_until" do
it "advances the offset but does not returns the string matched" do
it "advances the offset but does not return the string matched" do
s = StringScanner.new("this is a string")

s.skip_until(/not/).should be_nil
Expand Down Expand Up @@ -106,19 +179,35 @@ describe StringScanner do

s.check(/\w+\s/).should eq("is ")
s.offset.should eq(5)
s[0].should eq("is ")

s.check(/\w+\s/).should eq("is ")
s.offset.should eq(5)
s[0].should eq("is ")

s.check('i').should eq("i")
s.offset.should eq(5)
s.check("is ").should eq("is ")
s[0].should eq("i")

s.check("is a str").should eq("is a str")
s.offset.should eq(5)
s[0].should eq("is a str")

s.check(4).should eq("is a")
s.offset.should eq(5)
s[0].should eq("is a")

s.check(100).should be_nil
s.offset.should eq(5)
s[0]?.should be_nil
end

it "returns nil if it can't match from the offset" do
  scanner = StringScanner.new("test string")

  # One failing case per supported pattern kind: Regex, Char, String, Int.
  scanner.check(/\d+/).should be_nil
  scanner.check('0').should be_nil
  scanner.check("01").should be_nil
  scanner.check(100).should be_nil
end
end

Expand Down
120 changes: 118 additions & 2 deletions src/string_scanner.cr
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,19 @@
class StringScanner
@last_match : Regex::MatchData | StringMatchData | Nil

# The byte offset of the scan head. This is distinct from #offset in that
# it counts raw bytes instead of characters.
getter byte_offset : Int32

# Creates a scanner over *str* with the scan head at byte offset 0.
def initialize(@str : String)
@byte_offset = 0
end

# Sets the *position* of the scan offset.
#
# NOTE: Moving the scan head with this method can cause performance issues in
# multibyte strings. For a more performant way to move the head, see
# [`#skip(Int)`](#skip%28len%3AInt%29%3AInt32%7CNil-instance-method).
def offset=(position : Int)
raise IndexError.new unless position >= 0
@byte_offset = @str.char_index_to_byte_index(position) || @str.bytesize
Expand Down Expand Up @@ -107,6 +115,25 @@ class StringScanner
match(pattern, advance: true, anchored: true)
end

# Advances the offset by *len* chars, and returns a string of that length.
#
# NOTE: If there are fewer than the requested number of characters
# remaining in the string, this method will return nil and _not advance
# the scan head_. To obtain the entire rest of the input string, use `#rest`.
#
# ```
# require "string_scanner"
#
# s = StringScanner.new("あいうえお")
# s.scan(3) # => "あいう"
# s.scan(100) # => nil
# s.scan(2) # => "えお"
# s.scan(0) # => ""
# ```
def scan(len : Int) : String?
# Delegates to the length-based `match` overload; `advance: true` moves the
# scan head past the returned characters.
match(len, advance: true)
end

# Scans the string _until_ the *pattern* is matched. Returns the substring up
# to and including the end of the match, the last match is saved, and
# advances the scan offset. Returns `nil` if no match.
Expand Down Expand Up @@ -134,7 +161,7 @@ class StringScanner
match(pattern, advance: true, anchored: false)
end

private def match(pattern : Regex, advance = true, options = Regex::MatchOptions::ANCHORED)
private def match(pattern : Regex, advance : Bool = true, options : Regex::MatchOptions = Regex::MatchOptions::ANCHORED)
match = pattern.match_at_byte_index(@str, @byte_offset, options)
@last_match = match
if match
Expand Down Expand Up @@ -174,8 +201,25 @@ class StringScanner
end
end

# Matches exactly *len* characters forward from the scan head.
#
# Returns the matched substring and records it as the last match. Returns
# `nil` and clears the last match when fewer than *len* characters remain.
# The scan head only moves when *advance* is true.
#
# Raises `ArgumentError` if *len* is negative.
private def match(len : Int, advance : Bool = true)
  byte_len = lookahead_byte_length(len)

  unless byte_len
    # Ran off the end of the string: no match, and the scan head stays put.
    @last_match = nil
    return nil
  end

  # Slice by bytes so the characters before the scan head are never rescanned.
  result = @str.byte_slice(@byte_offset, byte_len)
  @last_match = StringMatchData.new(result)
  @byte_offset += byte_len if advance

  result
end

# Attempts to skip over the given *pattern* beginning with the scan offset.
# In other words, the pattern is not anchored to the current scan offset.
#
# If there's a match, the scanner advances the scan offset, the last match is
# saved, and it returns the size of the skipped match. Otherwise it returns
Expand All @@ -200,6 +244,19 @@ class StringScanner
match.size if match
end

# Advances the offset by *len* chars.
#
# Prefer this to `scanner.offset += len`, since that can cause a full
# scan of the string in the case of multibyte characters.
#
# NOTE: If there are fewer than the requested number of characters
# remaining in the string, this method will return nil and _not advance
# the scan head_. To move the scan head to the very end, use `#terminate`.
def skip(len : Int) : Int32?
  # Reuse `#scan` so the last match is still recorded, then report the number
  # of characters consumed instead of the substring itself.
  scan(len).try &.size
end

# Attempts to skip _until_ the given *pattern* is found after the scan
# offset. In other words, the pattern is not anchored to the current scan
# offset.
Expand Down Expand Up @@ -253,6 +310,11 @@ class StringScanner
match(pattern, advance: false, anchored: true)
end

# :ditto:
def check(len : Int) : String?
# Same lookup as `#scan(len)`, but `advance: false` leaves the scan head
# where it is.
match(len, advance: false)
end

# Returns the value that `#scan_until` would return, without advancing the
# scan offset. The last match is still saved, however.
#
Expand Down Expand Up @@ -387,6 +449,60 @@ class StringScanner
io << " \"" << @str[start, 5] << "\" >"
end

# Builds a `Char::Reader` over the scanned string, positioned at the current
# byte offset of the scan head.
private def make_char_reader : Char::Reader
Char::Reader.new(@str, @byte_offset)
end

# Transforms a character count into a byte count *forward*
# from the scan head. Returns nil if the string doesn't have
# enough characters in it to advance by the given character
# count.
#
# Return value, if not nil, is guaranteed to be in (0..@str.bytesize - @byte_offset)
private def lookahead_byte_length(len : Int) : Int32?
  raise ArgumentError.new("Negative lookahead count: #{len}") if len < 0
  return 0 if len.zero?

  if @str.single_byte_optimizable?
    # One byte per character, so the byte count equals the character count.
    # *len* may be any `Int` type; once it is known to fit in the remaining
    # bytes it also fits in an `Int32`, which the return type requires.
    remaining = @str.bytesize - @byte_offset
    return len <= remaining ? len.to_i32 : nil
  end

  # This overlaps with String#find_start_and_end, but the scan head is likely
  # far into the string while *len* is small, so it is important to start
  # decoding at the scan head rather than at the beginning of the string.
  reader = make_char_reader
  current = reader.current_char?

  len.times do
    # A nil current char means the reader is already at the end of the
    # string, so there are not enough characters left.
    return nil if current.nil?
    current = reader.next_char?
  end

  reader.pos - @byte_offset
end

# Similar to #lookahead_byte_length, transforms a character count
# into a byte count *backwards* from the scan head, and returns nil
# if this would fall off the beginning of the string.
#
# Return value, if not nil, is guaranteed to be in (0..@byte_offset)
private def lookbehind_byte_length(len : Int) : Int32?
  raise ArgumentError.new("Negative lookbehind count: #{len}") if len < 0
  return 0 if len.zero?

  if @str.single_byte_optimizable?
    # One byte per character. *len* may be any `Int` type; once it is known
    # to be at most `@byte_offset` it fits in an `Int32`, which the return
    # type requires.
    return len <= @byte_offset ? len.to_i32 : nil
  end

  reader = make_char_reader

  len.times do
    # Step back one character at a time; a nil result means the reader could
    # not move back any further, i.e. fewer than *len* characters precede
    # the scan head.
    reader.previous_char? || return nil
  end

  @byte_offset - reader.pos
end

# :nodoc:
struct StringMatchData
def initialize(@str : String)
Expand Down