crystal-lang · straight-shoota · Jan 22, 2026 · Dec 20, 2025 · Dec 20, 2025 · Dec 20, 2025
diff --git a/_typos.toml b/_typos.toml
@@ -58,6 +58,7 @@ extend-exclude = [
   "lib/**",
   "man/**",
   "spec/compiler/semantic/did_you_mean_spec.cr",
+  "spec/std/string_scanner_spec.cr",
   "spec/std/data/**",
   "spec/std/string/grapheme_break_spec.cr",
   "src/compiler/crystal/tools/playground/public/vendor/",

diff --git a/spec/std/string_scanner_spec.cr b/spec/std/string_scanner_spec.cr
@@ -9,7 +9,8 @@ describe StringScanner do
       s.scan(' ').should eq(" ")
       s.scan("is ").should eq("is ")
       s.scan(/\w+\s/).should eq("a ")
-      s.scan(/\w+/).should eq("string")
+      s.scan(2).should eq("st")
+      s.scan(/\w+/).should eq("ring")
     end
 
     it "returns nil if it can't match from the offset" do
@@ -18,8 +19,28 @@ describe StringScanner do
       s.scan(/\w+/).should be_nil
       s.scan('s').should be_nil
       s.scan("string").should be_nil
-      s.scan(/\s\w+/).should_not be_nil # => " string"
-      s.scan(/.*/).should_not be_nil    # => ""
+      s.scan(/\s\w\w/).should_not be_nil # => " st"
+      s.scan(4).should eq("ring")
+      s.scan(/.*/).should_not be_nil # => ""
+      s.scan(1).should be_nil
+    end
+
+    it "errors on negative ints" do
+      s = StringScanner.new("testy mctesterson")
+      s.scan(10) # head now sits at offset 10, so 10 chars do exist behind it
+      expect_raises(ArgumentError, "Negative lookahead count: -10") { s.scan(-10) }
+    end
+
+    it "works on multi-byte strings" do
+      s = StringScanner.new("テストの文字列")
+      s.scan(/\w\w\w/).should eq("テスト")
+      s.scan(/[a-z]+/).should be_nil
+      s.scan('ト').should be_nil # already consumed; the head is now at 'の'
+      s.scan('の').should eq("の")
+      s.scan(10).should be_nil # only 3 chars remain, so nothing is consumed
+      s.scan(2).should eq("文字")
+      s.scan("不在").should be_nil
+      s.scan("列").should eq("列")
     end
   end
 
@@ -44,32 +65,84 @@ describe StringScanner do
   end
 
   describe "#skip" do
-    it "advances the offset but does not returns the string matched" do
-      s = StringScanner.new("this is a string")
+    describe "with single byte strings" do
+      it "advances the offset but does not return the string matched" do
+        s = StringScanner.new("this is a string")
+
+        s.skip(/\w+\s/).should eq(5)
+        s.offset.should eq(5)
+        s[0]?.should eq("this ")
+
+        s.skip(/\d+/).should be_nil
+        s.offset.should eq(5)
+        s[0]?.should be_nil
+
+        s.skip('i').should eq(1)
+        s.offset.should eq(6)
+        s[0]?.should eq("i")
+
+        s.skip("s ").should eq(2)
+        s.offset.should eq(8)
+        s[0]?.should eq("s ")
+
+        s.skip(/\w+\s/).should eq(2)
+        s.offset.should eq(10)
+        s[0]?.should eq("a ")
+
+        s.skip(5).should eq(5)
+        s.offset.should eq(15)
+        s[0]?.should eq("strin")
+
+        s.skip(100).should be_nil # far more than the 1 char left: head stays put
+        s.skip(2).should be_nil   # still only 1 char ("g") left
+        s.skip(1).should eq(1)    # consumes the final char
+        s.skip(1).should be_nil   # at end of string
+        s.skip(0).should eq(0)    # a zero-length skip succeeds even at the end
+      end
+    end
 
-      s.skip(/\w+\s/).should eq(5)
-      s.offset.should eq(5)
-      s[0]?.should_not be_nil
+    describe "with multibyte strings" do
+      it "advances the offset but does not return the string matched" do
+        s = StringScanner.new("これは文字列である")
 
-      s.skip(/\d+/).should be_nil
-      s.offset.should eq(5)
+        s.skip(/\w\w\w/).should eq(3)
+        s.offset.should eq(3)
+        s[0]?.should eq("これは")
 
-      s.skip('i').should eq(1)
-      s.offset.should eq(6)
+        s.skip(/\d+/).should be_nil
+        s.offset.should eq(3)
 
-      s.skip("s ").should eq(2)
-      s.offset.should eq(8)
+        s.skip(100).should be_nil
 
-      s.skip(/\w+\s/).should eq(2)
-      s.offset.should eq(10)
+        s.skip("文字").should eq(2)
+        s.offset.should eq(5)
+        s[0]?.should eq("文字")
 
-      s.skip(/\w+/).should eq(6)
-      s.offset.should eq(16)
+        s.skip('列').should eq(1)
+        s.offset.should eq(6)
+        s[0]?.should eq("列")
+
+        s.skip(2).should eq(2)
+        s.offset.should eq(8)
+        s[0]?.should eq("であ")
+
+        s.skip(2).should be_nil
+        s.skip(0).should eq(0)
+        s[0]?.should eq("")
+
+        s.skip(1).should eq(1)
+        s[0]?.should eq("る")
+
+        s.eos?.should be_true
+        s.skip(1).should be_nil
+        s.skip(0).should eq(0)
+        s[0]?.should eq("")
+      end
     end
   end
 
   describe "#skip_until" do
-    it "advances the offset but does not returns the string matched" do
+    it "advances the offset but does not return the string matched" do
       s = StringScanner.new("this is a string")
 
       s.skip_until(/not/).should be_nil
@@ -106,19 +179,35 @@ describe StringScanner do
 
       s.check(/\w+\s/).should eq("is ")
       s.offset.should eq(5)
+      s[0].should eq("is ")
+
       s.check(/\w+\s/).should eq("is ")
       s.offset.should eq(5)
+      s[0].should eq("is ")
+
       s.check('i').should eq("i")
       s.offset.should eq(5)
-      s.check("is ").should eq("is ")
+      s[0].should eq("i")
+
+      s.check("is a str").should eq("is a str")
+      s.offset.should eq(5)
+      s[0].should eq("is a str")
+
+      s.check(4).should eq("is a")
       s.offset.should eq(5)
+      s[0].should eq("is a")
+
+      s.check(100).should be_nil
+      s.offset.should eq(5)
+      s[0]?.should be_nil
     end
 
     it "returns nil if it can't match from the offset" do
       s = StringScanner.new("test string")
       s.check(/\d+/).should be_nil
       s.check('0').should be_nil
       s.check("01").should be_nil
+      s.check(100).should be_nil # the string is only 11 chars long
     end
   end
 

diff --git a/src/string_scanner.cr b/src/string_scanner.cr
@@ -63,11 +63,19 @@
 class StringScanner
   @last_match : Regex::MatchData | StringMatchData | Nil
 
+  # The byte offset of the scan head. This is distinct from `#offset` in that
+  # it counts raw bytes instead of characters.
+  getter byte_offset : Int32
+
   def initialize(@str : String)
     @byte_offset = 0
   end
 
   # Sets the *position* of the scan offset.
+  #
+  # NOTE: Moving the scan head with this method can cause performance issues in
+  # multibyte strings. For a more performant way to move the head, see
+  # [`#skip(Int)`](#skip%28len%3AInt%29%3AInt32%7CNil-instance-method).
   def offset=(position : Int)
     raise IndexError.new unless position >= 0
     @byte_offset = @str.char_index_to_byte_index(position) || @str.bytesize
@@ -107,6 +115,25 @@ class StringScanner
     match(pattern, advance: true, anchored: true)
   end
 
+  # Advances the offset by *len* chars, and returns a string of that length.
+  #
+  # NOTE: If fewer than *len* characters remain in the string, this method
+  # returns `nil` and does _not_ advance the scan head. To obtain the entire
+  # rest of the input string, use `#rest`. Raises `ArgumentError` if *len* < 0.
+  #
+  # ```
+  # require "string_scanner"
+  #
+  # s = StringScanner.new("あいうえお")
+  # s.scan(3)   # => "あいう"
+  # s.scan(100) # => nil
+  # s.scan(2)   # => "えお"
+  # s.scan(0)   # => ""
+  # ```
+  def scan(len : Int) : String?
+    match(len, advance: true)
+  end
+
   # Scans the string _until_ the *pattern* is matched. Returns the substring up
   # to and including the end of the match, the last match is saved, and
   # advances the scan offset. Returns `nil` if no match.
@@ -134,7 +161,7 @@ class StringScanner
     match(pattern, advance: true, anchored: false)
   end
 
-  private def match(pattern : Regex, advance = true, options = Regex::MatchOptions::ANCHORED)
+  private def match(pattern : Regex, advance : Bool = true, options : Regex::MatchOptions = Regex::MatchOptions::ANCHORED)
     match = pattern.match_at_byte_index(@str, @byte_offset, options)
     @last_match = match
     if match
@@ -174,8 +201,25 @@ class StringScanner
     end
   end
 
+  private def match(len : Int, advance = true)
+    byte_len = lookahead_byte_length(len)
+
+    # Fewer than *len* chars remain: clear the last match and report failure.
+    if byte_len.nil?
+      @last_match = nil
+      return nil
+    end
+
+    result = @str.byte_slice(@byte_offset, byte_len)
+
+    @byte_offset += byte_len if advance # `#check` passes `advance: false`
+
+    @last_match = StringMatchData.new(result)
+
+    result
+  end
+
   # Attempts to skip over the given *pattern* beginning with the scan offset.
-  # In other words, the pattern is not anchored to the current scan offset.
   #
   # If there's a match, the scanner advances the scan offset, the last match is
   # saved, and it returns the size of the skipped match. Otherwise it returns
@@ -200,6 +244,19 @@ class StringScanner
     match.size if match
   end
 
+  # Advances the offset by *len* chars and returns the number of chars skipped.
+  #
+  # Prefer this to `scanner.offset += len`, since that can cause a full
+  # scan of the string in the case of multibyte characters.
+  #
+  # NOTE: If fewer than *len* characters remain in the string, this method
+  # returns `nil` and does _not_ advance the scan head. To move the scan head
+  # to the very end, use `#terminate`.
+  def skip(len : Int) : Int32?
+    match = scan(len)
+    match.size if match
+  end
+
   # Attempts to skip _until_ the given *pattern* is found after the scan
   # offset. In other words, the pattern is not anchored to the current scan
   # offset.
@@ -253,6 +310,11 @@ class StringScanner
     match(pattern, advance: false, anchored: true)
   end
 
+  # :ditto:
+  def check(len : Int) : String?
+    match(len, advance: false) # same lookup as `#scan(len)`, but the scan head stays put
+  end
+
   # Returns the value that `#scan_until` would return, without advancing the
   # scan offset. The last match is still saved, however.
   #
@@ -387,6 +449,60 @@ class StringScanner
     io << " \"" << @str[start, 5] << "\" >"
   end
 
+  private def make_char_reader : Char::Reader
+    Char::Reader.new(@str, @byte_offset) # positioned at the scan head, not the string start
+  end
+
+  # Transforms a character count into a byte count *forward* from the scan
+  # head. Returns `nil` if fewer than *len* characters remain in the string,
+  # and raises `ArgumentError` if *len* is negative.
+  #
+  # Return value, if not `nil`, is guaranteed to be in
+  # `(0..@str.bytesize - @byte_offset)`.
+  private def lookahead_byte_length(len : Int) : Int32?
+    raise ArgumentError.new("Negative lookahead count: #{len}") if len < 0
+    return 0 if len.zero?
+    if @str.single_byte_optimizable?
+      # One byte per char; `to_i32` cannot overflow here and keeps `Int32?` valid.
+      return len <= @str.bytesize - @byte_offset ? len.to_i32 : nil
+    end
+
+    # some redundant logic here from String#find_start_and_end, but in this case
+    # it is likely we are far into the string and len is small, so it is very
+    # important not to start at the beginning of the string.
+    reader = make_char_reader
+    current = reader.current_char?
+
+    len.times do
+      return nil if current.nil?
+      current = reader.next_char?
+    end
+
+    reader.pos - @byte_offset
+  end
+
+  # Similar to `#lookahead_byte_length`, transforms a character count into a
+  # byte count *backwards* from the scan head. Returns `nil` if this would fall
+  # off the beginning of the string; raises `ArgumentError` if *len* is negative.
+  #
+  # Return value, if not `nil`, is guaranteed to be in `(0..@byte_offset)`.
+  private def lookbehind_byte_length(len : Int) : Int32?
+    raise ArgumentError.new("Negative lookbehind count: #{len}") if len < 0
+    return 0 if len.zero?
+    if @str.single_byte_optimizable?
+      # One byte per char; `to_i32` cannot overflow here and keeps `Int32?` valid.
+      return len <= @byte_offset ? len.to_i32 : nil
+    end
+
+    reader = make_char_reader
+
+    len.times do
+      reader.previous_char? || return nil
+    end
+
+    @byte_offset - reader.pos
+  end
+
   # :nodoc:
   struct StringMatchData
     def initialize(@str : String)