Fix invalid UTF-8 handling in Char::Reader#previous_char (#14013)

HertzDevil · web-flow · commit c7202d4948c6 · 2023-11-25T20:31:27.000+01:00
diff --git a/spec/std/char/reader_spec.cr b/spec/std/char/reader_spec.cr
@@ -1,11 +1,31 @@
 require "spec"
 require "char/reader"
 
-private def assert_invalid_byte_sequence(bytes)
+private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__)
   reader = Char::Reader.new(String.new bytes)
-  reader.current_char.should eq(Char::REPLACEMENT)
-  reader.current_char_width.should eq(1)
-  reader.error.should eq(bytes[0])
+  reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
+  reader.current_char_width.should eq(1), file: file, line: line
+  reader.error.should eq(bytes[0]), file: file, line: line
+end
+
+private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__)
+  str = String.new bytes
+  reader = Char::Reader.new(str, pos: bytes.size)
+  reader.previous_char
+  reader.current_char.should eq(str[0]), file: file, line: line
+  reader.current_char_width.should eq(bytes.size), file: file, line: line
+  reader.pos.should eq(0), file: file, line: line
+  reader.error.should be_nil, file: file, line: line
+end
+
+private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__)
+  str = String.new bytes
+  reader = Char::Reader.new(str, pos: bytes.size)
+  reader.previous_char
+  reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
+  reader.current_char_width.should eq(1), file: file, line: line
+  reader.pos.should eq(bytes.size - 1), file: file, line: line
+  reader.error.should eq(bytes[-1]), file: file, line: line
 end
 
 describe "Char::Reader" do
@@ -242,4 +262,112 @@ describe "Char::Reader" do
   it "errors if fourth_byte is out of bounds" do
     assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0]
   end
+
+  describe "#previous_char" do
+    it "reads on valid UTF-8" do
+      assert_reads_at_end Bytes[0x00]
+      assert_reads_at_end Bytes[0x7f]
+
+      assert_reads_at_end Bytes[0xc2, 0x80]
+      assert_reads_at_end Bytes[0xc2, 0xbf]
+      assert_reads_at_end Bytes[0xdf, 0x80]
+      assert_reads_at_end Bytes[0xdf, 0xbf]
+
+      assert_reads_at_end Bytes[0xe1, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0x80, 0xbf]
+      assert_reads_at_end Bytes[0xe1, 0x9f, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf]
+      assert_reads_at_end Bytes[0xed, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xed, 0x80, 0xbf]
+      assert_reads_at_end Bytes[0xed, 0x9f, 0x80]
+      assert_reads_at_end Bytes[0xed, 0x9f, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xef, 0x80, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0x9f, 0x80]
+      assert_reads_at_end Bytes[0xef, 0x9f, 0xbf]
+
+      assert_reads_at_end Bytes[0xe0, 0xa0, 0x80]
+      assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf]
+      assert_reads_at_end Bytes[0xe0, 0xbf, 0x80]
+      assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf]
+      assert_reads_at_end Bytes[0xe1, 0xa0, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf]
+      assert_reads_at_end Bytes[0xe1, 0xbf, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0xa0, 0x80]
+      assert_reads_at_end Bytes[0xef, 0xa0, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0xbf, 0x80]
+      assert_reads_at_end Bytes[0xef, 0xbf, 0xbf]
+
+      assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80]
+
+      assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80]
+    end
+
+    it "errors on invalid UTF-8" do
+      assert_invalid_byte_sequence_at_end Bytes[0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0]
+      assert_invalid_byte_sequence_at_end Bytes[0xff]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x9f, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x8f, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc2, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xdf, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x9f, 0xbf]
+      assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x7f, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x80, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x8f, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x90, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc1, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc2, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xdf, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xed, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xed, 0xbf, 0xbf]
+      assert_invalid_byte_sequence_at_end Bytes[0xf0, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0xa0, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xef, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xef, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf4, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x90, 0x80, 0x80]
+    end
+  end
 end
diff --git a/src/char/reader.cr b/src/char/reader.cr
@@ -331,7 +331,7 @@ struct Char
     end
 
     private macro invalid_byte_sequence
-      return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8
+      return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8!
     end
 
     @[AlwaysInline]
@@ -343,15 +343,92 @@ struct Char
       end
     end
 
-    private def decode_previous_char
-      return if @pos == 0
+    # The reverse UTF-8 DFA transition table for reference: (contrast with
+    # `Unicode::UTF8_ENCODING_DFA`)
+    #
+    #              accepted (initial state)
+    #              | 1 continuation byte
+    #              | | 2 continuation bytes; disallow overlong encodings up to U+07FF
+    #              | | | 2 continuation bytes; disallow surrogate pairs
+    #              | | | | 3 continuation bytes; disallow overlong encodings up to U+FFFF
+    #              | | | | | 3 continuation bytes; disallow codepoints above U+10FFFF
+    #              v v v v v v
+    #
+    #            | 0 2 3 4 5 6
+    # -----------+------------
+    # 0x00..0x7F | 0 _ _ _ _ _
+    # 0x80..0x8F | 2 3 5 5 _ _
+    # 0x90..0x9F | 2 3 6 6 _ _
+    # 0xA0..0xBF | 2 4 6 6 _ _
+    # 0xC2..0xDF | _ 0 _ _ _ _
+    # 0xE0..0xE0 | _ _ _ 0 _ _
+    # 0xE1..0xEC | _ _ 0 0 _ _
+    # 0xED..0xED | _ _ 0 _ _ _
+    # 0xEE..0xEF | _ _ 0 0 _ _
+    # 0xF0..0xF0 | _ _ _ _ _ 0
+    # 0xF1..0xF3 | _ _ _ _ 0 0
+    # 0xF4..0xF4 | _ _ _ _ 0 _
+    private def decode_char_before(pos, & : UInt32, Int32, UInt8? ->)
+      fourth = byte_at(pos - 1)
+      if fourth <= 0x7f
+        return yield fourth, 1, nil
+      end
 
-      while @pos > 0
-        @pos -= 1
-        break if (byte_at(@pos) & 0xC0) != 0x80
+      if fourth > 0xbf || pos < 2
+        invalid_byte_sequence_before
       end
-      decode_char_at(@pos) do |code_point, width, error|
+
+      third = byte_at(pos - 2)
+      if 0xc2 <= third <= 0xdf
+        return yield (third << 6) &+ (fourth &- 0x3080), 2, nil
+      end
+
+      if (third & 0xc0) != 0x80 || pos < 3
+        invalid_byte_sequence_before
+      end
+
+      second = byte_at(pos - 3)
+      if second & 0xf0 == 0xe0
+        if second == 0xe0 && third <= 0x9f
+          invalid_byte_sequence_before
+        end
+
+        if second == 0xed && third >= 0xa0
+          invalid_byte_sequence_before
+        end
+
+        return yield (second << 12) &+ (third << 6) &+ (fourth &- 0xE2080), 3, nil
+      end
+
+      if (second & 0xc0) != 0x80 || pos < 4
+        invalid_byte_sequence_before
+      end
+
+      first = byte_at(pos - 4)
+      if second <= 0x8f
+        unless 0xf1 <= first <= 0xf4
+          invalid_byte_sequence_before
+        end
+      else
+        unless 0xf0 <= first <= 0xf3
+          invalid_byte_sequence_before
+        end
+      end
+
+      return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil
+    end
+
+    private macro invalid_byte_sequence_before
+      return yield Char::REPLACEMENT.ord.to_u32!, 1, fourth.to_u8!
+    end
+
+    @[AlwaysInline]
+    private def decode_previous_char
+      return nil if @pos == 0
+
+      decode_char_before(@pos) do |code_point, width, error|
         @current_char_width = width
+        @pos -= width
         @error = error
         @current_char = code_point.unsafe_chr
       end