Skip to content

Commit c7202d4

Browse files
authored
Fix invalid UTF-8 handling in Char::Reader#previous_char (#14013)
1 parent f9b7226 commit c7202d4

File tree

2 files changed

+216
-11
lines changed

2 files changed

+216
-11
lines changed

spec/std/char/reader_spec.cr

Lines changed: 132 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,31 @@
11
require "spec"
22
require "char/reader"
33

4-
private def assert_invalid_byte_sequence(bytes)
4+
private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__)
55
reader = Char::Reader.new(String.new bytes)
6-
reader.current_char.should eq(Char::REPLACEMENT)
7-
reader.current_char_width.should eq(1)
8-
reader.error.should eq(bytes[0])
6+
reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
7+
reader.current_char_width.should eq(1), file: file, line: line
8+
reader.error.should eq(bytes[0]), file: file, line: line
9+
end
10+
11+
private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__)
12+
str = String.new bytes
13+
reader = Char::Reader.new(str, pos: bytes.size)
14+
reader.previous_char
15+
reader.current_char.should eq(str[0]), file: file, line: line
16+
reader.current_char_width.should eq(bytes.size), file: file, line: line
17+
reader.pos.should eq(0), file: file, line: line
18+
reader.error.should be_nil, file: file, line: line
19+
end
20+
21+
private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__)
22+
str = String.new bytes
23+
reader = Char::Reader.new(str, pos: bytes.size)
24+
reader.previous_char
25+
reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
26+
reader.current_char_width.should eq(1), file: file, line: line
27+
reader.pos.should eq(bytes.size - 1), file: file, line: line
28+
reader.error.should eq(bytes[-1]), file: file, line: line
929
end
1030

1131
describe "Char::Reader" do
@@ -242,4 +262,112 @@ describe "Char::Reader" do
242262
it "errors if fourth_byte is out of bounds" do
243263
assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0]
244264
end
265+
266+
describe "#previous_char" do
267+
it "reads on valid UTF-8" do
268+
assert_reads_at_end Bytes[0x00]
269+
assert_reads_at_end Bytes[0x7f]
270+
271+
assert_reads_at_end Bytes[0xc2, 0x80]
272+
assert_reads_at_end Bytes[0xc2, 0xbf]
273+
assert_reads_at_end Bytes[0xdf, 0x80]
274+
assert_reads_at_end Bytes[0xdf, 0xbf]
275+
276+
assert_reads_at_end Bytes[0xe1, 0x80, 0x80]
277+
assert_reads_at_end Bytes[0xe1, 0x80, 0xbf]
278+
assert_reads_at_end Bytes[0xe1, 0x9f, 0x80]
279+
assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf]
280+
assert_reads_at_end Bytes[0xed, 0x80, 0x80]
281+
assert_reads_at_end Bytes[0xed, 0x80, 0xbf]
282+
assert_reads_at_end Bytes[0xed, 0x9f, 0x80]
283+
assert_reads_at_end Bytes[0xed, 0x9f, 0xbf]
284+
assert_reads_at_end Bytes[0xef, 0x80, 0x80]
285+
assert_reads_at_end Bytes[0xef, 0x80, 0xbf]
286+
assert_reads_at_end Bytes[0xef, 0x9f, 0x80]
287+
assert_reads_at_end Bytes[0xef, 0x9f, 0xbf]
288+
289+
assert_reads_at_end Bytes[0xe0, 0xa0, 0x80]
290+
assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf]
291+
assert_reads_at_end Bytes[0xe0, 0xbf, 0x80]
292+
assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf]
293+
assert_reads_at_end Bytes[0xe1, 0xa0, 0x80]
294+
assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf]
295+
assert_reads_at_end Bytes[0xe1, 0xbf, 0x80]
296+
assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf]
297+
assert_reads_at_end Bytes[0xef, 0xa0, 0x80]
298+
assert_reads_at_end Bytes[0xef, 0xa0, 0xbf]
299+
assert_reads_at_end Bytes[0xef, 0xbf, 0x80]
300+
assert_reads_at_end Bytes[0xef, 0xbf, 0xbf]
301+
302+
assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80]
303+
assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80]
304+
assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80]
305+
assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80]
306+
307+
assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80]
308+
assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80]
309+
assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80]
310+
assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80]
311+
end
312+
313+
it "errors on invalid UTF-8" do
314+
assert_invalid_byte_sequence_at_end Bytes[0x80]
315+
assert_invalid_byte_sequence_at_end Bytes[0xbf]
316+
assert_invalid_byte_sequence_at_end Bytes[0xc0]
317+
assert_invalid_byte_sequence_at_end Bytes[0xff]
318+
319+
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80]
320+
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80]
321+
assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80]
322+
assert_invalid_byte_sequence_at_end Bytes[0x9f, 0x80]
323+
assert_invalid_byte_sequence_at_end Bytes[0xa0, 0x80]
324+
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80]
325+
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80]
326+
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80]
327+
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80]
328+
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80]
329+
330+
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80]
331+
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80, 0x80]
332+
assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80, 0x80]
333+
assert_invalid_byte_sequence_at_end Bytes[0x8f, 0x80, 0x80]
334+
assert_invalid_byte_sequence_at_end Bytes[0x90, 0x80, 0x80]
335+
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80, 0x80]
336+
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80, 0x80]
337+
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80, 0x80]
338+
assert_invalid_byte_sequence_at_end Bytes[0xc2, 0x80, 0x80]
339+
assert_invalid_byte_sequence_at_end Bytes[0xdf, 0x80, 0x80]
340+
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80, 0x80]
341+
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x9f, 0xbf]
342+
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80]
343+
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80]
344+
345+
assert_invalid_byte_sequence_at_end Bytes[0x00, 0xa0, 0x80]
346+
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0xa0, 0x80]
347+
assert_invalid_byte_sequence_at_end Bytes[0x80, 0xa0, 0x80]
348+
assert_invalid_byte_sequence_at_end Bytes[0x8f, 0xa0, 0x80]
349+
assert_invalid_byte_sequence_at_end Bytes[0x90, 0xa0, 0x80]
350+
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0xa0, 0x80]
351+
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0xa0, 0x80]
352+
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0xa0, 0x80]
353+
assert_invalid_byte_sequence_at_end Bytes[0xc2, 0xa0, 0x80]
354+
assert_invalid_byte_sequence_at_end Bytes[0xdf, 0xa0, 0x80]
355+
assert_invalid_byte_sequence_at_end Bytes[0xed, 0xa0, 0x80]
356+
assert_invalid_byte_sequence_at_end Bytes[0xed, 0xbf, 0xbf]
357+
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0xa0, 0x80]
358+
assert_invalid_byte_sequence_at_end Bytes[0xff, 0xa0, 0x80]
359+
360+
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80, 0x80]
361+
assert_invalid_byte_sequence_at_end Bytes[0xef, 0x80, 0x80, 0x80]
362+
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80, 0x80]
363+
assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x80, 0x80, 0x80]
364+
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80, 0x80]
365+
366+
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x90, 0x80, 0x80]
367+
assert_invalid_byte_sequence_at_end Bytes[0xef, 0x90, 0x80, 0x80]
368+
assert_invalid_byte_sequence_at_end Bytes[0xf4, 0x90, 0x80, 0x80]
369+
assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x90, 0x80, 0x80]
370+
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x90, 0x80, 0x80]
371+
end
372+
end
245373
end

src/char/reader.cr

Lines changed: 84 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -331,7 +331,7 @@ struct Char
331331
end
332332

333333
private macro invalid_byte_sequence
334-
return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8
334+
return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8!
335335
end
336336

337337
@[AlwaysInline]
@@ -343,15 +343,92 @@ struct Char
343343
end
344344
end
345345

346-
private def decode_previous_char
347-
return if @pos == 0
346+
# The reverse UTF-8 DFA transition table for reference: (contrast with
347+
# `Unicode::UTF8_ENCODING_DFA`)
348+
#
349+
# accepted (initial state)
350+
# | 1 continuation byte
351+
# | | 2 continuation bytes; disallow overlong encodings up to U+07FF
352+
# | | | 2 continuation bytes; disallow surrogate pairs
353+
# | | | | 3 continuation bytes; disallow overlong encodings up to U+FFFF
354+
# | | | | | 3 continuation bytes; disallow codepoints above U+10FFFF
355+
# v v v v v v
356+
#
357+
# | 0 2 3 4 5 6
358+
# -----------+------------
359+
# 0x00..0x7F | 0 _ _ _ _ _
360+
# 0x80..0x8F | 2 3 5 5 _ _
361+
# 0x90..0x9F | 2 3 6 6 _ _
362+
# 0xA0..0xBF | 2 4 6 6 _ _
363+
# 0xC2..0xDF | _ 0 _ _ _ _
364+
# 0xE0..0xE0 | _ _ _ 0 _ _
365+
# 0xE1..0xEC | _ _ 0 0 _ _
366+
# 0xED..0xED | _ _ 0 _ _ _
367+
# 0xEE..0xEF | _ _ 0 0 _ _
368+
# 0xF0..0xF0 | _ _ _ _ _ 0
369+
# 0xF1..0xF3 | _ _ _ _ 0 0
370+
# 0xF4..0xF4 | _ _ _ _ 0 _
371+
private def decode_char_before(pos, & : UInt32, Int32, UInt8? ->)
372+
fourth = byte_at(pos - 1)
373+
if fourth <= 0x7f
374+
return yield fourth, 1, nil
375+
end
348376

349-
while @pos > 0
350-
@pos -= 1
351-
break if (byte_at(@pos) & 0xC0) != 0x80
377+
if fourth > 0xbf || pos < 2
378+
invalid_byte_sequence_before
352379
end
353-
decode_char_at(@pos) do |code_point, width, error|
380+
381+
third = byte_at(pos - 2)
382+
if 0xc2 <= third <= 0xdf
383+
return yield (third << 6) &+ (fourth &- 0x3080), 2, nil
384+
end
385+
386+
if (third & 0xc0) != 0x80 || pos < 3
387+
invalid_byte_sequence_before
388+
end
389+
390+
second = byte_at(pos - 3)
391+
if second & 0xf0 == 0xe0
392+
if second == 0xe0 && third <= 0x9f
393+
invalid_byte_sequence_before
394+
end
395+
396+
if second == 0xed && third >= 0xa0
397+
invalid_byte_sequence_before
398+
end
399+
400+
return yield (second << 12) &+ (third << 6) &+ (fourth &- 0xE2080), 3, nil
401+
end
402+
403+
if (second & 0xc0) != 0x80 || pos < 4
404+
invalid_byte_sequence_before
405+
end
406+
407+
first = byte_at(pos - 4)
408+
if second <= 0x8f
409+
unless 0xf1 <= first <= 0xf4
410+
invalid_byte_sequence_before
411+
end
412+
else
413+
unless 0xf0 <= first <= 0xf3
414+
invalid_byte_sequence_before
415+
end
416+
end
417+
418+
return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil
419+
end
420+
421+
private macro invalid_byte_sequence_before
422+
return yield Char::REPLACEMENT.ord.to_u32!, 1, fourth.to_u8!
423+
end
424+
425+
@[AlwaysInline]
426+
private def decode_previous_char
427+
return nil if @pos == 0
428+
429+
decode_char_before(@pos) do |code_point, width, error|
354430
@current_char_width = width
431+
@pos -= width
355432
@error = error
356433
@current_char = code_point.unsafe_chr
357434
end

0 commit comments

Comments
 (0)