Skip to content

Commit 9b2f29f

Browse files
committed
[SwiftParser] Correct the range for the start byte of a UTF8 character
1 parent a7fa220 commit 9b2f29f

File tree

1 file changed

+20
-13
lines changed

1 file changed

+20
-13
lines changed

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -156,12 +156,6 @@ extension Unicode.Scalar {
156156
// including and above the DEL character U+7F.
157157
return self.value >= 0x20 && self.value < 0x7F
158158
}
159-
160-
var isStartOfUTF8Character: Bool {
161-
// RFC 2279: The octet values FE and FF never appear.
162-
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
163-
return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
164-
}
165159
}
166160

167161
extension Unicode.Scalar {
@@ -179,20 +173,25 @@ extension Unicode.Scalar {
179173
return Unicode.Scalar(curByte)
180174
}
181175

182-
// Read the number of high bits set, which indicates the number of bytes in
183-
// the character.
184-
let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
185-
186-
// If this is 0b10XXXXXX, then it is a continuation character.
187-
if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
176+
// If this is not the start of a UTF8 character,
177+
// then it is either a continuation byte or a byte that never appears in valid UTF-8.
178+
if !curByte.isStartOfUTF8Character {
188179
// Skip until we get the start of another character. This is guaranteed to
189180
// at least stop at the nul at the end of the buffer.
190-
while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
181+
while let peeked = peek(), !peeked.isStartOfUTF8Character {
191182
_ = advance()
192183
}
193184
return nil
194185
}
195186

187+
// Read the number of high bits set, which indicates the number of bytes in
188+
// the character.
189+
let encodedBytes = (~curByte).leadingZeroBitCount
190+
// We have a multi-byte UTF-8 scalar.
191+
// Single-byte UTF-8 scalars are handled at the start of the function by checking `curByte < 0x80`.
192+
// `isStartOfUTF8Character` guarantees that `curByte` has 2 to 4 leading ones.
193+
precondition(encodedBytes >= 2 && encodedBytes <= 4)
194+
196195
// Drop the high bits indicating the # bytes of the result.
197196
var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
198197

@@ -252,3 +251,11 @@ extension Unicode.Scalar {
252251
return self.lexing(advance: advance, peek: peek)
253252
}
254253
}
254+
255+
extension UInt8 {
256+
var isStartOfUTF8Character: Bool {
257+
// RFC 2279: The octet values FE and FF never appear.
258+
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
259+
return self < 0x80 || (self >= 0xC2 && self < 0xF5)
260+
}
261+
}

0 commit comments

Comments
 (0)