Skip to content

Commit 14e23d5

Browse files
committed
Display multi-byte unicode characters in lexer errors correctly
1 parent 2a7722b commit 14e23d5

File tree

6 files changed

+104
-85
lines changed

6 files changed

+104
-85
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 1 addition & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -674,62 +674,7 @@ extension Lexer.Cursor {
674674
/// that case bytes are consumed until we reach the next start of a UTF-8
675675
/// character.
676676
mutating func advanceValidatingUTF8Character() -> Unicode.Scalar? {
677-
guard let curByte = self.advance() else {
678-
return nil
679-
}
680-
681-
if (curByte < 0x80) {
682-
return Unicode.Scalar(curByte)
683-
}
684-
685-
// Read the number of high bits set, which indicates the number of bytes in
686-
// the character.
687-
let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
688-
689-
// If this is 0b10XXXXXX, then it is a continuation character.
690-
if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
691-
// Skip until we get the start of another character. This is guaranteed to
692-
// at least stop at the nul at the end of the buffer.
693-
self.advance(while: { !$0.isStartOfUTF8Character })
694-
return nil
695-
}
696-
697-
// Drop the high bits indicating the # bytes of the result.
698-
var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
699-
700-
// Read and validate the continuation bytes.
701-
for _ in 1..<encodedBytes {
702-
guard let curByte = self.peek() else {
703-
return nil
704-
}
705-
// If the high bit isn't set or the second bit isn't clear, then this is not
706-
// a continuation byte!
707-
if (curByte < 0x80 || curByte >= 0xC0) {
708-
return nil
709-
}
710-
711-
// Accumulate our result.
712-
charValue <<= 6
713-
charValue |= UInt32(curByte & 0x3F)
714-
_ = self.advance()
715-
}
716-
717-
// UTF-16 surrogate pair values are not valid code points.
718-
if (charValue >= 0xD800 && charValue <= 0xDFFF) {
719-
return nil
720-
}
721-
722-
// If we got here, we read the appropriate number of accumulated bytes.
723-
// Verify that the encoding was actually minimal.
724-
// Number of bits in the value, ignoring leading zeros.
725-
let numBits = 32 - charValue.leadingZeroBitCount
726-
if numBits <= 5 + 6 {
727-
return encodedBytes == 2 ? Unicode.Scalar(charValue) : nil
728-
}
729-
if numBits <= 4 + 6 + 6 {
730-
return encodedBytes == 3 ? Unicode.Scalar(charValue) : nil
731-
}
732-
return encodedBytes == 4 ? Unicode.Scalar(charValue) : nil
677+
return Unicode.Scalar.lexing(advance: { self.advance() }, peek: { self.peek(at: 0) })
733678
}
734679

735680
/// Revert the lexer by `offset` bytes. This should only be used by `resetForSplit`.

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,91 @@ extension Unicode.Scalar {
157157
return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
158158
}
159159
}
160+
161+
extension Unicode.Scalar {
  /// Lexes a single Unicode scalar, which might consist of multiple UTF-8
  /// bytes.
  ///
  /// - Parameters:
  ///   - advance: Returns the current byte in the lexer and advances the lexer
  ///     by one byte, or `nil` if no byte is available.
  ///   - peek: Returns the current byte in the lexer without advancing it, or
  ///     `nil` if no byte is available.
  /// - Returns: The decoded scalar, or `nil` if there is no input or the bytes
  ///   at the current position are not a valid, minimally encoded UTF-8
  ///   sequence. If the first byte cannot start a UTF-8 character, the bytes
  ///   following it that cannot start one either are consumed as well, so the
  ///   lexer resumes at the next possible character start.
  @inline(__always)
  static func lexing(advance: () -> UInt8?, peek: () -> UInt8?) -> Self? {
    guard let curByte = advance() else {
      return nil
    }

    // Single-byte (ASCII) characters need no further decoding.
    if curByte < 0x80 {
      return Unicode.Scalar(curByte)
    }

    // Read the number of high bits set, which indicates the number of bytes in
    // the character.
    let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount

    // If this is 0b10XXXXXX, then it is a continuation character.
    if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
      // Skip until we get the start of another character, i.e. consume bytes
      // for as long as they can NOT start a UTF-8 character. The loop also
      // stops when `peek` reports that no more input is available.
      while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
        _ = advance()
      }
      return nil
    }

    // Drop the high bits indicating the # bytes of the result. The left shift
    // happens on the 8-bit value, so the length-marker bits are shifted out
    // before widening to 32 bits.
    var charValue = UInt32(curByte << encodedBytes) >> encodedBytes

    // Read and validate the continuation bytes.
    for _ in 1..<encodedBytes {
      guard let continuationByte = peek() else {
        return nil
      }
      // If the high bit isn't set or the second bit isn't clear, then this is
      // not a continuation byte! Leave it unconsumed so that it can be lexed as
      // the start of the next character.
      if continuationByte < 0x80 || continuationByte >= 0xC0 {
        return nil
      }

      // Accumulate our result: every continuation byte carries 6 payload bits.
      charValue <<= 6
      charValue |= UInt32(continuationByte & 0x3F)
      _ = advance()
    }

    // UTF-16 surrogate pair values are not valid code points.
    if charValue >= 0xD800 && charValue <= 0xDFFF {
      return nil
    }

    // If we got here, we read the appropriate number of accumulated bytes.
    // Verify that the encoding was actually minimal.
    // Number of bits in the value, ignoring leading zeros.
    let numBits = 32 - charValue.leadingZeroBitCount
    if numBits <= 5 + 6 {
      return encodedBytes == 2 ? Unicode.Scalar(charValue) : nil
    }
    if numBits <= 4 + 6 + 6 {
      return encodedBytes == 3 ? Unicode.Scalar(charValue) : nil
    }
    return encodedBytes == 4 ? Unicode.Scalar(charValue) : nil
  }

  /// Returns the first Unicode scalar in `byteSequence`, which may span
  /// multiple bytes.
  ///
  /// - Parameter byteSequence: UTF-8 encoded bytes. Decoding starts at the
  ///   collection's `startIndex`.
  /// - Returns: The first scalar, or `nil` if `byteSequence` is empty or does
  ///   not start with a valid UTF-8 sequence.
  public static func lexing<S: Collection>(from byteSequence: S) -> Self? where S.Element == UInt8 {
    var index = byteSequence.startIndex
    let peek = { () -> UInt8? in
      index < byteSequence.endIndex ? byteSequence[index] : nil
    }
    let advance = { () -> UInt8? in
      // Only move forward when there is a byte to consume: advancing an index
      // past `endIndex` is a precondition violation for a general `Collection`.
      guard let byte = peek() else {
        return nil
      }
      index = byteSequence.index(after: index)
      return byte
    }

    return self.lexing(advance: advance, peek: peek)
  }
}

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,11 @@ public extension SwiftSyntax.LexerError {
9898
/// `tokenText` is the entire text of the token in which the `LexerError`
9999
/// occurred, including trivia.
100100
@_spi(RawSyntax)
101-
func diagnostic(wholeText: SyntaxText) -> DiagnosticMessage {
101+
func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
102102
var scalarAtErrorOffset: UnicodeScalar {
103-
Unicode.Scalar(wholeText[Int(self.byteOffset)])
103+
// Fall back to the Unicode replacement character U+FFFD in case we can't
104+
// lex the unicode character at `byteOffset`. It's the best we can do
105+
Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("\u{FFFD}")
104106
}
105107

106108
switch self.kind {
@@ -130,8 +132,6 @@ public extension SwiftSyntax.LexerError {
130132
}
131133

132134
func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {
133-
return token.tokenView.wholeText { wholeText in
134-
return self.diagnostic(wholeText: token.tokenView.rawText)
135-
}
135+
return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)
136136
}
137137
}

Sources/SwiftSyntax/Raw/RawSyntaxTokenView.swift

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -65,29 +65,6 @@ public struct RawSyntaxTokenView {
6565
}
6666
}
6767

68-
@_spi(RawSyntax)
69-
public func wholeText<T>(_ body: (SyntaxText) -> T) -> T {
70-
switch raw.rawData.payload {
71-
case .parsedToken(let dat):
72-
return body(dat.wholeText)
73-
case .materializedToken(let dat):
74-
var wholeText: [UInt8] = []
75-
wholeText.reserveCapacity(leadingTriviaByteLength + textByteLength + trailingTriviaByteLength)
76-
for leadingTriviaPiece in dat.leadingTrivia {
77-
leadingTriviaPiece.withSyntaxText { wholeText.append(contentsOf: $0) }
78-
}
79-
wholeText.append(contentsOf: self.rawText)
80-
for trailingTriviaPiece in dat.trailingTrivia {
81-
trailingTriviaPiece.withSyntaxText { wholeText.append(contentsOf: $0) }
82-
}
83-
return wholeText.withUnsafeBufferPointer { buffer in
84-
return body(SyntaxText(buffer: buffer))
85-
}
86-
case .layout(_):
87-
preconditionFailure("'wholeText' is not available for non-token node")
88-
}
89-
}
90-
9168
/// The UTF-8 byte length of the leading trivia.
9269
@_spi(RawSyntax)
9370
public var leadingTriviaByteLength: Int {

Tests/SwiftParserTest/Assertions.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ private func AssertTokens(
143143
)
144144
case (let actualError?, let expectedError?):
145145
AssertStringsEqualWithDiff(
146-
actualError.diagnostic(wholeText: actualLexeme.wholeText).message,
146+
actualError.diagnostic(wholeTextBytes: Array(actualLexeme.wholeText)).message,
147147
expectedError,
148148
file: expectedLexeme.file,
149149
line: expectedLexeme.line

Tests/SwiftParserTest/LexerTests.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,15 @@ public class LexerTests: XCTestCase {
718718
)
719719
}
720720

721+
/// Checks that a lexer error on a character spanning multiple UTF-8 bytes
/// reports the whole character: lexing `12😡` is expected to produce a single
/// integer-literal lexeme whose error message names '😡'.
/// NOTE(review): `1️⃣` in the input is presumably a location marker for the
/// expected error rather than part of the lexed text (it is absent from the
/// expected lexeme text) — confirm against `AssertLexemes`.
func testInvalidCharacterSpanningMultipleBytes() {
722+
AssertLexemes(
723+
"121️⃣😡",
724+
lexemes: [
725+
LexemeSpec(.integerLiteral, text: "12😡", error: "'😡' is not a valid digit in integer literal")
726+
]
727+
)
728+
}
729+
721730
func testBadNumericLiteralDigits() {
722731
AssertLexemes(
723732
"01️⃣a1234567",

0 commit comments

Comments
 (0)