Display multi-byte unicode characters in lexer errors correctly

ahoppen · ahoppen · commit 14e23d51f825 · 2023-01-27T19:08:46.000+01:00
diff --git a/Sources/SwiftParser/Lexer/Cursor.swift b/Sources/SwiftParser/Lexer/Cursor.swift
@@ -674,62 +674,7 @@ extension Lexer.Cursor {
   ///    that case bytes are consumed until we reach the next start of a UTF-8
   ///    character.
   mutating func advanceValidatingUTF8Character() -> Unicode.Scalar? {
-    guard let curByte = self.advance() else {
-      return nil
-    }
-
-    if (curByte < 0x80) {
-      return Unicode.Scalar(curByte)
-    }
-
-    // Read the number of high bits set, which indicates the number of bytes in
-    // the character.
-    let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
-
-    // If this is 0b10XXXXXX, then it is a continuation character.
-    if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
-      // Skip until we get the start of another character.  This is guaranteed to
-      // at least stop at the nul at the end of the buffer.
-      self.advance(while: { !$0.isStartOfUTF8Character })
-      return nil
-    }
-
-    // Drop the high bits indicating the # bytes of the result.
-    var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
-
-    // Read and validate the continuation bytes.
-    for _ in 1..<encodedBytes {
-      guard let curByte = self.peek() else {
-        return nil
-      }
-      // If the high bit isn't set or the second bit isn't clear, then this is not
-      // a continuation byte!
-      if (curByte < 0x80 || curByte >= 0xC0) {
-        return nil
-      }
-
-      // Accumulate our result.
-      charValue <<= 6
-      charValue |= UInt32(curByte & 0x3F)
-      _ = self.advance()
-    }
-
-    // UTF-16 surrogate pair values are not valid code points.
-    if (charValue >= 0xD800 && charValue <= 0xDFFF) {
-      return nil
-    }
-
-    // If we got here, we read the appropriate number of accumulated bytes.
-    // Verify that the encoding was actually minimal.
-    // Number of bits in the value, ignoring leading zeros.
-    let numBits = 32 - charValue.leadingZeroBitCount
-    if numBits <= 5 + 6 {
-      return encodedBytes == 2 ? Unicode.Scalar(charValue) : nil
-    }
-    if numBits <= 4 + 6 + 6 {
-      return encodedBytes == 3 ? Unicode.Scalar(charValue) : nil
-    }
-    return encodedBytes == 4 ? Unicode.Scalar(charValue) : nil
+    return Unicode.Scalar.lexing(advance: { self.advance() }, peek: { self.peek(at: 0) })
   }
 
   /// Rever the lexer by `offset` bytes. This should only be used by `resetForSplit`.
diff --git a/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift b/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift
@@ -157,3 +157,91 @@ extension Unicode.Scalar {
     return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
   }
 }
+
+extension Unicode.Scalar {
+  /// Lexes a single unicode scalar, which might consist of multiple UTF-8 bytes.
+  ///
+  /// Returns `nil` if `advance` yields no byte or if the bytes don't form a
+  /// valid, minimally encoded UTF-8 scalar. After a stray continuation byte or
+  /// invalid start byte, all following non-start bytes are consumed as well.
+  ///
+  /// - Parameters:
+  ///   - advance: Returns the current byte and advances the lexer by one byte.
+  ///   - peek: Returns the current byte without advancing the lexer.
+  @inline(__always)
+  static func lexing(advance: () -> UInt8?, peek: () -> UInt8?) -> Self? {
+    guard let curByte = advance() else {
+      return nil
+    }
+
+    if (curByte < 0x80) {
+      return Unicode.Scalar(curByte)
+    }
+
+    // Read the number of high bits set, which indicates the number of bytes in
+    // the character.
+    let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
+
+    // If this is 0b10XXXXXX, then it is a continuation character.
+    if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
+      // Skip until we get the start of another character, i.e. only consume
+      // bytes that can *not* start one. A `nil` from `peek` also ends the loop.
+      while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
+        _ = advance()
+      }
+      return nil
+    }
+
+    // Drop the high bits indicating the # bytes of the result.
+    var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
+
+    // Read and validate the continuation bytes.
+    for _ in 1..<encodedBytes {
+      guard let curByte = peek() else {
+        return nil
+      }
+      // If the high bit isn't set or the second bit isn't clear, then this is not
+      // a continuation byte!
+      if (curByte < 0x80 || curByte >= 0xC0) {
+        return nil
+      }
+
+      // Accumulate our result.
+      charValue <<= 6
+      charValue |= UInt32(curByte & 0x3F)
+      _ = advance()
+    }
+
+    // UTF-16 surrogate pair values are not valid code points.
+    if (charValue >= 0xD800 && charValue <= 0xDFFF) {
+      return nil
+    }
+
+    // We read the expected number of continuation bytes. Verify that the encoding
+    // was actually minimal, based on the value's bit count without leading zeros.
+    let numBits = 32 - charValue.leadingZeroBitCount
+    if numBits <= 5 + 6 {
+      return encodedBytes == 2 ? Unicode.Scalar(charValue) : nil
+    }
+    if numBits <= 4 + 6 + 6 {
+      return encodedBytes == 3 ? Unicode.Scalar(charValue) : nil
+    }
+    return encodedBytes == 4 ? Unicode.Scalar(charValue) : nil
+  }
+
+  /// Returns the first unicode scalar in `byteSequence`, which may span multiple
+  /// bytes, or `nil` if `byteSequence` is empty or doesn't start with a valid
+  /// UTF-8 encoded scalar.
+  public static func lexing<S: Collection>(from byteSequence: S) -> Self? where S.Element == UInt8 {
+    var index = byteSequence.startIndex
+    let peek = { () -> UInt8? in index < byteSequence.endIndex ? byteSequence[index] : nil }
+    let advance = { () -> UInt8? in
+      // Only step forward if there is a byte left; `index` never passes `endIndex`.
+      guard let byte = peek() else { return nil }
+      index = byteSequence.index(after: index)
+      return byte
+    }
+
+    return self.lexing(advance: advance, peek: peek)
+  }
+}
diff --git a/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift b/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
@@ -98,9 +98,11 @@ public extension SwiftSyntax.LexerError {
   /// `tokenText` is the entire text of the token in which the `LexerError`
   /// occurred, including trivia.
   @_spi(RawSyntax)
-  func diagnostic(wholeText: SyntaxText) -> DiagnosticMessage {
+  func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
     var scalarAtErrorOffset: UnicodeScalar {
-      Unicode.Scalar(wholeText[Int(self.byteOffset)])
+      // Fall back to the Unicode replacement character U+FFFD in case we can't
+      // lex the unicode character at `byteOffset`. It's the best we can do
+      Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("�")
     }
 
     switch self.kind {
@@ -130,8 +132,6 @@ public extension SwiftSyntax.LexerError {
   }
 
   func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {
-    return token.tokenView.wholeText { wholeText in
-      return self.diagnostic(wholeText: token.tokenView.rawText)
-    }
+    return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)
   }
 }
diff --git a/Sources/SwiftSyntax/Raw/RawSyntaxTokenView.swift b/Sources/SwiftSyntax/Raw/RawSyntaxTokenView.swift
@@ -65,29 +65,6 @@ public struct RawSyntaxTokenView {
     }
   }
 
-  @_spi(RawSyntax)
-  public func wholeText<T>(_ body: (SyntaxText) -> T) -> T {
-    switch raw.rawData.payload {
-    case .parsedToken(let dat):
-      return body(dat.wholeText)
-    case .materializedToken(let dat):
-      var wholeText: [UInt8] = []
-      wholeText.reserveCapacity(leadingTriviaByteLength + textByteLength + trailingTriviaByteLength)
-      for leadingTriviaPiece in dat.leadingTrivia {
-        leadingTriviaPiece.withSyntaxText { wholeText.append(contentsOf: $0) }
-      }
-      wholeText.append(contentsOf: self.rawText)
-      for trailingTriviaPiece in dat.trailingTrivia {
-        trailingTriviaPiece.withSyntaxText { wholeText.append(contentsOf: $0) }
-      }
-      return wholeText.withUnsafeBufferPointer { buffer in
-        return body(SyntaxText(buffer: buffer))
-      }
-    case .layout(_):
-      preconditionFailure("'wholeText' is not available for non-token node")
-    }
-  }
-
   /// The UTF-8 byte length of the leading trivia.
   @_spi(RawSyntax)
   public var leadingTriviaByteLength: Int {
diff --git a/Tests/SwiftParserTest/Assertions.swift b/Tests/SwiftParserTest/Assertions.swift
@@ -143,7 +143,7 @@ private func AssertTokens(
       )
     case (let actualError?, let expectedError?):
       AssertStringsEqualWithDiff(
-        actualError.diagnostic(wholeText: actualLexeme.wholeText).message,
+        actualError.diagnostic(wholeTextBytes: Array(actualLexeme.wholeText)).message,
         expectedError,
         file: expectedLexeme.file,
         line: expectedLexeme.line
diff --git a/Tests/SwiftParserTest/LexerTests.swift b/Tests/SwiftParserTest/LexerTests.swift
@@ -718,6 +718,15 @@ public class LexerTests: XCTestCase {
     )
   }
 
+  /// Checks that the diagnostic for an invalid digit names the whole multi-byte
+  /// character (an emoji here), not a scalar built from only one byte of its
+  /// UTF-8 encoding.
+  func testInvalidCharacterSpanningMultipleBytes() {
+    // NOTE(review): `1️⃣` presumably marks the expected error position — confirm.
+    let expected = LexemeSpec(.integerLiteral, text: "12😡", error: "'😡' is not a valid digit in integer literal")
+    AssertLexemes("121️⃣😡", lexemes: [expected])
+  }
+
   func testBadNumericLiteralDigits() {
     AssertLexemes(
       "01️⃣a1234567",

Original file line number	Diff line number	Diff line change
`@@ -98,9 +98,11 @@ public extension SwiftSyntax.LexerError {`
`98`	`98`	/// `tokenText` is the entire text of the token in which the `LexerError`
`99`	`99`	`/// occurred, including trivia.`
`100`	`100`	`@_spi(RawSyntax)`
`101`		`- func diagnostic(wholeText: SyntaxText) -> DiagnosticMessage {`
	`101`	`+ func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {`
`102`	`102`	`var scalarAtErrorOffset: UnicodeScalar {`
`103`		`- Unicode.Scalar(wholeText[Int(self.byteOffset)])`
	`103`	`+ // Fall back to the Unicode replacement character U+FFFD in case we can't`
	`104`	+ // lex the unicode character at `byteOffset`. It's the best we can do
	`105`	`+ Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("�")`
`104`	`106`	`}`
`105`	`107`
`106`	`108`	`switch self.kind {`
`@@ -130,8 +132,6 @@ public extension SwiftSyntax.LexerError {`
`130`	`132`	`}`
`131`	`133`
`132`	`134`	`func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {`
`133`		`- return token.tokenView.wholeText { wholeText in`
`134`		`- return self.diagnostic(wholeText: token.tokenView.rawText)`
`135`		`- }`
	`135`	`+ return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)`
`136`	`136`	`}`
`137`	`137`	`}`
Original file line number	Diff line number	Diff line change
`@@ -143,7 +143,7 @@ private func AssertTokens(`
`143`	`143`	`)`
`144`	`144`	`case (let actualError?, let expectedError?):`
`145`	`145`	`AssertStringsEqualWithDiff(`
`146`		`- actualError.diagnostic(wholeText: actualLexeme.wholeText).message,`
	`146`	`+ actualError.diagnostic(wholeTextBytes: Array(actualLexeme.wholeText)).message,`
`147`	`147`	`expectedError,`
`148`	`148`	`file: expectedLexeme.file,`
`149`	`149`	`line: expectedLexeme.line`