Commit 5d455f4

Fix a bug that caused the lexer’s cursor to end up in an incorrect state after resetForSplit
1 parent 6143e9d · commit 5d455f4

5 files changed: +25 −56 lines changed


Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 0 additions & 10 deletions
@@ -695,16 +695,6 @@ extension Lexer.Cursor {
   mutating func advanceValidatingUTF8Character() -> Unicode.Scalar? {
     return Unicode.Scalar.lexing(advance: { self.advance() }, peek: { self.peek(at: 0) })
   }
-
-  /// Rever the lexer by `offset` bytes. This should only be used by `resetForSplit`.
-  /// This must not back up by more bytes than the last token because that would
-  /// require us to also update `previousTokenKind`, which we don't do in this
-  /// function
-  mutating func backUp(by offset: Int) {
-    assert(!self.isAtStartOfFile)
-    self.previous = self.input.baseAddress!.advanced(by: -(offset + 1)).pointee
-    self.input = UnsafeBufferPointer(start: self.input.baseAddress!.advanced(by: -offset), count: self.input.count + offset)
-  }
 }
 
 // MARK: - Boundness of operators

Sources/SwiftParser/Lexer/LexemeSequence.swift

Lines changed: 6 additions & 10 deletions
@@ -61,18 +61,14 @@ extension Lexer {
       return self.nextToken
     }
 
+    /// Reset the lexeme sequence to the state we were in when lexing `splitToken`
+    /// but after we consumed `consumedPrefix` bytes from `splitToken`.
     /// - Warning: Do not add more usages of this function.
-    mutating func resetForSplit(of bytes: Int) -> Lexer.Lexeme {
-      guard bytes > 0 else {
-        return self.advance()
+    mutating func resetForSplit(splitToken: Lexeme, consumedPrefix: Int) -> Lexer.Lexeme {
+      self.cursor = splitToken.cursor
+      for _ in 0..<consumedPrefix {
+        _ = self.cursor.advance()
       }
-
-      // FIXME: This is kind of ridiculous. We shouldn't have to look backwards
-      // in the token stream. We should be fusing together runs of operator and
-      // identifier characters in the parser, not splitting and backing up
-      // again in the lexer.
-      let backUpLength = self.nextToken.byteLength + bytes
-      self.cursor.backUp(by: backUpLength)
       self.nextToken = self.cursor.nextToken(sourceBufferStart: self.sourceBufferStart, stateAllocator: lexerStateAllocator)
       return self.advance()
     }
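
At its core, the new implementation swaps a backwards adjustment for a snapshot-and-re-advance: rather than rewinding an already-advanced cursor by a computed byte count (the removed `backUp(by:)` path, which had to patch auxiliary state such as `previous` by hand), it restores the cursor saved on `splitToken` and walks forward over the consumed prefix, so all cursor state is re-derived along the normal lexing path. A minimal sketch of that idea on a toy cursor — `ToyCursor` and its fields are illustrative assumptions, not SwiftParser's real `Lexer.Cursor`:

// Toy model: a cursor is a byte offset into the input plus the byte just
// before that offset, which the lexer consults when lexing the next token.
struct ToyCursor {
  var bytes: [UInt8]
  var offset: Int = 0
  var previous: UInt8 = 0

  // Consume one byte; `previous` always tracks the byte behind `offset`.
  mutating func advance() -> UInt8? {
    guard offset < bytes.count else { return nil }
    let byte = bytes[offset]
    offset += 1
    previous = byte
    return byte
  }
}

// Restore the snapshot taken when the split token was lexed and walk forward
// over the bytes the parser already consumed. Nothing is computed backwards,
// so `offset` and `previous` cannot drift out of sync.
func resetForSplit(snapshot: ToyCursor, consumedPrefix: Int) -> ToyCursor {
  var cursor = snapshot
  for _ in 0..<consumedPrefix {
    _ = cursor.advance()
  }
  return cursor
}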

Sources/SwiftParser/Lookahead.swift

Lines changed: 4 additions & 6 deletions
@@ -122,12 +122,10 @@ extension Parser.Lookahead {
     }
     assert(tokenText.hasPrefix(prefix))
 
-    // See also: Parser.consumePrefix(_:as:)
-    let offset =
-      (self.currentToken.trailingTriviaByteLength
-        + tokenText.count
-        - prefix.count)
-    self.currentToken = self.lexemes.resetForSplit(of: offset)
+    self.currentToken = self.lexemes.resetForSplit(
+      splitToken: self.currentToken,
+      consumedPrefix: self.currentToken.leadingTriviaByteLength + prefix.count
+    )
   }
 }
 

Sources/SwiftParser/Parser.swift

Lines changed: 4 additions & 30 deletions
@@ -555,36 +555,10 @@ extension Parser {
 
     self.adjustNestingLevel(for: tokenKind)
 
-    // ... or a multi-character token with the first N characters being the one
-    // that we want to consume as a separate token.
-    // Careful: We need to reset the lexer to a point just before it saw the
-    // current token, plus the split point. That means we need to take trailing
-    // trivia into account for the current token, but we only need to take the
-    // number of UTF-8 bytes of the text of the split - no trivia necessary.
-    //
-    // <TOKEN<trailing trivia>> <NEXT TOKEN> ... -> <T> <OKEN<trailing trivia>> <NEXT TOKEN>
-    //
-    // The current calculation is:
-    //
-    // <<leading trivia>TOKEN<trailing trivia>>
-    // CURSOR ^
-    // + trailing trivia length
-    //
-    // <<leading trivia>TOKEN<trailing trivia>>
-    // CURSOR ^
-    // + content length
-    //
-    // <<leading trivia>TOKEN<trailing trivia>>
-    // CURSOR ^
-    // - split point length
-    //
-    // <<leading trivia>TOKEN<trailing trivia>>
-    // CURSOR ^
-    let offset =
-      (self.currentToken.trailingTriviaByteLength
-        + tokenText.count
-        - prefix.count)
-    self.currentToken = self.lexemes.resetForSplit(of: offset)
+    self.currentToken = self.lexemes.resetForSplit(
+      splitToken: self.currentToken,
+      consumedPrefix: self.currentToken.leadingTriviaByteLength + prefix.count
+    )
     return tok
   }
 }
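
Both call sites (Lookahead above and Parser here) now pass the same `consumedPrefix`, counted forward from the start of the split lexeme instead of backwards from the position after it. A worked example with made-up byte counts (the concrete lexeme layout below is an assumption for illustration, not taken from the commit): splitting the prefix `?` off a lexeme made of one byte of leading trivia, the text `?.`, and one byte of trailing trivia.

// Hypothetical lexeme layout, in bytes: <space> ? . <space>
let leadingTriviaByteLength = 1   // "<space>"
let tokenTextByteLength = 2       // "?."
let trailingTriviaByteLength = 1  // "<space>"
let prefixByteLength = 1          // "?"

// Old calculation: how far the lexer had to back up, measured from the end of
// the lexeme (it additionally rewound over the already-lexed next token).
let oldOffset = trailingTriviaByteLength + tokenTextByteLength - prefixByteLength  // 2

// New calculation: how far to advance from the start of the restored lexeme
// (leading trivia plus the split-off prefix).
let consumedPrefix = leadingTriviaByteLength + prefixByteLength  // 2

// Both land on the byte right after the "?", two bytes into the lexeme:
// one measured from the back, the other from the front.
let lexemeByteLength = leadingTriviaByteLength + tokenTextByteLength + trailingTriviaByteLength
assert(consumedPrefix == lexemeByteLength - oldOffset)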

Tests/SwiftParserTest/ExpressionTests.swift

Lines changed: 11 additions & 0 deletions
@@ -1666,4 +1666,15 @@ final class StatementExpressionTests: XCTestCase {
       ]
     )
   }
+
+  func testStringLiteralAfterKeyPath() {
+    AssertParse(
+      #"""
+      \String.?1️⃣""
+      """#,
+      diagnostics: [
+        DiagnosticSpec(message: "consecutive statements on a line must be separated by ';'")
+      ]
+    )
+  }
 }
