@@ -483,11 +483,84 @@ func (l *Lexer) NextToken() Token {
483483 return tok
484484}
485485
486+ // isWhitespace checks if the current position contains whitespace.
487+ // T-SQL treats many control characters and Unicode spaces as whitespace.
488+ func (l * Lexer ) isWhitespace () bool {
489+ if l .ch == 0 {
490+ return false
491+ }
492+ // ASCII whitespace and control characters (0x01-0x20 range, excluding 0x00)
493+ // T-SQL treats most ASCII control characters as whitespace
494+ if l .ch <= 0x20 {
495+ return true
496+ }
497+ // Check for multi-byte UTF-8 whitespace sequences
498+ if l .ch >= 0x80 {
499+ // Try to decode rune at current position
500+ r , _ := l .peekRune ()
501+ // unicode.IsSpace covers most whitespace, but T-SQL also treats
502+ // Zero Width Space (U+200B) as whitespace
503+ return unicode .IsSpace (r ) || r == 0x200B
504+ }
505+ return false
506+ }
507+
508+ // peekRune returns the rune at the current position without advancing.
509+ func (l * Lexer ) peekRune () (rune , int ) {
510+ if l .pos >= len (l .input ) {
511+ return 0 , 0
512+ }
513+ // Fast path for ASCII
514+ if l .input [l .pos ] < 0x80 {
515+ return rune (l .input [l .pos ]), 1
516+ }
517+ // Decode UTF-8
518+ r , size := decodeRuneAt (l .input , l .pos )
519+ return r , size
520+ }
521+
// decodeRuneAt decodes the UTF-8 encoded rune that starts at byte offset pos
// in s and returns it together with the number of bytes it occupies.
//
// It returns (0, 0) when pos is at or past the end of s. When the bytes at
// pos do not form a well-formed UTF-8 sequence (stray continuation byte,
// invalid lead byte, truncated sequence, bad continuation byte, overlong
// encoding, surrogate, or a value above U+10FFFF) it returns (U+FFFD, 1), so
// callers advance by exactly one byte and never mistake malformed input for a
// meaningful character.
func decodeRuneAt(s string, pos int) (rune, int) {
	// Replacement character; same value as utf8.RuneError.
	const invalid = '\uFFFD'

	if pos >= len(s) {
		return 0, 0
	}
	lead := s[pos]
	if lead < 0x80 {
		return rune(lead), 1
	}

	// Derive the sequence length, the payload bits of the lead byte and the
	// smallest code point that legitimately needs that many bytes.
	var (
		size   int
		r      rune
		lowest rune
	)
	switch {
	case lead&0xE0 == 0xC0:
		size, r, lowest = 2, rune(lead&0x1F), 0x80
	case lead&0xF0 == 0xE0:
		size, r, lowest = 3, rune(lead&0x0F), 0x800
	case lead&0xF8 == 0xF0:
		size, r, lowest = 4, rune(lead&0x07), 0x10000
	default:
		// Continuation byte without a lead byte, or a byte that can never
		// start a sequence (0xF8-0xFF).
		return invalid, 1
	}

	// Truncated sequence at the end of the input.
	if pos+size > len(s) {
		return invalid, 1
	}

	// Every trailing byte must look like 10xxxxxx; otherwise following bytes
	// (for example plain ASCII) would be absorbed into this rune.
	for i := 1; i < size; i++ {
		c := s[pos+i]
		if c&0xC0 != 0x80 {
			return invalid, 1
		}
		r = r<<6 | rune(c&0x3F)
	}

	// Reject overlong encodings, UTF-16 surrogates and out-of-range values.
	if r < lowest || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF) {
		return invalid, 1
	}
	return r, size
}
545+
546+ // skipWhitespaceChar advances past one whitespace character (which may be multi-byte).
547+ func (l * Lexer ) skipWhitespaceChar () {
548+ if l .ch < 0x80 {
549+ l .readChar ()
550+ return
551+ }
552+ // Multi-byte UTF-8: advance by rune size
553+ _ , size := l .peekRune ()
554+ for i := 0 ; i < size ; i ++ {
555+ l .readChar ()
556+ }
557+ }
558+
486559func (l * Lexer ) skipWhitespaceAndComments () {
487560 for {
488- // Skip whitespace
489- for l .ch != 0 && ( l . ch == ' ' || l . ch == '\t' || l . ch == '\n' || l . ch == '\r' ) {
490- l .readChar ()
561+ // Skip whitespace (including Unicode whitespace)
562+ for l .ch != 0 && l . isWhitespace ( ) {
563+ l .skipWhitespaceChar ()
491564 }
492565
493566 // Skip line comments (-- ...)
@@ -641,7 +714,8 @@ func isHexDigit(ch byte) bool {
641714}
642715
// isLetter reports whether ch is an ASCII letter (a-z or A-Z). Bytes of 0x80
// and above are deliberately rejected so that the lead byte of a multi-byte
// UTF-8 sequence is never classified as a letter.
func isLetter(ch byte) bool {
	// Setting bit 0x20 folds 'A'-'Z' onto 'a'-'z'; no other byte lands in
	// that range, so a single comparison covers both cases.
	lower := ch | 0x20
	return 'a' <= lower && lower <= 'z'
}
646720
647721func isDigit (ch byte ) bool {
0 commit comments