Skip to content

Commit 94a3c62

Browse files
authored
Add Unicode and control character whitespace support in lexer (#47)
1 parent efd14ee commit 94a3c62

File tree

2 files changed

+79
-5
lines changed

2 files changed

+79
-5
lines changed

parser/lexer.go

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -483,11 +483,84 @@ func (l *Lexer) NextToken() Token {
483483
return tok
484484
}
485485

486+
// isWhitespace checks if the current position contains whitespace.
487+
// T-SQL treats many control characters and Unicode spaces as whitespace.
488+
func (l *Lexer) isWhitespace() bool {
489+
if l.ch == 0 {
490+
return false
491+
}
492+
// ASCII whitespace and control characters (0x01-0x20 range, excluding 0x00)
493+
// T-SQL treats most ASCII control characters as whitespace
494+
if l.ch <= 0x20 {
495+
return true
496+
}
497+
// Check for multi-byte UTF-8 whitespace sequences
498+
if l.ch >= 0x80 {
499+
// Try to decode rune at current position
500+
r, _ := l.peekRune()
501+
// unicode.IsSpace covers most whitespace, but T-SQL also treats
502+
// Zero Width Space (U+200B) as whitespace
503+
return unicode.IsSpace(r) || r == 0x200B
504+
}
505+
return false
506+
}
507+
508+
// peekRune returns the rune at the current position without advancing.
509+
func (l *Lexer) peekRune() (rune, int) {
510+
if l.pos >= len(l.input) {
511+
return 0, 0
512+
}
513+
// Fast path for ASCII
514+
if l.input[l.pos] < 0x80 {
515+
return rune(l.input[l.pos]), 1
516+
}
517+
// Decode UTF-8
518+
r, size := decodeRuneAt(l.input, l.pos)
519+
return r, size
520+
}
521+
522+
// decodeRuneAt decodes the UTF-8 encoded rune that starts at byte offset pos
// in s and returns the rune together with its encoded length in bytes.
//
// It returns (0, 0) when pos is at or past the end of s. ASCII bytes decode to
// themselves with length 1. A multi-byte sequence is accepted only when it is
// well formed: every trailing byte must be a continuation byte (10xxxxxx), and
// the decoded value must not be an overlong encoding, a surrogate half
// (U+D800-U+DFFF), or larger than U+10FFFF. Anything else (a truncated
// sequence, a stray continuation byte, an invalid lead byte, or a malformed
// sequence) yields the byte at pos converted to a rune with length 1, so
// callers always make progress one byte at a time over bad input and never
// consume bytes that belong to the following character.
func decodeRuneAt(s string, pos int) (rune, int) {
	if pos >= len(s) {
		return 0, 0
	}
	b := s[pos]
	if b < 0x80 {
		return rune(b), 1
	}

	// isCont reports whether index i is in range and holds a UTF-8
	// continuation byte.
	isCont := func(i int) bool {
		return i < len(s) && s[i]&0xC0 == 0x80
	}

	switch {
	case b&0xE0 == 0xC0: // 2-byte sequence: 110xxxxx 10xxxxxx
		if isCont(pos + 1) {
			r := rune(b&0x1F)<<6 | rune(s[pos+1]&0x3F)
			if r >= 0x80 { // below 0x80 would be an overlong encoding
				return r, 2
			}
		}
	case b&0xF0 == 0xE0: // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
		if isCont(pos+1) && isCont(pos+2) {
			r := rune(b&0x0F)<<12 | rune(s[pos+1]&0x3F)<<6 | rune(s[pos+2]&0x3F)
			// Reject overlong encodings and UTF-16 surrogate halves.
			if r >= 0x800 && (r < 0xD800 || r > 0xDFFF) {
				return r, 3
			}
		}
	case b&0xF8 == 0xF0: // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		if isCont(pos+1) && isCont(pos+2) && isCont(pos+3) {
			r := rune(b&0x07)<<18 | rune(s[pos+1]&0x3F)<<12 | rune(s[pos+2]&0x3F)<<6 | rune(s[pos+3]&0x3F)
			// Reject overlong encodings and values beyond the Unicode range.
			if r >= 0x10000 && r <= 0x10FFFF {
				return r, 4
			}
		}
	}

	// Not a well-formed sequence: hand back the single byte.
	return rune(b), 1
}
545+
546+
// skipWhitespaceChar advances past one whitespace character (which may be multi-byte).
547+
func (l *Lexer) skipWhitespaceChar() {
548+
if l.ch < 0x80 {
549+
l.readChar()
550+
return
551+
}
552+
// Multi-byte UTF-8: advance by rune size
553+
_, size := l.peekRune()
554+
for i := 0; i < size; i++ {
555+
l.readChar()
556+
}
557+
}
558+
486559
func (l *Lexer) skipWhitespaceAndComments() {
487560
for {
488-
// Skip whitespace
489-
for l.ch != 0 && (l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r') {
490-
l.readChar()
561+
// Skip whitespace (including Unicode whitespace)
562+
for l.ch != 0 && l.isWhitespace() {
563+
l.skipWhitespaceChar()
491564
}
492565

493566
// Skip line comments (-- ...)
@@ -641,7 +714,8 @@ func isHexDigit(ch byte) bool {
641714
}
642715

643716
// isLetter reports whether ch is an ASCII letter (a-z or A-Z).
//
// The check is deliberately limited to ASCII: ch is a single byte, so bytes
// >= 0x80 are pieces of a multi-byte UTF-8 sequence rather than characters in
// their own right. Passing such a byte through unicode.IsLetter would
// misclassify UTF-8 lead bytes (for example 0xC3) as letters.
func isLetter(ch byte) bool {
	return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
}
646720

647721
func isDigit(ch byte) bool {
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"todo": true}
1+
{}

0 commit comments

Comments
 (0)