@@ -1136,12 +1136,24 @@ impl<'a> Tokenizer<'a> {
11361136 }
11371137 // numbers and period
11381138 '0' ..='9' | '.' => {
1139- let mut s = peeking_take_while ( chars, |ch| ch. is_ascii_digit ( ) ) ;
1139+ // Some dialects support underscore as number separator
1140+ // There can only be one at a time and it must be followed by another digit
1141+ let is_number_separator = |ch : char , next_char : Option < char > | {
1142+ self . dialect . supports_numeric_literal_underscores ( )
1143+ && ch == '_'
1144+ && next_char. is_some_and ( |next_ch| next_ch. is_ascii_hexdigit ( ) )
1145+ } ;
1146+
1147+ let mut s = peeking_next_take_while ( chars, |ch, next_ch| {
1148+ ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1149+ } ) ;
11401150
11411151 // match binary literal that starts with 0x
11421152 if s == "0" && chars. peek ( ) == Some ( & 'x' ) {
11431153 chars. next ( ) ;
1144- let s2 = peeking_take_while ( chars, |ch| ch. is_ascii_hexdigit ( ) ) ;
1154+ let s2 = peeking_next_take_while ( chars, |ch, next_ch| {
1155+ ch. is_ascii_hexdigit ( ) || is_number_separator ( ch, next_ch)
1156+ } ) ;
11451157 return Ok ( Some ( Token :: HexStringLiteral ( s2) ) ) ;
11461158 }
11471159
@@ -1150,7 +1162,10 @@ impl<'a> Tokenizer<'a> {
11501162 s. push ( '.' ) ;
11511163 chars. next ( ) ;
11521164 }
1153- s += & peeking_take_while ( chars, |ch| ch. is_ascii_digit ( ) ) ;
1165+
1166+ s += & peeking_next_take_while ( chars, |ch, next_ch| {
1167+ ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1168+ } ) ;
11541169
11551170 // No number -> Token::Period
11561171 if s == "." {
@@ -1946,6 +1961,24 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
19461961 s
19471962}
19481963
1964+ /// Same as peeking_take_while, but also passes the next character to the predicate.
1965+ fn peeking_next_take_while (
1966+ chars : & mut State ,
1967+ mut predicate : impl FnMut ( char , Option < char > ) -> bool ,
1968+ ) -> String {
1969+ let mut s = String :: new ( ) ;
1970+ while let Some ( & ch) = chars. peek ( ) {
1971+ let next_char = chars. peekable . clone ( ) . nth ( 1 ) ;
1972+ if predicate ( ch, next_char) {
1973+ chars. next ( ) ; // consume
1974+ s. push ( ch) ;
1975+ } else {
1976+ break ;
1977+ }
1978+ }
1979+ s
1980+ }
1981+
/// Unescapes the body of a single-quoted string literal by delegating to the
/// [`Unescape`] helper; returns `None` when unescaping fails (presumably on a
/// malformed escape sequence — `Unescape` is defined elsewhere in this file).
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}
@@ -2227,6 +2260,41 @@ mod tests {
22272260 compare ( expected, tokens) ;
22282261 }
22292262
#[test]
fn tokenize_numeric_literal_underscore() {
    // A dialect WITHOUT underscore support splits `10_000` at the underscore:
    // the digits become a number and the `_000` tail becomes a word.
    let generic = GenericDialect {};
    let input = String::from("SELECT 10_000");
    let without_support = Tokenizer::new(&generic, &input).tokenize().unwrap();
    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ],
        without_support,
    );

    // Dialects WITH underscore support accept `_` only between two digits;
    // leading, trailing, or repeated underscores fall back to word tokens.
    let with_support =
        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores());
    with_support.tokenizes_to(
        "SELECT 10_000, _10_000, 10_00_, 10___0",
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10_000".to_string(), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // leading underscore tokenizes as a word (parsed as column identifier)
            Token::make_word("_10_000", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number("10_00".to_string(), false),
            // trailing underscores tokenizes as a word (syntax error in some dialects)
            Token::make_word("_", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            // multiple underscores tokenizes as a word (syntax error in some dialects)
            Token::make_word("___0", None),
        ],
    );
}
2297+
22302298 #[ test]
22312299 fn tokenize_select_exponent ( ) {
22322300 let sql = String :: from ( "SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10" ) ;
0 commit comments