@@ -1186,8 +1186,8 @@ impl<'a> Tokenizer<'a> {
11861186
11871187 Ok ( Some ( Token :: make_word ( & word. concat ( ) , Some ( quote_start) ) ) )
11881188 }
1189- // numbers and period
1190- '0' ..='9' | '.' => {
1189+ // Numbers
1190+ '0' ..='9' => {
11911191 // Some dialects support underscore as number separator
11921192 // There can only be one at a time and it must be followed by another digit
11931193 let is_number_separator = |ch : char , next_char : Option < char > | {
@@ -1196,11 +1196,12 @@ impl<'a> Tokenizer<'a> {
11961196 && next_char. is_some_and ( |next_ch| next_ch. is_ascii_hexdigit ( ) )
11971197 } ;
11981198
1199+ // Start with number or potential separator
11991200 let mut s = peeking_next_take_while ( chars, |ch, next_ch| {
12001201 ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
12011202 } ) ;
12021203
1203- // match binary literal that starts with 0x
1204+ // Match hex string literal that starts with 0x (tokenized as HexStringLiteral)
12041205 if s == "0" && chars. peek ( ) == Some ( & 'x' ) {
12051206 chars. next ( ) ;
12061207 let s2 = peeking_next_take_while ( chars, |ch, next_ch| {
@@ -1209,60 +1210,41 @@ impl<'a> Tokenizer<'a> {
12091210 return Ok ( Some ( Token :: HexStringLiteral ( s2) ) ) ;
12101211 }
12111212
1212- // match one period
1213+ // Match fractional part after a dot
12131214 if let Some ( '.' ) = chars. peek ( ) {
12141215 s. push ( '.' ) ;
12151216 chars. next ( ) ;
12161217 }
12171218
1218- // If the dialect supports identifiers that start with a numeric prefix
1219- // and we have now consumed a dot, check if the previous token was a Word.
1220- // If so, what follows is definitely not part of a decimal number and
1221- // we should yield the dot as a dedicated token so compound identifiers
1222- // starting with digits can be parsed correctly.
1223- if s == "." && self . dialect . supports_numeric_prefix ( ) {
1224- if let Some ( Token :: Word ( _) ) = prev_token {
1225- return Ok ( Some ( Token :: Period ) ) ;
1226- }
1227- }
1228-
1229- // Consume fractional digits.
1219+ // Consume fractional digits
12301220 s += & peeking_next_take_while ( chars, |ch, next_ch| {
12311221 ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
12321222 } ) ;
12331223
1234- // No fraction -> Token::Period
1235- if s == "." {
1236- return Ok ( Some ( Token :: Period ) ) ;
1237- }
1238-
1239- // Parse exponent as number
1224+ // Parse exponent part (e.g., e+10 or E-5)
12401225 let mut exponent_part = String :: new ( ) ;
12411226 if chars. peek ( ) == Some ( & 'e' ) || chars. peek ( ) == Some ( & 'E' ) {
12421227 let mut char_clone = chars. peekable . clone ( ) ;
1243- exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ;
1228+ exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ; // consume 'e' or 'E'
12441229
12451230 // Optional sign
1246- match char_clone. peek ( ) {
1247- Some ( & c ) if matches ! ( c , '+' | '-' ) => {
1231+ if let Some ( & c ) = char_clone. peek ( ) {
1232+ if c == '+' || c == '-' {
12481233 exponent_part. push ( c) ;
12491234 char_clone. next ( ) ;
12501235 }
1251- _ => ( ) ,
12521236 }
12531237
1254- match char_clone . peek ( ) {
1255- // Definitely an exponent, get original iterator up to speed and use it
1256- Some ( & c ) if c. is_ascii_digit ( ) => {
1238+ // Parse digits after the exponent
1239+ if let Some ( & c ) = char_clone . peek ( ) {
1240+ if c. is_ascii_digit ( ) {
12571241 for _ in 0 ..exponent_part. len ( ) {
12581242 chars. next ( ) ;
12591243 }
12601244 exponent_part +=
12611245 & peeking_take_while ( chars, |ch| ch. is_ascii_digit ( ) ) ;
12621246 s += exponent_part. as_str ( ) ;
12631247 }
1264- // Not an exponent, discard the work done
1265- _ => ( ) ,
12661248 }
12671249 }
12681250
@@ -1271,8 +1253,7 @@ impl<'a> Tokenizer<'a> {
12711253 // be tokenized as a word.
12721254 if self . dialect . supports_numeric_prefix ( ) {
12731255 if exponent_part. is_empty ( ) {
1274- // If it is not a number with an exponent, it may be
1275- // an identifier starting with digits.
1256+ // Handle as potential word if no exponent part
12761257 let word =
12771258 peeking_take_while ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
12781259
@@ -1281,20 +1262,84 @@ impl<'a> Tokenizer<'a> {
12811262 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
12821263 }
12831264 } else if prev_token == Some ( & Token :: Period ) {
1284- // If the previous token was a period, thus not belonging to a number,
1285- // the value we have is part of an identifier.
1265+ // Handle as word if it follows a period
12861266 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
12871267 }
12881268 }
12891269
1270+ // Handle "L" suffix for long numbers
12901271 let long = if chars. peek ( ) == Some ( & 'L' ) {
12911272 chars. next ( ) ;
12921273 true
12931274 } else {
12941275 false
12951276 } ;
1277+
1278+ // Return the final token for the number
12961279 Ok ( Some ( Token :: Number ( s, long) ) )
12971280 }
1281+
1282+ // Period (`.`) handling
1283+ '.' => {
1284+ chars. next ( ) ; // consume the dot
1285+
1286+ match chars. peek ( ) {
1287+ Some ( '_' ) => {
1288+ // "._" is a Period token followed by an identifier start — an underscore here is not a numeric separator
1289+ Ok ( Some ( Token :: Period ) )
1290+ }
1291+ Some ( ch)
1292+ // Hive and MySQL dialects allow numeric prefixes for identifiers
1293+ if ch. is_ascii_digit ( )
1294+ && self . dialect . supports_numeric_prefix ( )
1295+ && matches ! ( prev_token, Some ( Token :: Word ( _) ) ) =>
1296+ {
1297+ Ok ( Some ( Token :: Period ) )
1298+ }
1299+ Some ( ch) if ch. is_ascii_digit ( ) => {
1300+ // Handle numbers starting with a dot (e.g., ".123")
1301+ let mut s = String :: from ( "." ) ;
1302+ let is_number_separator = |ch : char , next_char : Option < char > | {
1303+ self . dialect . supports_numeric_literal_underscores ( )
1304+ && ch == '_'
1305+ && next_char. is_some_and ( |c| c. is_ascii_digit ( ) )
1306+ } ;
1307+
1308+ s += & peeking_next_take_while ( chars, |ch, next_ch| {
1309+ ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1310+ } ) ;
1311+
1312+ // Handle exponent part
1313+ if matches ! ( chars. peek( ) , Some ( 'e' | 'E' ) ) {
1314+ let mut exp = String :: new ( ) ;
1315+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1316+
1317+ if matches ! ( chars. peek( ) , Some ( '+' | '-' ) ) {
1318+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1319+ }
1320+
1321+ if matches ! ( chars. peek( ) , Some ( c) if c. is_ascii_digit( ) ) {
1322+ exp += & peeking_take_while ( chars, |c| c. is_ascii_digit ( ) ) ;
1323+ s += & exp;
1324+ }
1325+ }
1326+
1327+ // Handle "L" suffix for long numbers
1328+ let long = if chars. peek ( ) == Some ( & 'L' ) {
1329+ chars. next ( ) ;
1330+ true
1331+ } else {
1332+ false
1333+ } ;
1334+
1335+ Ok ( Some ( Token :: Number ( s, long) ) )
1336+ }
1337+ _ => {
1338+ // Just a plain period
1339+ Ok ( Some ( Token :: Period ) )
1340+ }
1341+ }
1342+ }
12981343 // punctuation
12991344 '(' => self . consume_and_return ( chars, Token :: LParen ) ,
13001345 ')' => self . consume_and_return ( chars, Token :: RParen ) ,
@@ -2429,6 +2474,32 @@ mod tests {
24292474 compare ( expected, tokens) ;
24302475 }
24312476
2477+ #[ test]
2478+ fn tokenize_period_underscore ( ) {
2479+ let sql = String :: from ( "SELECT table._col" ) ;
2480+ // Use a dialect that supports underscores in numeric literals, to exercise the "._" tokenization path
2481+ let dialect = PostgreSqlDialect { } ;
2482+ let tokens = Tokenizer :: new ( & dialect, & sql) . tokenize ( ) . unwrap ( ) ;
2483+
2484+ let expected = vec ! [
2485+ Token :: make_keyword( "SELECT" ) ,
2486+ Token :: Whitespace ( Whitespace :: Space ) ,
2487+ Token :: Word ( Word {
2488+ value: "table" . to_string( ) ,
2489+ quote_style: None ,
2490+ keyword: Keyword :: TABLE ,
2491+ } ) ,
2492+ Token :: Period ,
2493+ Token :: Word ( Word {
2494+ value: "_col" . to_string( ) ,
2495+ quote_style: None ,
2496+ keyword: Keyword :: NoKeyword ,
2497+ } ) ,
2498+ ] ;
2499+
2500+ compare ( expected, tokens) ;
2501+ }
2502+
24322503 #[ test]
24332504 fn tokenize_select_float ( ) {
24342505 let sql = String :: from ( "SELECT .1" ) ;
0 commit comments