From f200db90b37be9f49486e2013e364cde059c41bc Mon Sep 17 00:00:00 2001 From: Hamilton Ulmer Date: Mon, 11 Aug 2025 16:31:12 -0700 Subject: [PATCH] Add support for DuckDB underscore numeric literals - Add numberRegex option to TokenizerOptions to allow custom number patterns - Update Tokenizer to use custom number regex when provided - Configure DuckDB formatter to support underscore separators in numbers (1_000_000) - Add test for underscore numeric literals in DuckDB --- src/languages/duckdb/duckdb.formatter.ts | 3 +++ src/lexer/Tokenizer.ts | 1 + src/lexer/TokenizerOptions.ts | 2 ++ test/duckdb.test.ts | 11 +++++++++++ 4 files changed, 17 insertions(+) diff --git a/src/languages/duckdb/duckdb.formatter.ts b/src/languages/duckdb/duckdb.formatter.ts index 303e7c116..8a604bc6e 100644 --- a/src/languages/duckdb/duckdb.formatter.ts +++ b/src/languages/duckdb/duckdb.formatter.ts @@ -155,6 +155,9 @@ export const duckdb: DialectOptions = { reservedFunctionNames: functions, nestedBlockComments: true, extraParens: ['[]', '{}'], + // Support underscore separators between digits in numeric literals (e.g., 1_000_000); leading, trailing or doubled underscores (_1, 1_, 1__0) are not matched as numbers + numberRegex: + /(?:0x[0-9a-fA-F]+(?:_[0-9a-fA-F]+)*|0b[01]+(?:_[01]+)*|(?:-\s*)?(?:(?:[0-9]+(?:_[0-9]+)*)?\.[0-9]+(?:_[0-9]+)*|[0-9]+(?:_[0-9]+)*(?:\.(?:[0-9]+(?:_[0-9]+)*)?)?)(?:[eE][-+]?[0-9]+(?:_[0-9]+)*(?:\.[0-9]+(?:_[0-9]+)*)?)?)(?![\w\p{Alphabetic}])/uy, stringTypes: [ '$$', "''-qq", diff --git a/src/lexer/Tokenizer.ts b/src/lexer/Tokenizer.ts index bfbe111d0..75dc8543c 100644 --- a/src/lexer/Tokenizer.ts +++ b/src/lexer/Tokenizer.ts @@ -51,6 +51,7 @@ export default class Tokenizer { { type: TokenType.NUMBER, regex: + cfg.numberRegex ?? 
/(?:0x[0-9a-fA-F]+|0b[01]+|(?:-\s*)?(?:[0-9]*\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][-+]?[0-9]+(?:\.[0-9]+)?)?)(?![\w\p{Alphabetic}])/uy, }, // RESERVED_PHRASE is matched before all other keyword tokens diff --git a/src/lexer/TokenizerOptions.ts b/src/lexer/TokenizerOptions.ts index f7bd3b25f..c8add8fcb 100644 --- a/src/lexer/TokenizerOptions.ts +++ b/src/lexer/TokenizerOptions.ts @@ -100,6 +100,8 @@ export interface TokenizerOptions { propertyAccessOperators?: string[]; // Enables PostgreSQL-specific OPERATOR(...) syntax operatorKeyword?: boolean; + // Custom regex pattern for number tokens (defaults to standard SQL number pattern) + numberRegex?: RegExp; // Allows custom modifications on the token array. // Called after the whole input string has been split into tokens. // The result of this will be the output of the tokenizer. diff --git a/test/duckdb.test.ts b/test/duckdb.test.ts index b6ea26583..162bdb539 100644 --- a/test/duckdb.test.ts +++ b/test/duckdb.test.ts @@ -214,4 +214,15 @@ describe('DuckDBFormatter', () => { 1 IS NOT NULL; `); }); + + it('supports underscore separators in numeric literals', () => { + expect(format('SELECT 1_000_000, 3.14_159, 0x1A_2B_3C, 0b1010_0001, 1.5e+1_0;')).toBe(dedent` + SELECT + 1_000_000, + 3.14_159, + 0x1A_2B_3C, + 0b1010_0001, + 1.5e+1_0; + `); + }); });