1+ import { parse } from 'libpg-query' ;
2+ import { ParseResult , RawStmt } from '@pgsql/types' ;
3+
/**
 * One statement recovered from a larger SQL text, together with the
 * position data the parser reported for it.
 */
export interface ExtractedStatement {
  /** Source text of the statement as extracted from the original SQL. */
  statement: string;
  /** Zero-based position of the statement in the parser's statement list. */
  index: number;
  /** `stmt_location` reported by the parser for this statement, if any. */
  location?: number;
  /** `stmt_len` reported by the parser for this statement, if any. */
  length?: number;
}
10+
/** Tuning knobs accepted by the statement extraction helpers. */
export interface StatementSplitterOptions {
  /**
   * When `true`, the final sanity check that rejects obviously truncated
   * extractions is skipped and the extracted text is returned as-is.
   */
  skipValidation?: boolean;
  /**
   * Controls removal of leading blank lines and `--` comment lines from the
   * extracted text. Stripping is performed unless this is explicitly `false`.
   */
  stripComments?: boolean;
}
17+
/**
 * Matches text whose first word is a common SQL keyword that lost its first
 * letter (SELECT, CREATE, INSERT). Case-insensitive, and the word may be
 * followed by any whitespace or the end of the text.
 */
const TRUNCATED_KEYWORD_PREFIX = /^(?:ELECT|REATE|NSERT)(?:\s|$)/i;

/**
 * Extracts the source text of a single parsed statement from the original SQL.
 *
 * `stmt_location` and `stmt_len` are treated as UTF-8 byte offsets, so the SQL
 * is sliced as a byte buffer rather than by JavaScript string index; this keeps
 * the extraction correct when the text contains multi-byte characters.
 *
 * Range selection:
 * - location and length present: bytes `[location, location + length)`
 * - location only: from `location` to the end of the text
 * - first statement, length only: bytes `[0, length)`
 * - first statement, neither present: the entire text
 * - anything else: no range can be determined and `null` is returned
 *
 * @param originalSQL - The complete SQL text that was handed to the parser.
 * @param rawStmt - Parser node whose `stmt_location` / `stmt_len` locate the statement.
 * @param isFirst - Whether this is the first statement; only then is a missing
 *   `stmt_location` interpreted as offset 0.
 * @param options - See {@link StatementSplitterOptions}.
 * @returns The extracted statement text, or `null` when no range could be
 *   determined or validation rejected the extraction.
 */
export function extractStatement(
  originalSQL: string,
  rawStmt: RawStmt,
  isFirst: boolean = false,
  options: StatementSplitterOptions = {}
): string | null {
  const { stmt_location: location, stmt_len: length } = rawStmt;
  let extracted: string | null = null;

  if (location !== undefined) {
    // An undefined end makes subarray run to the end of the buffer.
    const endByte = length !== undefined ? location + length : undefined;
    extracted = Buffer.from(originalSQL, 'utf8').subarray(location, endByte).toString('utf8');
  } else if (isFirst) {
    // Only the first statement may omit its location; it then starts at byte 0.
    extracted =
      length !== undefined
        ? Buffer.from(originalSQL, 'utf8').subarray(0, length).toString('utf8')
        : originalSQL;
  }

  if (extracted && options.stripComments !== false) {
    const lines = extracted.split('\n');
    // First line that is neither blank nor a `--` comment.
    const firstSqlLine = lines.findIndex((line) => {
      const content = line.trim();
      return content !== '' && !content.startsWith('--');
    });
    // When no such line exists, nothing is dropped and the text is only trimmed.
    extracted = lines.slice(Math.max(firstSqlLine, 0)).join('\n').trim();
  }

  // Final validation unless skipped: reject whitespace-only text and text that
  // begins with an obviously truncated keyword.
  if (extracted && !options.skipValidation) {
    const trimmed = extracted.trim();
    if (trimmed.length === 0 || TRUNCATED_KEYWORD_PREFIX.test(trimmed)) {
      return null;
    }
  }

  return extracted;
}
97+
/**
 * Parses `sql` with the PostgreSQL parser and returns the source text of each
 * statement alongside the location data the parser reported for it.
 *
 * Statements for which `extractStatement` yields `null` or an empty string are
 * omitted, so the `index` values in the result may have gaps.
 *
 * @param sql - SQL text containing zero or more statements.
 * @param options - Forwarded unchanged to `extractStatement`.
 * @returns The extracted statements in parser order; empty when the parse
 *   result carries no statement list.
 */
export async function splitStatements(
  sql: string,
  options: StatementSplitterOptions = {}
): Promise<ExtractedStatement[]> {
  const parsed: ParseResult = await parse(sql);
  const collected: ExtractedStatement[] = [];

  const rawStatements = parsed.stmts;
  if (!rawStatements) {
    return collected;
  }

  rawStatements.forEach((rawStmt, position) => {
    // Only the statement at position 0 is flagged as the first one.
    const text = extractStatement(sql, rawStmt, position === 0, options);
    if (!text) {
      return;
    }
    collected.push({
      statement: text,
      index: position,
      location: rawStmt.stmt_location,
      length: rawStmt.stmt_len
    });
  });

  return collected;
}
129+
/**
 * Builds the fixture key for one statement of a SQL file.
 *
 * A trailing `.sql` is removed from `relativePath`, then the one-based
 * statement number and the requested extension are appended, e.g.
 * `generateStatementKey('dir/file.sql', 0)` yields `dir/file-1.sql`.
 *
 * @param relativePath - Path of the source file, with or without a `.sql` suffix.
 * @param statementIndex - Zero-based index of the statement within the file.
 * @param extension - Extension of the generated key, without the leading dot.
 * @returns The key in the form `<path without .sql>-<index + 1>.<extension>`.
 */
export function generateStatementKey(
  relativePath: string,
  statementIndex: number,
  extension: string = 'sql'
): string {
  const basePath = relativePath.replace(/\.sql$/, '');
  return `${basePath}-${statementIndex + 1}.${extension}`;
}
140+
/**
 * Debugging aid that extracts the same statement range twice, once by string
 * index and once by UTF-8 byte offset, so the two results can be compared.
 *
 * Both extractions are empty strings unless `rawStmt` carries both
 * `stmt_location` and `stmt_len`.
 *
 * @param sql - The complete SQL text that was parsed.
 * @param rawStmt - Parser node supplying `stmt_location` and `stmt_len`.
 * @returns
 * - `characterBased`: `sql.substring(location, location + length)`
 * - `byteBased`: the same range taken from the UTF-8 encoding of `sql`
 * - `matches`: whether the two extractions are identical
 * - `unicodeChars`: `byteLength - charLength`, i.e. how many more UTF-8 bytes
 *   than string length units the text has (0 for pure ASCII)
 * - `byteLength`: length of `sql` encoded as UTF-8
 * - `charLength`: `sql.length`
 */
export function debugUnicodeExtraction(sql: string, rawStmt: RawStmt): {
  characterBased: string;
  byteBased: string;
  matches: boolean;
  unicodeChars: number;
  byteLength: number;
  charLength: number;
} {
  // Encode once; the buffer serves both the length report and the byte slice.
  const sqlBytes = Buffer.from(sql, 'utf8');
  const charLength = sql.length;
  const byteLength = sqlBytes.length;

  let characterBased = '';
  let byteBased = '';

  const { stmt_location: start, stmt_len: span } = rawStmt;
  if (start !== undefined && span !== undefined) {
    const end = start + span;
    // Character-based extraction (old way)
    characterBased = sql.substring(start, end);
    // Byte-based extraction (new way)
    byteBased = sqlBytes.subarray(start, end).toString('utf8');
  }

  return {
    characterBased,
    byteBased,
    matches: characterBased === byteBased,
    unicodeChars: byteLength - charLength,
    byteLength,
    charLength
  };
}
0 commit comments