Skip to content

Commit 0b6ba21

Browse files
authored
Merge pull request #36 from launchql/feat/full-parse
Feat/full parse
2 parents f305d4e + f9cd746 commit 0b6ba21

File tree

8 files changed

+17635
-15744
lines changed

8 files changed

+17635
-15744
lines changed

__fixtures__/generated/generated.json

Lines changed: 15724 additions & 15724 deletions
Large diffs are not rendered by default.

__fixtures__/generated/upstream-diff.json

Lines changed: 1594 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
COPY (SELECT 1) TO 'test.csv' WITH (FORMAT 'CSV');
1+
COPY (SELECT 1) TO '/test.csv' WITH (FORMAT CSV);

packages/deparser/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
"fixtures:ast": "ts-node scripts/make-fixtures-ast.ts",
3131
"fixtures:sql": "ts-node scripts/make-fixtures-sql.ts",
3232
"fixtures": "ts-node scripts/make-fixtures.ts",
33+
"fixtures:upstream-diff": "ts-node scripts/make-upstream-diff.ts",
3334
"lint": "eslint . --fix",
3435
"test": "jest",
3536
"test:watch": "jest --watch"

packages/deparser/scripts/make-fixtures.ts

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
import * as path from 'path';
33
import * as fs from 'fs';
44
import { sync as globSync } from 'glob';
5-
import { parse, deparse } from 'libpg-query';
6-
import { ParseResult, RawStmt } from '@pgsql/types';
5+
import { parse } from 'libpg-query';
6+
import { splitStatements, generateStatementKey } from '../src/utils/statement-splitter';
77

88
const FIXTURE_DIR = path.join(__dirname, '../../../__fixtures__/kitchen-sink');
99
const OUT_DIR = path.join(__dirname, '../../../__fixtures__/generated');
@@ -19,32 +19,33 @@ ensureDir(OUT_DIR);
1919
const fixtures = globSync(path.join(FIXTURE_DIR, '**/*.sql'));
2020

2121
async function main() {
22-
// Collect deparsed SQL in a single JSON
22+
// Collect original SQL in a single JSON
2323
const results: Record<string, string> = {};
2424

2525
for (const fixturePath of fixtures) {
2626
const relPath = path.relative(FIXTURE_DIR, fixturePath);
2727
const sql = fs.readFileSync(fixturePath, 'utf-8');
28-
let parseResult: ParseResult;
28+
2929
try {
30-
parseResult = await parse(sql);
30+
const statements = await splitStatements(sql);
31+
32+
for (const stmt of statements) {
33+
const key = generateStatementKey(relPath, stmt.index);
34+
35+
// Validate that the extracted statement parses correctly on its own
36+
try {
37+
await parse(stmt.statement);
38+
results[key] = stmt.statement;
39+
} catch (parseErr: any) {
40+
console.error(`Failed to parse extracted statement ${key}:`, parseErr.message);
41+
console.error(`Statement: ${stmt.statement.substring(0, 200)}${stmt.statement.length > 200 ? '...' : ''}`);
42+
// Skip this statement - don't add it to results
43+
}
44+
}
3145
} catch (err: any) {
3246
console.error(`Failed to parse ${relPath}:`, err);
3347
continue;
3448
}
35-
36-
for (let idx = 0; idx < parseResult.stmts.length; idx++) {
37-
const stmt = parseResult.stmts[idx];
38-
let deparsedSql: string;
39-
try {
40-
deparsedSql = await deparse({ version: 170000, stmts: [stmt] });
41-
} catch (err: any) {
42-
console.error(`Failed to deparse statement ${idx + 1} in ${relPath}:`, err);
43-
continue;
44-
}
45-
const key = `${relPath.replace(/\.sql$/, '')}-${idx + 1}.sql`;
46-
results[key] = deparsedSql;
47-
}
4849
}
4950

5051
// Write aggregated JSON to output file
packages/deparser/scripts/make-upstream-diff.ts

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env ts-node
2+
import * as path from 'path';
3+
import * as fs from 'fs';
4+
import { sync as globSync } from 'glob';
5+
import { parse, deparse } from 'libpg-query';
6+
import { ParseResult, RawStmt } from '@pgsql/types';
7+
import { deparse as ourDeparse } from '../src';
8+
import { cleanTree } from '../src/utils';
9+
import { splitStatements, generateStatementKey } from '../src/utils/statement-splitter';
10+
11+
const FIXTURE_DIR = path.join(__dirname, '../../../__fixtures__/kitchen-sink');
12+
const OUT_DIR = path.join(__dirname, '../../../__fixtures__/generated');
13+
14+
function ensureDir(dir: string) {
15+
if (!fs.existsSync(dir)) {
16+
fs.mkdirSync(dir, { recursive: true });
17+
}
18+
}
19+
20+
ensureDir(OUT_DIR);
21+
22+
const fixtures = globSync(path.join(FIXTURE_DIR, '**/*.sql'));
23+
24+
async function main() {
25+
// Collect only files with differences between deparsers
26+
const results: Record<string, { upstream?: string; deparsed?: string; original: string }> = {};
27+
28+
for (const fixturePath of fixtures) {
29+
const relPath = path.relative(FIXTURE_DIR, fixturePath);
30+
const sql = fs.readFileSync(fixturePath, 'utf-8');
31+
32+
try {
33+
const statements = await splitStatements(sql);
34+
35+
for (const stmt of statements) {
36+
// We need the original statement to get the RawStmt for deparsing
37+
const parseResult = await parse(sql);
38+
const rawStmt = parseResult.stmts[stmt.index];
39+
40+
// Get source of truth: cleanTree(parse(original))
41+
let sourceOfTruthAst: any;
42+
try {
43+
const originalParsed = await parse(stmt.statement);
44+
sourceOfTruthAst = cleanTree(originalParsed.stmts?.[0]?.stmt);
45+
} catch (err: any) {
46+
console.error(`Failed to parse original SQL for statement ${stmt.index + 1} in ${relPath}:`, err);
47+
continue;
48+
}
49+
50+
// Get upstream deparse and its AST
51+
let upstreamSql: string | undefined;
52+
let upstreamAst: any;
53+
try {
54+
upstreamSql = await deparse({ version: 170000, stmts: [rawStmt] });
55+
const upstreamParsed = await parse(upstreamSql);
56+
upstreamAst = cleanTree(upstreamParsed.stmts?.[0]?.stmt);
57+
} catch (err: any) {
58+
console.error(`Failed to process upstream deparse for statement ${stmt.index + 1} in ${relPath}:`, err);
59+
continue;
60+
}
61+
62+
// Get our deparse and its AST
63+
let ourDeparsedSql: string | undefined;
64+
let ourAst: any;
65+
let ourDeParseError = false;
66+
try {
67+
ourDeparsedSql = ourDeparse(rawStmt.stmt);
68+
const ourParsed = await parse(ourDeparsedSql);
69+
ourAst = cleanTree(ourParsed.stmts?.[0]?.stmt);
70+
} catch (err: any) {
71+
console.error(`Failed to process our deparse for statement ${stmt.index + 1} in ${relPath}:`, err);
72+
ourDeParseError = true;
73+
// Keep ourDeparsedSql so we can still show it in results even if it doesn't parse
74+
}
75+
76+
// Compare ASTs to source of truth only
77+
const upstreamMatches = JSON.stringify(upstreamAst) === JSON.stringify(sourceOfTruthAst);
78+
const ourMatches = ourAst ? JSON.stringify(ourAst) === JSON.stringify(sourceOfTruthAst) : false;
79+
80+
81+
// Only include if either deparser differs from original OR our deparser failed to parse
82+
if (!upstreamMatches || !ourMatches || ourDeParseError) {
83+
const key = generateStatementKey(relPath, stmt.index);
84+
results[key] = {
85+
original: stmt.statement,
86+
// Show upstream only if it differs from original
87+
...(!upstreamMatches && upstreamSql && { upstream: upstreamSql }),
88+
// Show our deparser if it differs from original OR if it failed to parse (both indicate issues)
89+
...((!ourMatches || ourDeParseError) && ourDeparsedSql && { deparsed: ourDeparsedSql })
90+
};
91+
}
92+
}
93+
} catch (err: any) {
94+
console.error(`Failed to parse ${relPath}:`, err);
95+
continue;
96+
}
97+
}
98+
99+
// Write aggregated JSON to output file
100+
const outputFile = path.join(OUT_DIR, 'upstream-diff.json');
101+
fs.writeFileSync(outputFile, JSON.stringify(results, null, 2));
102+
console.log(`Wrote JSON to ${outputFile}`);
103+
}
104+
105+
main().catch(console.error);

packages/deparser/src/deparser.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5461,7 +5461,20 @@ export class Deparser implements DeparserVisitor {
54615461
: argValue;
54625462
return `${node.defname} = ${quotedValue}`;
54635463
}
5464-
5464+
5465+
// Handle CopyStmt WITH clause options - uppercase format without quotes
5466+
if (context.parentNodeTypes.includes('CopyStmt')) {
5467+
if (node.defname === 'format' && node.arg && this.getNodeType(node.arg) === 'String') {
5468+
const stringData = this.getNodeData(node.arg);
5469+
return `FORMAT ${stringData.sval.toUpperCase()}`;
5470+
}
5471+
// Handle other COPY options with uppercase defname
5472+
if (node.arg) {
5473+
return `${node.defname.toUpperCase()} ${argValue}`;
5474+
}
5475+
return node.defname.toUpperCase();
5476+
}
5477+
54655478
// Handle CREATE OPERATOR and CREATE TYPE context
54665479
if (context.parentNodeTypes.includes('DefineStmt')) {
54675480
const preservedName = this.preserveOperatorDefElemCase(node.defname);
packages/deparser/src/utils/statement-splitter.ts

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
import { parse } from 'libpg-query';
2+
import { ParseResult, RawStmt } from '@pgsql/types';
3+
4+
/** One SQL statement pulled out of a multi-statement source file. */
export interface ExtractedStatement {
  // The extracted SQL text of the statement
  statement: string;
  // Zero-based position of the statement within the parsed file
  index: number;
  // Byte offset of the statement in the original SQL (RawStmt.stmt_location), when the parser provided one
  location?: number;
  // Byte length of the statement (RawStmt.stmt_len), when the parser provided one
  length?: number;
}

/** Options controlling how statements are extracted. */
export interface StatementSplitterOptions {
  /** Skip validation for malformed statements */
  skipValidation?: boolean;
  /** Strip leading comments from extracted statements */
  stripComments?: boolean;
}
18+
/**
19+
* Extracts a single statement from SQL using PostgreSQL's location information.
20+
* Handles Unicode properly by using byte positions instead of character positions.
21+
*/
22+
export function extractStatement(
23+
originalSQL: string,
24+
rawStmt: RawStmt,
25+
isFirst: boolean = false,
26+
options: StatementSplitterOptions = {}
27+
): string | null {
28+
let extracted: string | null = null;
29+
30+
// Convert string to buffer to handle byte positions correctly (for Unicode)
31+
const sqlBuffer = Buffer.from(originalSQL, 'utf8');
32+
33+
if (rawStmt.stmt_location !== undefined && rawStmt.stmt_len !== undefined) {
34+
// Use byte positions as provided by PostgreSQL
35+
const startByte = rawStmt.stmt_location;
36+
const endByte = rawStmt.stmt_location + rawStmt.stmt_len;
37+
38+
// Extract using byte positions and convert back to string
39+
const extractedBuffer = sqlBuffer.slice(startByte, endByte);
40+
extracted = extractedBuffer.toString('utf8');
41+
} else if (rawStmt.stmt_location !== undefined && rawStmt.stmt_len === undefined) {
42+
// We have location but no length - extract from location to end of file
43+
const extractedBuffer = sqlBuffer.slice(rawStmt.stmt_location);
44+
extracted = extractedBuffer.toString('utf8');
45+
} else if (isFirst && rawStmt.stmt_len !== undefined) {
46+
// For first statement when location is missing but we have length
47+
const extractedBuffer = sqlBuffer.slice(0, rawStmt.stmt_len);
48+
extracted = extractedBuffer.toString('utf8');
49+
} else if (isFirst && rawStmt.stmt_location === undefined && rawStmt.stmt_len === undefined) {
50+
// For first statement when both location and length are missing, use entire SQL
51+
extracted = originalSQL;
52+
}
53+
54+
if (extracted && options.stripComments !== false) {
55+
// Split into lines to handle leading whitespace and comments properly
56+
const lines = extracted.split('\n');
57+
let startLineIndex = 0;
58+
59+
// Find the first line that contains actual SQL content
60+
for (let i = 0; i < lines.length; i++) {
61+
const line = lines[i].trim();
62+
// Skip empty lines and comment-only lines
63+
if (line === '' || line.startsWith('--')) {
64+
continue;
65+
}
66+
startLineIndex = i;
67+
break;
68+
}
69+
70+
// Reconstruct from the first SQL line, preserving the original indentation of that line
71+
if (startLineIndex < lines.length) {
72+
const resultLines = lines.slice(startLineIndex);
73+
extracted = resultLines.join('\n').trim();
74+
}
75+
}
76+
77+
// Final validation unless skipped
78+
if (extracted && !options.skipValidation) {
79+
const firstLine = extracted.split('\n')[0].trim();
80+
const firstWord = firstLine.split(/\s+/)[0].toUpperCase();
81+
82+
// Only check for most obvious malformed patterns at the BEGINNING
83+
if (
84+
// Check if it starts with truncated patterns (not just contains anywhere)
85+
extracted.trim().startsWith('ELECT ') || // Missing S from SELECT
86+
extracted.trim().startsWith('REATE ') || // Missing C from CREATE
87+
extracted.trim().startsWith('NSERT ') || // Missing I from INSERT
88+
// Completely empty or whitespace only
89+
extracted.trim().length === 0
90+
) {
91+
return null; // Invalid extraction, skip this statement
92+
}
93+
}
94+
95+
return extracted;
96+
}
97+
98+
/**
99+
* Splits SQL text into individual statements using PostgreSQL's parser.
100+
* Handles Unicode characters properly and provides detailed location information.
101+
*/
102+
export async function splitStatements(
103+
sql: string,
104+
options: StatementSplitterOptions = {}
105+
): Promise<ExtractedStatement[]> {
106+
const parseResult: ParseResult = await parse(sql);
107+
const statements: ExtractedStatement[] = [];
108+
109+
if (!parseResult.stmts) {
110+
return statements;
111+
}
112+
113+
for (let idx = 0; idx < parseResult.stmts.length; idx++) {
114+
const stmt = parseResult.stmts[idx];
115+
const extracted = extractStatement(sql, stmt, idx === 0, options);
116+
117+
if (extracted) {
118+
statements.push({
119+
statement: extracted,
120+
index: idx,
121+
location: stmt.stmt_location,
122+
length: stmt.stmt_len
123+
});
124+
}
125+
}
126+
127+
return statements;
128+
}
129+
130+
/**
131+
* Utility to generate statement keys for fixtures
132+
*/
133+
export function generateStatementKey(
134+
relativePath: string,
135+
statementIndex: number,
136+
extension: string = 'sql'
137+
): string {
138+
return `${relativePath.replace(/\.sql$/, '')}-${statementIndex + 1}.${extension}`;
139+
}
140+
141+
/**
142+
* Test utility to compare byte vs character extraction for debugging Unicode issues
143+
*/
144+
export function debugUnicodeExtraction(sql: string, rawStmt: RawStmt): {
145+
characterBased: string;
146+
byteBased: string;
147+
matches: boolean;
148+
unicodeChars: number;
149+
byteLength: number;
150+
charLength: number;
151+
} {
152+
const charLength = sql.length;
153+
const byteLength = Buffer.from(sql, 'utf8').length;
154+
155+
// Character-based extraction (old way)
156+
let characterBased = '';
157+
if (rawStmt.stmt_location !== undefined && rawStmt.stmt_len !== undefined) {
158+
characterBased = sql.substring(rawStmt.stmt_location, rawStmt.stmt_location + rawStmt.stmt_len);
159+
}
160+
161+
// Byte-based extraction (new way)
162+
let byteBased = '';
163+
if (rawStmt.stmt_location !== undefined && rawStmt.stmt_len !== undefined) {
164+
const sqlBuffer = Buffer.from(sql, 'utf8');
165+
const extractedBuffer = sqlBuffer.slice(rawStmt.stmt_location, rawStmt.stmt_location + rawStmt.stmt_len);
166+
byteBased = extractedBuffer.toString('utf8');
167+
}
168+
169+
return {
170+
characterBased,
171+
byteBased,
172+
matches: characterBased === byteBased,
173+
unicodeChars: byteLength - charLength,
174+
byteLength,
175+
charLength
176+
};
177+
}

0 commit comments

Comments
 (0)