Merge pull request #36 from launchql/feat/full-parse

pyramation · web-flow · commit 0b6ba2146701 · 2025-06-21T20:58:45.000-07:00
Feat/full parse
diff --git a/__fixtures__/generated/generated.json b/__fixtures__/generated/generated.json
diff --git a/__fixtures__/generated/upstream-diff.json b/__fixtures__/generated/upstream-diff.json
diff --git a/__fixtures__/kitchen-sink/original/copy.sql b/__fixtures__/kitchen-sink/original/copy.sql
@@ -1 +1 @@
-COPY (SELECT 1) TO 'test.csv' WITH (FORMAT 'CSV');
+COPY (SELECT 1) TO '/test.csv' WITH (FORMAT CSV);
diff --git a/packages/deparser/package.json b/packages/deparser/package.json
@@ -30,6 +30,7 @@
     "fixtures:ast": "ts-node scripts/make-fixtures-ast.ts",
     "fixtures:sql": "ts-node scripts/make-fixtures-sql.ts",
     "fixtures": "ts-node scripts/make-fixtures.ts",
+    "fixtures:upstream-diff": "ts-node scripts/make-upstream-diff.ts",
     "lint": "eslint . --fix",
     "test": "jest",
     "test:watch": "jest --watch"
diff --git a/packages/deparser/scripts/make-fixtures.ts b/packages/deparser/scripts/make-fixtures.ts
@@ -2,8 +2,8 @@
 import * as path from 'path';
 import * as fs from 'fs';
 import { sync as globSync } from 'glob';
-import { parse, deparse } from 'libpg-query';
-import { ParseResult, RawStmt } from '@pgsql/types';
+import { parse } from 'libpg-query';
+import { splitStatements, generateStatementKey } from '../src/utils/statement-splitter';
 
 const FIXTURE_DIR = path.join(__dirname, '../../../__fixtures__/kitchen-sink');
 const OUT_DIR = path.join(__dirname, '../../../__fixtures__/generated');
@@ -19,32 +19,33 @@ ensureDir(OUT_DIR);
 const fixtures = globSync(path.join(FIXTURE_DIR, '**/*.sql'));
 
 async function main() {
-  // Collect deparsed SQL in a single JSON
+  // Collect original SQL in a single JSON
   const results: Record<string, string> = {};
   
   for (const fixturePath of fixtures) {
     const relPath = path.relative(FIXTURE_DIR, fixturePath);
     const sql = fs.readFileSync(fixturePath, 'utf-8');
-    let parseResult: ParseResult;
+    
     try {
-      parseResult = await parse(sql);
+      const statements = await splitStatements(sql);
+      
+      for (const stmt of statements) {
+        const key = generateStatementKey(relPath, stmt.index);
+        
+        // Validate that the extracted statement parses correctly on its own
+        try {
+          await parse(stmt.statement);
+          results[key] = stmt.statement;
+        } catch (parseErr: any) {
+          console.error(`Failed to parse extracted statement ${key}:`, parseErr.message);
+          console.error(`Statement: ${stmt.statement.substring(0, 200)}${stmt.statement.length > 200 ? '...' : ''}`);
+          // Skip this statement - don't add it to results
+        }
+      }
     } catch (err: any) {
       console.error(`Failed to parse ${relPath}:`, err);
       continue;
     }
-    
-    for (let idx = 0; idx < parseResult.stmts.length; idx++) {
-      const stmt = parseResult.stmts[idx];
-      let deparsedSql: string;
-      try {
-        deparsedSql = await deparse({ version: 170000, stmts: [stmt] });
-      } catch (err: any) {
-        console.error(`Failed to deparse statement ${idx + 1} in ${relPath}:`, err);
-        continue;
-      }
-      const key = `${relPath.replace(/\.sql$/, '')}-${idx + 1}.sql`;
-      results[key] = deparsedSql;
-    }
   }
 
   // Write aggregated JSON to output file
diff --git a/packages/deparser/scripts/make-upstream-diff.ts b/packages/deparser/scripts/make-upstream-diff.ts
@@ -0,0 +1,105 @@
+#!/usr/bin/env ts-node
+import * as path from 'path';
+import * as fs from 'fs';
+import { sync as globSync } from 'glob';
+import { parse, deparse } from 'libpg-query';
+import { ParseResult, RawStmt } from '@pgsql/types';
+import { deparse as ourDeparse } from '../src';
+import { cleanTree } from '../src/utils';
+import { splitStatements, generateStatementKey } from '../src/utils/statement-splitter';
+
+// Directory scanned for input .sql fixtures, resolved three levels above this script's directory.
+const FIXTURE_DIR = path.join(__dirname, '../../../__fixtures__/kitchen-sink');
+// Directory that receives the generated JSON (main writes upstream-diff.json here).
+const OUT_DIR = path.join(__dirname, '../../../__fixtures__/generated');
+
+/**
+ * Creates `dir` (including any missing parent directories) unless a path
+ * with that name already exists.
+ */
+function ensureDir(dir: string) {
+  const alreadyThere = fs.existsSync(dir);
+  if (alreadyThere) {
+    return;
+  }
+  fs.mkdirSync(dir, { recursive: true });
+}
+
+// Make sure the output directory exists before anything is written to it.
+ensureDir(OUT_DIR);
+
+// Paths matched by the '**/*.sql' pattern under FIXTURE_DIR.
+const fixtures = globSync(path.join(FIXTURE_DIR, '**/*.sql'));
+
+/**
+ * Builds `upstream-diff.json`.
+ *
+ * For every statement of every fixture file, the statement's AST is deparsed
+ * with both the upstream (libpg-query) deparser and this package's deparser.
+ * Each deparsed SQL string is parsed again, and its cleaned AST is compared to
+ * the cleaned AST of the original statement text. A statement is recorded when
+ * either deparser's AST differs from the original's, or when our deparser's
+ * output could not be produced or re-parsed.
+ *
+ * Failures while handling one statement or one file are logged and skipped so
+ * the remaining fixtures are still processed.
+ */
+async function main() {
+  // Collect only statements where at least one deparser disagrees with the original
+  const results: Record<string, { upstream?: string; deparsed?: string; original: string }> = {};
+
+  for (const fixturePath of fixtures) {
+    const relPath = path.relative(FIXTURE_DIR, fixturePath);
+    const sql = fs.readFileSync(fixturePath, 'utf-8');
+
+    try {
+      const statements = await splitStatements(sql);
+
+      // Parse the whole file once; the RawStmt for each extracted statement is
+      // looked up by index below instead of re-parsing the file per statement.
+      const parseResult = await parse(sql);
+
+      for (const stmt of statements) {
+        // We need the original statement to get the RawStmt for deparsing
+        const rawStmt = parseResult.stmts[stmt.index];
+
+        // Get source of truth: cleanTree(parse(original))
+        let sourceOfTruthAst: any;
+        try {
+          const originalParsed = await parse(stmt.statement);
+          sourceOfTruthAst = cleanTree(originalParsed.stmts?.[0]?.stmt);
+        } catch (err: any) {
+          console.error(`Failed to parse original SQL for statement ${stmt.index + 1} in ${relPath}:`, err);
+          continue;
+        }
+
+        // Get upstream deparse and its AST
+        let upstreamSql: string | undefined;
+        let upstreamAst: any;
+        try {
+          upstreamSql = await deparse({ version: 170000, stmts: [rawStmt] });
+          const upstreamParsed = await parse(upstreamSql);
+          upstreamAst = cleanTree(upstreamParsed.stmts?.[0]?.stmt);
+        } catch (err: any) {
+          console.error(`Failed to process upstream deparse for statement ${stmt.index + 1} in ${relPath}:`, err);
+          continue;
+        }
+
+        // Get our deparse and its AST
+        let ourDeparsedSql: string | undefined;
+        let ourAst: any;
+        let ourDeParseError = false;
+        try {
+          ourDeparsedSql = ourDeparse(rawStmt.stmt);
+          const ourParsed = await parse(ourDeparsedSql);
+          ourAst = cleanTree(ourParsed.stmts?.[0]?.stmt);
+        } catch (err: any) {
+          console.error(`Failed to process our deparse for statement ${stmt.index + 1} in ${relPath}:`, err);
+          ourDeParseError = true;
+          // Keep ourDeparsedSql so we can still show it in results even if it doesn't parse
+        }
+
+        // Compare ASTs to source of truth only
+        const upstreamMatches = JSON.stringify(upstreamAst) === JSON.stringify(sourceOfTruthAst);
+        const ourMatches = ourAst ? JSON.stringify(ourAst) === JSON.stringify(sourceOfTruthAst) : false;
+
+        // Only include if either deparser differs from original OR our deparser failed to parse
+        if (!upstreamMatches || !ourMatches || ourDeParseError) {
+          const key = generateStatementKey(relPath, stmt.index);
+          results[key] = {
+            original: stmt.statement,
+            // Show upstream only if it differs from original
+            ...(!upstreamMatches && upstreamSql && { upstream: upstreamSql }),
+            // Show our deparser if it differs from original OR if it failed to parse (both indicate issues)
+            ...((!ourMatches || ourDeParseError) && ourDeparsedSql && { deparsed: ourDeparsedSql })
+          };
+        }
+      }
+    } catch (err: any) {
+      console.error(`Failed to parse ${relPath}:`, err);
+      continue;
+    }
+  }
+
+  // Write aggregated JSON to output file
+  const outputFile = path.join(OUT_DIR, 'upstream-diff.json');
+  fs.writeFileSync(outputFile, JSON.stringify(results, null, 2));
+  console.log(`Wrote JSON to ${outputFile}`);
+}
+
+// Entry point: run main and pass any rejection to console.error.
+main().catch(console.error);
diff --git a/packages/deparser/src/deparser.ts b/packages/deparser/src/deparser.ts
@@ -5461,7 +5461,20 @@ export class Deparser implements DeparserVisitor {
           : argValue;
         return `${node.defname} = ${quotedValue}`;
       }
-      
+            
+      // Handle CopyStmt WITH clause options - uppercase format without quotes
+      if (context.parentNodeTypes.includes('CopyStmt')) {
+        if (node.defname === 'format' && node.arg && this.getNodeType(node.arg) === 'String') {
+          const stringData = this.getNodeData(node.arg);
+          return `FORMAT ${stringData.sval.toUpperCase()}`;
+        }
+        // Handle other COPY options with uppercase defname
+        if (node.arg) {
+          return `${node.defname.toUpperCase()} ${argValue}`;
+        }
+        return node.defname.toUpperCase();
+      }
+
       // Handle CREATE OPERATOR and CREATE TYPE context
       if (context.parentNodeTypes.includes('DefineStmt')) {
         const preservedName = this.preserveOperatorDefElemCase(node.defname);
diff --git a/packages/deparser/src/utils/statement-splitter.ts b/packages/deparser/src/utils/statement-splitter.ts
@@ -0,0 +1,177 @@
+import { parse } from 'libpg-query';
+import { ParseResult, RawStmt } from '@pgsql/types';
+
+/** One statement cut out of a larger SQL text, as produced by `splitStatements`. */
+export interface ExtractedStatement {
+  /** Text of the extracted statement. */
+  statement: string;
+  /** Zero-based position of the statement in the parser's `stmts` array. */
+  index: number;
+  /** The parser's `stmt_location` for this statement, when it reported one. */
+  location?: number;
+  /** The parser's `stmt_len` for this statement, when it reported one. */
+  length?: number;
+}
+
+/** Options accepted by `extractStatement` and `splitStatements`. */
+export interface StatementSplitterOptions {
+  /** When true, skip the final check that rejects obviously truncated or whitespace-only extractions */
+  skipValidation?: boolean;
+  /** Strip leading blank lines and `--` comment lines from extracted statements; applied unless explicitly set to `false` */
+  stripComments?: boolean;
+}
+
+/**
+ * Extracts a single statement from SQL using PostgreSQL's location information.
+ * Offsets are applied to the UTF-8 encoded bytes of `originalSQL` rather than to
+ * its characters, so multi-byte characters do not shift the extracted range.
+ *
+ * @param originalSQL - Full SQL text the statement was parsed from.
+ * @param rawStmt - Parser node whose `stmt_location` / `stmt_len` locate the statement.
+ * @param isFirst - Whether this is the first statement; a missing `stmt_location`
+ *   is then treated as offset 0.
+ * @param options - Controls leading-comment stripping and the final validation.
+ * @returns The extracted text, or `null` when no range can be determined or the
+ *   text fails validation. An empty extraction is returned as `''`, so callers
+ *   should test the result for falsiness rather than only for `null`.
+ */
+export function extractStatement(
+  originalSQL: string,
+  rawStmt: RawStmt,
+  isFirst: boolean = false,
+  options: StatementSplitterOptions = {}
+): string | null {
+  const { stmt_location: location, stmt_len: length } = rawStmt;
+  let extracted: string | null = null;
+
+  // Without a location, only the first statement can be placed (at offset 0)
+  if (location !== undefined || isFirst) {
+    if (location === undefined && length === undefined) {
+      // First statement with no position info at all: use the entire SQL
+      extracted = originalSQL;
+    } else {
+      // Convert to a buffer so the parser's byte offsets line up (for Unicode)
+      const sqlBuffer = Buffer.from(originalSQL, 'utf8');
+      const startByte = location ?? 0;
+      // A missing length means "from the start offset to the end of the input"
+      const endByte = length !== undefined ? startByte + length : undefined;
+      extracted = sqlBuffer.subarray(startByte, endByte).toString('utf8');
+    }
+  }
+
+  if (extracted && options.stripComments !== false) {
+    const lines = extracted.split('\n');
+
+    // Find the first line that is neither blank nor a `--` comment line
+    const firstSqlLine = lines.findIndex((line) => {
+      const text = line.trim();
+      return text !== '' && !text.startsWith('--');
+    });
+
+    // When every line is blank or a comment, nothing is dropped (start at line 0).
+    // The result is trimmed, so the first SQL line's own indentation is not kept.
+    extracted = lines.slice(Math.max(firstSqlLine, 0)).join('\n').trim();
+  }
+
+  // Final validation unless skipped
+  if (extracted && !options.skipValidation) {
+    const trimmed = extracted.trim();
+
+    // Only check for the most obvious malformed patterns at the BEGINNING
+    const looksTruncated =
+      trimmed.startsWith('ELECT ') || // Missing S from SELECT
+      trimmed.startsWith('REATE ') || // Missing C from CREATE
+      trimmed.startsWith('NSERT ');   // Missing I from INSERT
+
+    if (looksTruncated || trimmed.length === 0) {
+      return null; // Invalid extraction, skip this statement
+    }
+  }
+
+  return extracted;
+}
+
+/**
+ * Parses `sql` with PostgreSQL's parser and returns each statement's text
+ * together with its index and the location/length the parser reported.
+ * Statements whose extraction comes back empty or `null` are left out.
+ */
+export async function splitStatements(
+  sql: string,
+  options: StatementSplitterOptions = {}
+): Promise<ExtractedStatement[]> {
+  const parsed: ParseResult = await parse(sql);
+  const collected: ExtractedStatement[] = [];
+
+  (parsed.stmts || []).forEach((rawStmt, position) => {
+    const text = extractStatement(sql, rawStmt, position === 0, options);
+    if (!text) {
+      return;
+    }
+    collected.push({
+      statement: text,
+      index: position,
+      location: rawStmt.stmt_location,
+      length: rawStmt.stmt_len
+    });
+  });
+
+  return collected;
+}
+
+/**
+ * Builds the fixture key for one statement: the relative path without a
+ * trailing `.sql`, a dash, the one-based statement number, and the extension.
+ */
+export function generateStatementKey(
+  relativePath: string,
+  statementIndex: number,
+  extension: string = 'sql'
+): string {
+  const stem = relativePath.replace(/\.sql$/, '');
+  const ordinal = statementIndex + 1;
+  return `${stem}-${ordinal}.${extension}`;
+}
+
+/**
+ * Test utility: extracts the statement range twice, once with character
+ * offsets and once with UTF-8 byte offsets, and reports whether the two agree.
+ * Both extractions are empty strings when `stmt_location` or `stmt_len` is missing.
+ * `unicodeChars` is the difference between the input's byte length and its
+ * character length.
+ */
+export function debugUnicodeExtraction(sql: string, rawStmt: RawStmt): {
+  characterBased: string;
+  byteBased: string;
+  matches: boolean;
+  unicodeChars: number;
+  byteLength: number;
+  charLength: number;
+} {
+  const encoded = Buffer.from(sql, 'utf8');
+  const start = rawStmt.stmt_location;
+  const span = rawStmt.stmt_len;
+
+  let characterBased = '';
+  let byteBased = '';
+  if (start !== undefined && span !== undefined) {
+    const stop = start + span;
+    // Character-based extraction (old way)
+    characterBased = sql.substring(start, stop);
+    // Byte-based extraction (new way)
+    byteBased = encoded.subarray(start, stop).toString('utf8');
+  }
+
+  return {
+    characterBased,
+    byteBased,
+    matches: characterBased === byteBased,
+    unicodeChars: encoded.length - sql.length,
+    byteLength: encoded.length,
+    charLength: sql.length
+  };
+}

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-COPY (SELECT 1) TO 'test.csv' WITH (FORMAT 'CSV');`
	`1`	`+COPY (SELECT 1) TO '/test.csv' WITH (FORMAT CSV);`