diff --git a/.gitignore b/.gitignore
index 6a2a4d41..079f990d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ wasm/libpg-query.js
 *.wasm
 .cache
 esm/
-cjs/
\ No newline at end of file
+cjs/
+.claude
diff --git a/Makefile b/Makefile
index 0f467a0b..ba93a976 100644
--- a/Makefile
+++ b/Makefile
@@ -54,8 +54,9 @@ ifdef EMSCRIPTEN
 		-v \
 		$(CXXFLAGS) \
 		-I$(LIBPG_QUERY_DIR) \
+		-I$(LIBPG_QUERY_DIR)/vendor \
 		-L$(LIBPG_QUERY_DIR) \
-		-sEXPORTED_FUNCTIONS="['_malloc','_free','_wasm_parse_query','_wasm_parse_query_protobuf','_wasm_get_protobuf_len','_wasm_deparse_protobuf','_wasm_parse_plpgsql','_wasm_fingerprint','_wasm_normalize_query','_wasm_parse_query_detailed','_wasm_free_detailed_result','_wasm_free_string']" \
+		-sEXPORTED_FUNCTIONS="['_malloc','_free','_wasm_parse_query','_wasm_parse_query_protobuf','_wasm_get_protobuf_len','_wasm_deparse_protobuf','_wasm_parse_plpgsql','_wasm_fingerprint','_wasm_normalize_query','_wasm_scan','_wasm_parse_query_detailed','_wasm_free_detailed_result','_wasm_free_string']" \
 		-sEXPORTED_RUNTIME_METHODS="['lengthBytesUTF8','stringToUTF8','UTF8ToString','HEAPU8','HEAPU32']" \
 		-sEXPORT_NAME="$(WASM_MODULE_NAME)" \
 		-sENVIRONMENT="web,node" \
diff --git a/README.md b/README.md
index 45f1fb59..5e8b06c8 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,29 @@ const normalized = normalizeSync('SELECT * FROM users WHERE active = true');
 // Returns: string - normalized SQL query
 ```
 
+### `scan(sql: string): Promise<ScanResult>`
+
+Scans (tokenizes) a SQL query and returns detailed information about each token. Returns a Promise for a ScanResult containing all tokens with their positions, types, and classifications.
+
+```typescript
+import { scan } from 'libpg-query';
+
+const result = await scan('SELECT * FROM users WHERE id = $1');
+// Returns: ScanResult - detailed tokenization information
+console.log(result.tokens[0]); // { start: 0, end: 6, text: "SELECT", tokenType: 651, tokenName: "UNKNOWN", keywordKind: 4, keywordName: "RESERVED_KEYWORD" }
+```
+
+### `scanSync(sql: string): ScanResult`
+
+Synchronous version that scans (tokenizes) a SQL query directly.
+
+```typescript
+import { scanSync } from 'libpg-query';
+
+const result = scanSync('SELECT * FROM users WHERE id = $1');
+// Returns: ScanResult - detailed tokenization information
+```
+
 ### Initialization
 
 The library provides both async and sync methods. Async methods handle initialization automatically, while sync methods require explicit initialization.
@@ -166,11 +189,12 @@ The library provides both async and sync methods. Async methods handle initializ
 Async methods handle initialization automatically and are always safe to use:
 
 ```typescript
-import { parse, deparse } from 'libpg-query';
+import { parse, deparse, scan } from 'libpg-query';
 
 // These handle initialization automatically
 const result = await parse('SELECT * FROM users');
 const sql = await deparse(result);
+const tokens = await scan('SELECT * FROM users');
 ```
 
 #### Sync Methods
@@ -178,13 +202,14 @@ const sql = await deparse(result);
 Sync methods require explicit initialization using `loadModule()`:
 
 ```typescript
-import { loadModule, parseSync } from 'libpg-query';
+import { loadModule, parseSync, scanSync } from 'libpg-query';
 
 // Initialize first
 await loadModule();
 
 // Now safe to use sync methods
 const result = parseSync('SELECT * FROM users');
+const tokens = scanSync('SELECT * FROM users');
 ```
 
 ### `loadModule(): Promise<void>`
@@ -192,11 +217,12 @@ const result = parseSync('SELECT * FROM users');
 
 Explicitly initializes the WASM module. 
Required before using any sync methods. ```typescript -import { loadModule, parseSync } from 'libpg-query'; +import { loadModule, parseSync, scanSync } from 'libpg-query'; // Initialize before using sync methods await loadModule(); const result = parseSync('SELECT * FROM users'); +const tokens = scanSync('SELECT * FROM users'); ``` Note: We recommend using async methods as they handle initialization automatically. Use sync methods only when necessary, and always call `loadModule()` first. @@ -215,6 +241,21 @@ interface Statement { stmt_location: number; query: string; } + +interface ScanResult { + version: number; + tokens: ScanToken[]; +} + +interface ScanToken { + start: number; // Starting position in the SQL string + end: number; // Ending position in the SQL string + text: string; // The actual token text + tokenType: number; // Numeric token type identifier + tokenName: string; // Human-readable token type name + keywordKind: number; // Numeric keyword classification + keywordName: string; // Human-readable keyword classification +} ``` **Note:** The return value is an array, as multiple queries may be provided in a single string (semicolon-delimited, as PostgreSQL expects). diff --git a/SCAN.md b/SCAN.md new file mode 100644 index 00000000..aafbc4fe --- /dev/null +++ b/SCAN.md @@ -0,0 +1,439 @@ +# Scan API Documentation + +## Overview + +The scan API provides detailed tokenization of PostgreSQL SQL queries, returning information about each token including its position, type, and keyword classification. This document explains how to use the scan functionality and how it relates to the parsing API. + +## Basic Usage + +### Async Scanning +```typescript +import { scan } from 'libpg-query'; + +const result = await scan('SELECT id, name FROM users WHERE active = true'); +console.log(result.tokens); +``` + +### Sync Scanning +```typescript +import { scanSync, loadModule } from 'libpg-query'; + +// Initialize WASM module first +await loadModule(); + +const result = scanSync('SELECT id, name FROM users WHERE active = true'); +console.log(result.tokens); +``` + +## Token Information + +Each token in the scan result contains: + +```typescript +interface ScanToken { + start: number; // Starting position in the SQL string (0-based) + end: number; // Ending position in the SQL string (exclusive) + text: string; // The actual token text extracted from SQL + tokenType: number; // Numeric token type identifier + tokenName: string; // Human-readable token type name + keywordKind: number; // Numeric keyword classification + keywordName: string; // Human-readable keyword classification +} +``` + +## Token Types + +### Common Token Types + +| tokenName | Description | Example | +|-----------|-------------|---------| +| `IDENT` | Regular identifier | `users`, `id`, `column_name` | +| `SCONST` | String constant | `'hello world'`, `'value'` | +| `ICONST` | Integer constant | `42`, `123`, `0` | +| `FCONST` | Float constant | `3.14`, `2.718` | +| `PARAM` | Parameter marker | `$1`, `$2`, `$3` | +| `ASCII_40` | Left parenthesis | `(` | +| `ASCII_41` | Right parenthesis | `)` | +| `ASCII_42` | Asterisk | `*` | +| `ASCII_44` | Comma | `,` | +| `ASCII_59` | Semicolon | `;` | +| `ASCII_61` | Equals sign | `=` | +| `TYPECAST` | Type casting operator | `::` | +| `GREATER_EQUALS` | Greater than or equal | `>=` | +| `LESS_EQUALS` | Less than or equal | `<=` | +| `NOT_EQUALS` | Not equal operator | `<>`, `!=` | +| `SQL_COMMENT` | SQL comment | `-- comment` | +| `C_COMMENT` | C-style comment | `/* comment */` | +| 
`UNKNOWN` | Keywords and other tokens | `SELECT`, `FROM`, `WHERE` |
+
+### Keyword Classifications
+
+| keywordName | Description | Examples |
+|-------------|-------------|----------|
+| `NO_KEYWORD` | Not a keyword | identifiers, operators, literals |
+| `UNRESERVED_KEYWORD` | Unreserved keyword | `INSERT`, `UPDATE`, `name` |
+| `COL_NAME_KEYWORD` | Column name keyword | `VALUES`, `BETWEEN` |
+| `TYPE_FUNC_NAME_KEYWORD` | Type/function name keyword | `INNER`, `JOIN`, `IS` |
+| `RESERVED_KEYWORD` | Reserved keyword | `SELECT`, `FROM`, `WHERE` |
+
+## Relationship with Parse Tree
+
+The scan token positions directly correspond to `location` fields in the Abstract Syntax Tree (AST) produced by the parse API. This allows you to map between tokens and AST nodes.
+
+### Example Mapping
+
+Given this SQL:
+```sql
+SELECT id, name FROM users WHERE active = true
+```
+
+**Scan tokens:**
+```
+[0] "SELECT" (0-6)
+[1] "id" (7-9)
+[2] "," (9-10)
+[3] "name" (11-15)
+[4] "FROM" (16-20)
+[5] "users" (21-26)
+[6] "WHERE" (27-32)
+[7] "active" (33-39)
+[8] "=" (40-41)
+[9] "true" (42-46)
+```
+
+**Corresponding AST locations:**
+```json
+{
+  "SelectStmt": {
+    "targetList": [
+      {
+        "ResTarget": {
+          "val": {
+            "ColumnRef": {
+              "fields": [{"String": {"sval": "id"}}],
+              "location": 7  // matches "id" token at position 7-9
+            }
+          },
+          "location": 7
+        }
+      },
+      {
+        "ResTarget": {
+          "val": {
+            "ColumnRef": {
+              "fields": [{"String": {"sval": "name"}}],
+              "location": 11  // matches "name" token at position 11-15
+            }
+          },
+          "location": 11
+        }
+      }
+    ],
+    "fromClause": [
+      {
+        "RangeVar": {
+          "relname": "users",
+          "location": 21  // matches "users" token at position 21-26
+        }
+      }
+    ],
+    "whereClause": {
+      "A_Expr": {
+        "lexpr": {
+          "ColumnRef": {
+            "fields": [{"String": {"sval": "active"}}],
+            "location": 33  // matches "active" token at position 33-39
+          }
+        },
+        "rexpr": {
+          "A_Const": {
+            "boolval": {"boolval": true},
+            "location": 42  // matches "true" token at position 42-46
+          }
+        },
+        "location": 40  // matches "=" token at position 40-41
+      }
+    }
+  }
+}
+```
+
+### Token-to-AST Mapping Function
+
+Here's a utility function to map between scan tokens and AST nodes:
+
+```typescript
+import { scan, parse, ScanToken } from 'libpg-query';
+
+interface TokenASTMapping {
+  token: ScanToken;
+  astNodes: Array<{path: string; node: any}>;
+}
+
+async function mapTokensToAST(sql: string): Promise<TokenASTMapping[]> {
+  const [scanResult, parseResult] = await Promise.all([
+    scan(sql),
+    parse(sql)
+  ]);
+
+  // Collect all AST nodes with locations
+  const astNodes: Array<{path: string; location: number; node: any}> = [];
+
+  function traverse(obj: any, path: string = '') {
+    if (obj && typeof obj === 'object') {
+      if (typeof obj.location === 'number') {
+        astNodes.push({path, location: obj.location, node: obj});
+      }
+
+      for (const [key, value] of Object.entries(obj)) {
+        const newPath = path ? `${path}.${key}` : key;
+        traverse(value, newPath);
+      }
+    }
+  }
+
+  traverse(parseResult);
+
+  // Map tokens to AST nodes
+  return scanResult.tokens.map(token => {
+    const matchingNodes = astNodes.filter(astNode => {
+      // AST location points to start of token
+      return astNode.location >= token.start && astNode.location < token.end;
+    });
+
+    return {
+      token,
+      astNodes: matchingNodes.map(n => ({path: n.path, node: n.node}))
+    };
+  });
+}
+
+// Usage
+const mappings = await mapTokensToAST('SELECT id FROM users WHERE active = true');
+mappings.forEach(({token, astNodes}) => {
+  console.log(`Token "${token.text}" (${token.start}-${token.end}):`);
+  astNodes.forEach(({path, node}) => {
+    console.log(`  AST: ${path} at location ${node.location}`);
+  });
+});
+```
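+
+One handy inverse of this mapping is looking up the token that covers a given AST `location`. A minimal sketch, assuming only the `ScanToken` shape defined above (the `tokenAtLocation` helper name is illustrative, not part of the library):
+
+```typescript
+import { scan, ScanToken } from 'libpg-query';
+
+async function tokenAtLocation(sql: string, location: number): Promise<ScanToken | undefined> {
+  const { tokens } = await scan(sql);
+  // Positions are 0-based with exclusive ends, so an AST location falls
+  // inside at most one token (or none, e.g. when it points at whitespace).
+  return tokens.find(t => location >= t.start && location < t.end);
+}
+
+// tokenAtLocation('SELECT id, name FROM users WHERE active = true', 11)
+//   => { start: 11, end: 15, text: "name", ... }
+```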
+
+## Use Cases
+
+### 1. Syntax Highlighting
+
+Use scan results to apply syntax highlighting in code editors:
+
+```typescript
+function applySyntaxHighlighting(sql: string, tokens: ScanToken[]): string {
+  let highlighted = '';
+  let lastEnd = 0;
+
+  for (const token of tokens) {
+    // Add any whitespace between tokens
+    highlighted += sql.substring(lastEnd, token.start);
+
+    // Apply highlighting based on token type
+    const cssClass = getHighlightClass(token);
+    highlighted += `<span class="${cssClass}">${token.text}</span>`;
+
+    lastEnd = token.end;
+  }
+
+  // Add remaining SQL
+  highlighted += sql.substring(lastEnd);
+
+  return highlighted;
+}
+
+function getHighlightClass(token: ScanToken): string {
+  if (token.keywordName === 'RESERVED_KEYWORD') return 'sql-keyword';
+  if (token.tokenName === 'SCONST') return 'sql-string';
+  if (token.tokenName === 'ICONST' || token.tokenName === 'FCONST') return 'sql-number';
+  if (token.tokenName === 'PARAM') return 'sql-parameter';
+  if (token.tokenName === 'SQL_COMMENT' || token.tokenName === 'C_COMMENT') return 'sql-comment';
+  return 'sql-default';
+}
+```
+
+### 2. Parameter Extraction
+
+Extract all parameters from a query:
+
+```typescript
+function extractParameters(sql: string): Array<{param: string; position: number}> {
+  const result = scanSync(sql);
+  return result.tokens
+    .filter(token => token.tokenName === 'PARAM')
+    .map(token => ({
+      param: token.text,
+      position: token.start
+    }));
+}
+
+const params = extractParameters('SELECT * FROM users WHERE id = $1 AND name = $2');
+// Returns: [{param: '$1', position: 31}, {param: '$2', position: 45}]
+```
+
+### 3. Query Complexity Analysis
+
+Analyze query complexity based on token types:
+
+```typescript
+function analyzeQueryComplexity(sql: string): {
+  joins: number;
+  subqueries: number;
+  parameters: number;
+  aggregates: string[];
+  totalTokens: number;
+} {
+  const result = scanSync(sql);
+  const tokens = result.tokens;
+
+  const joins = tokens.filter(t =>
+    t.text.toUpperCase() === 'JOIN' ||
+    t.text.toUpperCase() === 'INNER' ||
+    t.text.toUpperCase() === 'LEFT' ||
+    t.text.toUpperCase() === 'RIGHT'
+  ).length;
+
+  const subqueries = tokens.filter(t => t.text.toUpperCase() === 'SELECT').length - 1;
+
+  const parameters = tokens.filter(t => t.tokenName === 'PARAM').length;
+
+  const aggregates = tokens
+    .filter(t => ['COUNT', 'SUM', 'AVG', 'MIN', 'MAX'].includes(t.text.toUpperCase()))
+    .map(t => t.text.toUpperCase());
+
+  return {
+    joins,
+    subqueries: Math.max(0, subqueries),
+    parameters,
+    aggregates,
+    totalTokens: tokens.length
+  };
+}
+```
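+
+For example, running the analyzer above on a single-join query (a usage sketch; the exact token stream depends on the libpg_query version, but the counts below follow directly from the filters in `analyzeQueryComplexity`):
+
+```typescript
+const complexity = analyzeQueryComplexity(
+  'SELECT COUNT(*) FROM orders o INNER JOIN users u ON o.user_id = u.id WHERE o.status = $1'
+);
+// complexity.joins       => 2  (both the INNER and the JOIN token are counted)
+// complexity.subqueries  => 0  (one SELECT token, minus one for the outer query)
+// complexity.parameters  => 1  ($1)
+// complexity.aggregates  => ['COUNT']
+```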
+### 4. SQL Formatting
+
+Use token positions for intelligent SQL formatting:
+
+```typescript
+function formatSQL(sql: string): string {
+  const result = scanSync(sql);
+  let formatted = '';
+  let indentLevel = 0;
+  let lastEnd = 0;
+
+  for (const token of result.tokens) {
+    // Add whitespace between tokens
+    const gap = sql.substring(lastEnd, token.start);
+
+    // Format based on token type
+    if (token.text.toUpperCase() === 'SELECT') {
+      formatted += '\n' + ' '.repeat(indentLevel) + token.text;
+    } else if (token.text.toUpperCase() === 'FROM') {
+      formatted += '\n' + ' '.repeat(indentLevel) + token.text;
+    } else if (token.text.toUpperCase() === 'WHERE') {
+      formatted += '\n' + ' '.repeat(indentLevel) + token.text;
+    } else if (token.text === '(') {
+      formatted += token.text;
+      indentLevel++;
+    } else if (token.text === ')') {
+      indentLevel--;
+      formatted += token.text;
+    } else {
+      // Any gap in the input collapses to a single space
+      formatted += (gap.length > 0 ? ' ' : '') + token.text;
+    }
+
+    lastEnd = token.end;
+  }
+
+  return formatted.trim();
+}
+```
+
+## Performance Considerations
+
+- Scanning is generally faster than full parsing
+- For large SQL strings, consider streaming or chunked processing
+- Token positions are 0-based and use exclusive end positions
+- The scan operation is stateless and thread-safe
+
+## Error Handling
+
+The scan API is more permissive than the parse API and will attempt to tokenize even malformed SQL:
+
+```typescript
+try {
+  const result = scanSync('SELECT * FROM invalid$$$');
+  // May still return tokens for recognizable parts
+  console.log(result.tokens);
+} catch (error) {
+  console.error('Scan error:', error.message);
+}
+```
+
+## Integration with Other Tools
+
+### ESLint Rules
+Create custom ESLint rules for SQL:
+
+```typescript
+function createSQLLintRule() {
+  return {
+    meta: {
+      type: 'problem',
+      docs: { description: 'Detect SQL injection risks' }
+    },
+    create(context) {
+      return {
+        TemplateLiteral(node) {
+          if (node.quasis.length > 0) {
+            const sql = node.quasis[0].value.raw;
+            const { tokens } = scanSync(sql);
+
+            // Check for unparameterized string concatenation
+            const hasStringLiterals = tokens.some(t => t.tokenName === 'SCONST');
+            const hasParameters = tokens.some(t => t.tokenName === 'PARAM');
+
+            if (hasStringLiterals && !hasParameters) {
+              context.report({
+                node,
+                message: 'Potential SQL injection: use parameterized queries'
+              });
+            }
+          }
+        }
+      };
+    }
+  };
+}
+```
+
+### Database Migration Tools
+Analyze schema changes:
+
+```typescript
+function detectSchemaChanges(oldSQL: string, newSQL: string): string[] {
+  const oldTokens = scanSync(oldSQL);
+  const newTokens = scanSync(newSQL);
+
+  const changes: string[] = [];
+
+  // Detect table name changes (extractTableNames is sketched at the end of this document)
+  const oldTables = extractTableNames(oldTokens);
+  const newTables = extractTableNames(newTokens);
+
+  const addedTables = newTables.filter(t => !oldTables.includes(t));
+  const removedTables = oldTables.filter(t => !newTables.includes(t));
+
+  changes.push(...addedTables.map(t => `Added table: ${t}`));
+  changes.push(...removedTables.map(t => `Removed table: ${t}`));
+
+  return changes;
+}
+```
+
+This comprehensive relationship between scan tokens and AST locations enables powerful SQL analysis, transformation, and tooling capabilities.
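+
+As a final note, the migration example above assumed an `extractTableNames` helper. One minimal way to write it, given only the `ScanResult` shape from this document, is a positional heuristic that collects identifiers directly following a table-introducing keyword (a sketch, not part of the library; it misses schema-qualified and quoted names):
+
+```typescript
+function extractTableNames(result: ScanResult): string[] {
+  const tableNames: string[] = [];
+  const openers = ['FROM', 'JOIN', 'INTO', 'UPDATE', 'TABLE'];
+
+  for (let i = 1; i < result.tokens.length; i++) {
+    const prev = result.tokens[i - 1].text.toUpperCase();
+    if (result.tokens[i].tokenName === 'IDENT' && openers.includes(prev)) {
+      tableNames.push(result.tokens[i].text);
+    }
+  }
+
+  // De-duplicate while preserving first-seen order
+  return [...new Set(tableNames)];
+}
+```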
\ No newline at end of file diff --git a/libpg_query.md b/libpg_query.md new file mode 100644 index 00000000..2b4e4e8b --- /dev/null +++ b/libpg_query.md @@ -0,0 +1,558 @@ +# libpg_query API Documentation + +This document provides comprehensive documentation for the libpg_query API, focusing on the core parsing, scanning, and deparsing functionality. + +## Overview + +libpg_query is a C library that provides PostgreSQL SQL parsing functionality. It exposes the PostgreSQL parser as a standalone library, allowing you to parse SQL statements into parse trees, scan for tokens, and deparse parse trees back into SQL. + +## Core Data Structures + +### PgQueryError +```c +typedef struct { + char* message; // exception message + char* funcname; // source function of exception (e.g. SearchSysCache) + char* filename; // source of exception (e.g. parse.l) + int lineno; // source of exception (e.g. 104) + int cursorpos; // char in query at which exception occurred + char* context; // additional context (optional, can be NULL) +} PgQueryError; +``` + +### PgQueryProtobuf +```c +typedef struct { + size_t len; + char* data; +} PgQueryProtobuf; +``` + +### Parser Options +```c +typedef enum { + PG_QUERY_PARSE_DEFAULT = 0, + PG_QUERY_PARSE_TYPE_NAME, + PG_QUERY_PARSE_PLPGSQL_EXPR, + PG_QUERY_PARSE_PLPGSQL_ASSIGN1, + PG_QUERY_PARSE_PLPGSQL_ASSIGN2, + PG_QUERY_PARSE_PLPGSQL_ASSIGN3 +} PgQueryParseMode; +``` + +### Parser Option Flags +- `PG_QUERY_DISABLE_BACKSLASH_QUOTE` (16) - backslash_quote = off +- `PG_QUERY_DISABLE_STANDARD_CONFORMING_STRINGS` (32) - standard_conforming_strings = off +- `PG_QUERY_DISABLE_ESCAPE_STRING_WARNING` (64) - escape_string_warning = off + +## Core API Functions + +### Scanning Functions + +#### pg_query_scan +```c +PgQueryScanResult pg_query_scan(const char* input); +``` +**Description**: Scans SQL input and returns tokens in protobuf format. + +**Parameters**: +- `input`: SQL string to scan + +**Returns**: `PgQueryScanResult` containing: +- `pbuf`: Protobuf data with scan results +- `stderr_buffer`: Any stderr output during scanning +- `error`: Error information if scanning failed + +**Usage**: Use this when you need to tokenize SQL without full parsing. + +## Scanning and Token Processing + +### Working with Scan Results + +The `pg_query_scan` function returns tokens in protobuf format that need to be unpacked to access individual tokens. 
Here's the complete workflow: + +### Step 1: Scan SQL +```c +const char* sql = "SELECT * FROM users WHERE id = $1"; +PgQueryScanResult result = pg_query_scan(sql); + +if (result.error) { + printf("Scan error: %s at position %d\n", + result.error->message, result.error->cursorpos); + pg_query_free_scan_result(result); + return; +} +``` + +### Step 2: Unpack Protobuf Data +```c +#include "protobuf/pg_query.pb-c.h" + +PgQuery__ScanResult *scan_result = pg_query__scan_result__unpack( + NULL, // Use default allocator + result.pbuf.len, // Length of protobuf data + (void *) result.pbuf.data // Protobuf data +); + +printf("Found %zu tokens\n", scan_result->n_tokens); +``` + +### Step 3: Process Individual Tokens +```c +for (size_t i = 0; i < scan_result->n_tokens; i++) { + PgQuery__ScanToken *token = scan_result->tokens[i]; + + // Extract token text from original SQL + int token_length = token->end - token->start; + char token_text[token_length + 1]; + strncpy(token_text, &sql[token->start], token_length); + token_text[token_length] = '\0'; + + // Get token type name + const ProtobufCEnumValue *token_kind = + protobuf_c_enum_descriptor_get_value(&pg_query__token__descriptor, token->token); + + // Get keyword classification + const ProtobufCEnumValue *keyword_kind = + protobuf_c_enum_descriptor_get_value(&pg_query__keyword_kind__descriptor, token->keyword_kind); + + printf("Token %zu: \"%s\" [%d-%d] Type: %s, Keyword: %s\n", + i, token_text, token->start, token->end, + token_kind->name, keyword_kind->name); +} +``` + +### Step 4: Clean Up Memory +```c +// Free the unpacked protobuf data +pg_query__scan_result__free_unpacked(scan_result, NULL); + +// Free the original scan result +pg_query_free_scan_result(result); +``` + +## Token Structure Details + +### PgQuery__ScanResult Structure +```c +struct PgQuery__ScanResult { + ProtobufCMessage base; + int32_t version; // Protocol version + size_t n_tokens; // Number of tokens + PgQuery__ScanToken **tokens; // Array of token pointers +}; +``` + +### PgQuery__ScanToken Structure +```c +struct PgQuery__ScanToken { + ProtobufCMessage base; + int32_t start; // Starting position in SQL string + int32_t end; // Ending position in SQL string + PgQuery__Token token; // Token type enum + PgQuery__KeywordKind keyword_kind; // Keyword classification +}; +``` + +## Token Types and Classifications + +### Keyword Classifications (PgQuery__KeywordKind) +- `PG_QUERY__KEYWORD_KIND__NO_KEYWORD` (0) - Not a keyword +- `PG_QUERY__KEYWORD_KIND__UNRESERVED_KEYWORD` (1) - Unreserved keyword +- `PG_QUERY__KEYWORD_KIND__COL_NAME_KEYWORD` (2) - Column name keyword +- `PG_QUERY__KEYWORD_KIND__TYPE_FUNC_NAME_KEYWORD` (3) - Type/function name keyword +- `PG_QUERY__KEYWORD_KIND__RESERVED_KEYWORD` (4) - Reserved keyword + +### Common Token Types (PgQuery__Token) +**Special/Control Tokens:** +- `PG_QUERY__TOKEN__NUL` (0) - Null token + +**Single-Character Operators:** +- `PG_QUERY__TOKEN__ASCII_40` (40) - "(" +- `PG_QUERY__TOKEN__ASCII_41` (41) - ")" +- `PG_QUERY__TOKEN__ASCII_42` (42) - "*" +- `PG_QUERY__TOKEN__ASCII_44` (44) - "," +- `PG_QUERY__TOKEN__ASCII_59` (59) - ";" +- `PG_QUERY__TOKEN__ASCII_61` (61) - "=" + +**Named Lexical Tokens:** +- `PG_QUERY__TOKEN__IDENT` (258) - Regular identifier +- `PG_QUERY__TOKEN__SCONST` (261) - String constant +- `PG_QUERY__TOKEN__ICONST` (266) - Integer constant +- `PG_QUERY__TOKEN__FCONST` (260) - Float constant +- `PG_QUERY__TOKEN__PARAM` (267) - Parameter marker ($1, $2, etc.) 
+
+**Multi-Character Operators:**
+- `PG_QUERY__TOKEN__TYPECAST` (268) - "::"
+- `PG_QUERY__TOKEN__DOT_DOT` (269) - ".."
+- `PG_QUERY__TOKEN__LESS_EQUALS` (272) - "<="
+- `PG_QUERY__TOKEN__GREATER_EQUALS` (273) - ">="
+- `PG_QUERY__TOKEN__NOT_EQUALS` (274) - "!=" or "<>"
+
+**Common SQL Keywords:**
+- `PG_QUERY__TOKEN__SELECT` - SELECT keyword
+- `PG_QUERY__TOKEN__FROM` - FROM keyword
+- `PG_QUERY__TOKEN__WHERE` - WHERE keyword
+- `PG_QUERY__TOKEN__INSERT` - INSERT keyword
+- `PG_QUERY__TOKEN__UPDATE` - UPDATE keyword
+- `PG_QUERY__TOKEN__DELETE` - DELETE keyword
+
+**Comments:**
+- `PG_QUERY__TOKEN__SQL_COMMENT` (275) - SQL-style comment (-- comment)
+- `PG_QUERY__TOKEN__C_COMMENT` (276) - C-style comment (/* comment */)
+
+## Protobuf Helper Functions
+
+### Unpacking Functions
+```c
+// Unpack scan result
+PgQuery__ScanResult *pg_query__scan_result__unpack(
+    ProtobufCAllocator *allocator,  // NULL for default
+    size_t len,                     // Length of data
+    const uint8_t *data             // Protobuf data
+);
+
+// Free unpacked scan result
+void pg_query__scan_result__free_unpacked(
+    PgQuery__ScanResult *message,   // Message to free
+    ProtobufCAllocator *allocator   // NULL for default
+);
+```
+
+### Enum Value Lookup Functions
+```c
+// Get token type name
+const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value(
+    const ProtobufCEnumDescriptor *desc,  // &pg_query__token__descriptor
+    int value                             // token->token
+);
+
+// Get keyword kind name
+const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value(
+    const ProtobufCEnumDescriptor *desc,  // &pg_query__keyword_kind__descriptor
+    int value                             // token->keyword_kind
+);
+```
+
+## Complete Example: SQL Tokenizer
+
+```c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pg_query.h>
+#include "protobuf/pg_query.pb-c.h"
+
+void print_tokens(const char* sql) {
+    PgQueryScanResult result = pg_query_scan(sql);
+
+    if (result.error) {
+        printf("Error: %s at position %d\n",
+               result.error->message, result.error->cursorpos);
+        pg_query_free_scan_result(result);
+        return;
+    }
+
+    PgQuery__ScanResult *scan_result = pg_query__scan_result__unpack(
+        NULL, result.pbuf.len, (void *) result.pbuf.data);
+
+    printf("SQL: %s\n", sql);
+    printf("Tokens (%zu):\n", scan_result->n_tokens);
+
+    for (size_t i = 0; i < scan_result->n_tokens; i++) {
+        PgQuery__ScanToken *token = scan_result->tokens[i];
+
+        // Extract token text
+        int len = token->end - token->start;
+        printf("  [%zu] \"%.*s\" (%d-%d) ",
+               i, len, &sql[token->start], token->start, token->end);
+
+        // Get token type
+        const ProtobufCEnumValue *token_kind =
+            protobuf_c_enum_descriptor_get_value(&pg_query__token__descriptor, token->token);
+        printf("Type: %s", token_kind->name);
+
+        // Get keyword classification if applicable
+        if (token->keyword_kind != PG_QUERY__KEYWORD_KIND__NO_KEYWORD) {
+            const ProtobufCEnumValue *keyword_kind =
+                protobuf_c_enum_descriptor_get_value(&pg_query__keyword_kind__descriptor, token->keyword_kind);
+            printf(", Keyword: %s", keyword_kind->name);
+        }
+        printf("\n");
+    }
+
+    pg_query__scan_result__free_unpacked(scan_result, NULL);
+    pg_query_free_scan_result(result);
+}
+
+int main() {
+    print_tokens("SELECT * FROM users WHERE id = $1");
+    print_tokens("INSERT INTO table VALUES (1, 'text', 3.14)");
+    print_tokens("-- Comment\nUPDATE table SET col = col + 1");
+
+    pg_query_exit();
+    return 0;
+}
+```
+
+## Build Requirements
+
+To use scanning functionality, compile with:
+```bash
+gcc -I. 
-I./protobuf your_program.c -lpg_query -lprotobuf-c +``` + +Make sure to include: +- `pg_query.h` - Main API header +- `protobuf/pg_query.pb-c.h` - Protobuf definitions + +### Parsing Functions + +#### pg_query_parse +```c +PgQueryParseResult pg_query_parse(const char* input); +``` +**Description**: Parses SQL input into a JSON parse tree. + +**Parameters**: +- `input`: SQL string to parse + +**Returns**: `PgQueryParseResult` containing: +- `parse_tree`: JSON representation of the parse tree +- `stderr_buffer`: Any stderr output during parsing +- `error`: Error information if parsing failed + +#### pg_query_parse_opts +```c +PgQueryParseResult pg_query_parse_opts(const char* input, int parser_options); +``` +**Description**: Parses SQL input with custom parser options. + +**Parameters**: +- `input`: SQL string to parse +- `parser_options`: Bitwise OR of parser options and flags + +**Returns**: Same as `pg_query_parse` + +#### pg_query_parse_protobuf +```c +PgQueryProtobufParseResult pg_query_parse_protobuf(const char* input); +``` +**Description**: Parses SQL input into protobuf format parse tree. + +**Parameters**: +- `input`: SQL string to parse + +**Returns**: `PgQueryProtobufParseResult` containing: +- `parse_tree`: Protobuf representation of the parse tree +- `stderr_buffer`: Any stderr output during parsing +- `error`: Error information if parsing failed + +#### pg_query_parse_protobuf_opts +```c +PgQueryProtobufParseResult pg_query_parse_protobuf_opts(const char* input, int parser_options); +``` +**Description**: Parses SQL input into protobuf format with custom options. + +**Parameters**: +- `input`: SQL string to parse +- `parser_options`: Bitwise OR of parser options and flags + +**Returns**: Same as `pg_query_parse_protobuf` + +#### pg_query_parse_plpgsql +```c +PgQueryPlpgsqlParseResult pg_query_parse_plpgsql(const char* input); +``` +**Description**: Parses PL/pgSQL code. + +**Parameters**: +- `input`: PL/pgSQL code to parse + +**Returns**: `PgQueryPlpgsqlParseResult` containing: +- `plpgsql_funcs`: JSON representation of PL/pgSQL functions +- `error`: Error information if parsing failed + +### Deparsing Functions + +#### pg_query_deparse_protobuf +```c +PgQueryDeparseResult pg_query_deparse_protobuf(PgQueryProtobuf parse_tree); +``` +**Description**: Converts a protobuf parse tree back into SQL. + +**Parameters**: +- `parse_tree`: Protobuf parse tree to deparse + +**Returns**: `PgQueryDeparseResult` containing: +- `query`: Deparsed SQL string +- `error`: Error information if deparsing failed + +**Usage**: Use this to convert parse trees back to SQL, useful for query transformation. + +### Utility Functions + +#### pg_query_normalize +```c +PgQueryNormalizeResult pg_query_normalize(const char* input); +``` +**Description**: Normalizes a SQL query by removing comments and standardizing formatting. + +**Parameters**: +- `input`: SQL string to normalize + +**Returns**: `PgQueryNormalizeResult` containing: +- `normalized_query`: Normalized SQL string +- `error`: Error information if normalization failed + +#### pg_query_normalize_utility +```c +PgQueryNormalizeResult pg_query_normalize_utility(const char* input); +``` +**Description**: Normalizes utility statements (DDL, etc.). + +**Parameters**: +- `input`: SQL string to normalize + +**Returns**: Same as `pg_query_normalize` + +#### pg_query_fingerprint +```c +PgQueryFingerprintResult pg_query_fingerprint(const char* input); +``` +**Description**: Generates a fingerprint for a SQL query. 
+ +**Parameters**: +- `input`: SQL string to fingerprint + +**Returns**: `PgQueryFingerprintResult` containing: +- `fingerprint`: 64-bit fingerprint hash +- `fingerprint_str`: String representation of fingerprint +- `stderr_buffer`: Any stderr output +- `error`: Error information if fingerprinting failed + +#### pg_query_fingerprint_opts +```c +PgQueryFingerprintResult pg_query_fingerprint_opts(const char* input, int parser_options); +``` +**Description**: Generates a fingerprint with custom parser options. + +**Parameters**: +- `input`: SQL string to fingerprint +- `parser_options`: Bitwise OR of parser options and flags + +**Returns**: Same as `pg_query_fingerprint` + +### Statement Splitting Functions + +#### pg_query_split_with_scanner +```c +PgQuerySplitResult pg_query_split_with_scanner(const char *input); +``` +**Description**: Splits multi-statement SQL using the scanner. Use when statements may contain parse errors. + +**Parameters**: +- `input`: SQL string containing multiple statements + +**Returns**: `PgQuerySplitResult` containing: +- `stmts`: Array of statement locations and lengths +- `n_stmts`: Number of statements found +- `stderr_buffer`: Any stderr output +- `error`: Error information if splitting failed + +#### pg_query_split_with_parser +```c +PgQuerySplitResult pg_query_split_with_parser(const char *input); +``` +**Description**: Splits multi-statement SQL using the parser (recommended for better accuracy). + +**Parameters**: +- `input`: SQL string containing multiple statements + +**Returns**: Same as `pg_query_split_with_scanner` + +## Memory Management + +### Cleanup Functions +All result structures must be freed using their corresponding cleanup functions: + +```c +void pg_query_free_normalize_result(PgQueryNormalizeResult result); +void pg_query_free_scan_result(PgQueryScanResult result); +void pg_query_free_parse_result(PgQueryParseResult result); +void pg_query_free_split_result(PgQuerySplitResult result); +void pg_query_free_deparse_result(PgQueryDeparseResult result); +void pg_query_free_protobuf_parse_result(PgQueryProtobufParseResult result); +void pg_query_free_plpgsql_parse_result(PgQueryPlpgsqlParseResult result); +void pg_query_free_fingerprint_result(PgQueryFingerprintResult result); +``` + +### Global Cleanup +```c +void pg_query_exit(void); +``` +**Description**: Optional cleanup of top-level memory context. Automatically done for threads that exit. + +## Error Handling + +All functions return result structures that include an `error` field. 
Always check this field before using other result data:
+
+```c
+PgQueryParseResult result = pg_query_parse(sql);
+if (result.error) {
+    printf("Parse error: %s\n", result.error->message);
+    printf("Location: %s:%d\n", result.error->filename, result.error->lineno);
+    if (result.error->cursorpos > 0) {
+        printf("Position: %d\n", result.error->cursorpos);
+    }
+} else {
+    // Use result.parse_tree
+}
+pg_query_free_parse_result(result);
+```
+
+## Example Usage
+
+### Basic Parsing
+```c
+#include "pg_query.h"
+
+const char* sql = "SELECT * FROM users WHERE id = $1";
+PgQueryParseResult result = pg_query_parse(sql);
+
+if (result.error) {
+    printf("Error: %s\n", result.error->message);
+} else {
+    printf("Parse tree: %s\n", result.parse_tree);
+}
+
+pg_query_free_parse_result(result);
+```
+
+### Parse and Deparse Cycle
+```c
+// Parse to protobuf
+PgQueryProtobufParseResult parse_result = pg_query_parse_protobuf(sql);
+if (!parse_result.error) {
+    // Deparse back to SQL
+    PgQueryDeparseResult deparse_result = pg_query_deparse_protobuf(parse_result.parse_tree);
+    if (!deparse_result.error) {
+        printf("Deparsed: %s\n", deparse_result.query);
+    }
+    pg_query_free_deparse_result(deparse_result);
+}
+pg_query_free_protobuf_parse_result(parse_result);
+```
+
+## Version Information
+
+- PostgreSQL Version: 17.4 (PG_VERSION_NUM: 170004)
+- Major Version: 17
+
+## Notes
+
+- The library is thread-safe
+- Always free result structures to avoid memory leaks
+- Use protobuf format for better performance when doing parse/deparse cycles
+- Scanner-based splitting is more robust for malformed SQL
+- Parser-based splitting is more accurate for well-formed SQL
\ No newline at end of file
diff --git a/src/index.ts b/src/index.ts
index c45bb014..f5fdd84d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,6 +1,21 @@
 import { ParseResult } from "@pgsql/types";
 export * from "@pgsql/types";
+export interface ScanToken {
+  start: number;
+  end: number;
+  text: string;
+  tokenType: number;
+  tokenName: string;
+  keywordKind: number;
+  keywordName: string;
+}
+
+export interface ScanResult {
+  version: number;
+  tokens: ScanToken[];
+}
+
 // @ts-ignore
 import PgQueryModule from './libpg-query.js';
 // @ts-ignore
@@ -15,6 +30,7 @@ interface WasmModule {
   _wasm_parse_plpgsql: (queryPtr: number) => number;
   _wasm_fingerprint: (queryPtr: number) => number;
   _wasm_normalize_query: (queryPtr: number) => number;
+  _wasm_scan: (queryPtr: number) => number;
   lengthBytesUTF8: (str: string) => number;
   stringToUTF8: (str: string, ptr: number, len: number) => void;
   UTF8ToString: (ptr: number) => string;
@@ -296,4 +312,49 @@ export function normalizeSync(query: string): string {
       wasmModule._wasm_free_string(resultPtr);
     }
   }
+}
+
+export const scan = awaitInit(async (query: string): Promise<ScanResult> => {
+  const queryPtr = stringToPtr(query);
+  let resultPtr = 0;
+
+  try {
+    resultPtr = wasmModule._wasm_scan(queryPtr);
+    const resultStr = ptrToString(resultPtr);
+
+    if (resultStr.startsWith('syntax error') || resultStr.startsWith('deparse error') || resultStr.includes('ERROR')) {
+      throw new Error(resultStr);
+    }
+
+    return JSON.parse(resultStr);
+  } finally {
+    wasmModule._free(queryPtr);
+    if (resultPtr) {
+      wasmModule._wasm_free_string(resultPtr);
+    }
+  }
+});
+
+export function scanSync(query: string): ScanResult {
+  if (!wasmModule) {
+    throw new Error('WASM module not initialized. Call loadModule() first.');
+  }
+  const queryPtr = stringToPtr(query);
+  let resultPtr = 0;
+
+  try {
+    resultPtr = wasmModule._wasm_scan(queryPtr);
+    const resultStr = ptrToString(resultPtr);
+
+    if (resultStr.startsWith('syntax error') || resultStr.startsWith('deparse error') || resultStr.includes('ERROR')) {
+      throw new Error(resultStr);
+    }
+
+    return JSON.parse(resultStr);
+  } finally {
+    wasmModule._free(queryPtr);
+    if (resultPtr) {
+      wasmModule._wasm_free_string(resultPtr);
+    }
+  }
 }
\ No newline at end of file
diff --git a/src/wasm_wrapper.c b/src/wasm_wrapper.c
index 9e1b02c6..9b6db3ff 100644
--- a/src/wasm_wrapper.c
+++ b/src/wasm_wrapper.c
@@ -1,4 +1,5 @@
 #include "pg_query.h"
+#include "protobuf/pg_query.pb-c.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -281,6 +282,167 @@ void wasm_free_detailed_result(WasmDetailedResult* result) {
     }
 }
 
+static const char* get_token_name(PgQuery__Token token_type) {
+    // Map some common token types to readable names
+    // Note: This is a simplified mapping - full enum lookup would require more complexity
+    switch(token_type) {
+        case 258: return "IDENT";
+        case 261: return "SCONST";
+        case 266: return "ICONST";
+        case 260: return "FCONST";
+        case 267: return "PARAM";
+        case 40: return "ASCII_40";  // (
+        case 41: return "ASCII_41";  // )
+        case 42: return "ASCII_42";  // *
+        case 44: return "ASCII_44";  // ,
+        case 59: return "ASCII_59";  // ;
+        case 61: return "ASCII_61";  // =
+        case 268: return "TYPECAST";
+        case 272: return "LESS_EQUALS";
+        case 273: return "GREATER_EQUALS";
+        case 274: return "NOT_EQUALS";
+        case 275: return "SQL_COMMENT";
+        case 276: return "C_COMMENT";
+        default: return "UNKNOWN";
+    }
+}
+
+static const char* get_keyword_name(PgQuery__KeywordKind keyword_kind) {
+    switch(keyword_kind) {
+        case 0: return "NO_KEYWORD";
+        case 1: return "UNRESERVED_KEYWORD";
+        case 2: return "COL_NAME_KEYWORD";
+        case 3: return "TYPE_FUNC_NAME_KEYWORD";
+        case 4: return "RESERVED_KEYWORD";
+        default: return "UNKNOWN_KEYWORD";
+    }
+}
+
+static char* build_scan_json(PgQuery__ScanResult *scan_result, const char* original_sql) {
+    if (!scan_result || !original_sql) {
+        return safe_strdup("{\"version\":0,\"tokens\":[]}");
+    }
+
+    // Calculate rough JSON size estimate
+    size_t estimated_size = 1024 + (scan_result->n_tokens * 200);
+    char* json = safe_malloc(estimated_size);
+    if (!json) {
+        return safe_strdup("{\"version\":0,\"tokens\":[]}");
+    }
+
+    // Start building JSON
+    int pos = snprintf(json, estimated_size, "{\"version\":%d,\"tokens\":[", scan_result->version);
+
+    for (size_t i = 0; i < scan_result->n_tokens; i++) {
+        PgQuery__ScanToken *token = scan_result->tokens[i];
+
+        // Extract token text from original SQL
+        int token_length = token->end - token->start;
+        if (token_length < 0) token_length = 0; // Safety check
+
+        char* token_text = safe_malloc(token_length + 1);
+        if (!token_text) continue;
+
+        if (token_length > 0) {
+            strncpy(token_text, &original_sql[token->start], token_length);
+        }
+        token_text[token_length] = '\0';
+
+        // Escape token text for JSON (worst case doubles the length)
+        char* escaped_text = safe_malloc(token_length * 2 + 1);
+        if (!escaped_text) {
+            free(token_text);
+            continue;
+        }
+
+        int escaped_pos = 0;
+        for (int j = 0; j < token_length; j++) {
+            char c = token_text[j];
+            if (c == '"' || c == '\\') {
+                escaped_text[escaped_pos++] = '\\';
+                escaped_text[escaped_pos++] = c;
+            } else if (c == '\n') {
+                // Newlines can appear inside multi-line C-style comments;
+                // they must be escaped or the emitted JSON is invalid
+                escaped_text[escaped_pos++] = '\\';
+                escaped_text[escaped_pos++] = 'n';
+            } else if (c == '\r') {
+                escaped_text[escaped_pos++] = '\\';
+                escaped_text[escaped_pos++] = 'r';
+            } else if (c == '\t') {
+                escaped_text[escaped_pos++] = '\\';
+                escaped_text[escaped_pos++] = 't';
+            } else {
+                escaped_text[escaped_pos++] = c;
+            }
+        }
+        escaped_text[escaped_pos] = '\0';
+
+        // Get token type name and keyword kind name
+        const char* token_name = get_token_name(token->token);
+        const char* keyword_name = 
get_keyword_name(token->keyword_kind); + + // Add comma if not first token + if (i > 0) { + pos += snprintf(json + pos, estimated_size - pos, ","); + } + + // Add token object to JSON + pos += snprintf(json + pos, estimated_size - pos, + "{\"start\":%d,\"end\":%d,\"text\":\"%s\",\"tokenType\":%d,\"tokenName\":\"%s\",\"keywordKind\":%d,\"keywordName\":\"%s\"}", + token->start, token->end, escaped_text, token->token, token_name, token->keyword_kind, keyword_name); + + free(token_text); + free(escaped_text); + + // Check if we're running out of space + if (pos >= estimated_size - 200) { + char* new_json = realloc(json, estimated_size * 2); + if (!new_json) break; + json = new_json; + estimated_size *= 2; + } + } + + // Close JSON + snprintf(json + pos, estimated_size - pos, "]}"); + + return json; +} + +EMSCRIPTEN_KEEPALIVE +char* wasm_scan(const char* input) { + if (!validate_input(input)) { + return safe_strdup("Invalid input: query cannot be null or empty"); + } + + PgQueryScanResult result = pg_query_scan(input); + + if (result.error) { + char* error_msg = safe_strdup(result.error->message); + pg_query_free_scan_result(result); + return error_msg ? error_msg : safe_strdup("Memory allocation failed"); + } + + // Unpack protobuf data + PgQuery__ScanResult *scan_result = pg_query__scan_result__unpack( + NULL, result.pbuf.len, (void *) result.pbuf.data); + + if (!scan_result) { + pg_query_free_scan_result(result); + return safe_strdup("Failed to unpack scan result"); + } + + // Convert to JSON + char* json_result = build_scan_json(scan_result, input); + + // Clean up + pg_query__scan_result__free_unpacked(scan_result, NULL); + pg_query_free_scan_result(result); + + return json_result ? json_result : safe_strdup("{\"version\":0,\"tokens\":[]}"); +} + EMSCRIPTEN_KEEPALIVE void wasm_free_string(char* str) { free(str); diff --git a/test/scan.test.js b/test/scan.test.js new file mode 100644 index 00000000..f693290f --- /dev/null +++ b/test/scan.test.js @@ -0,0 +1,228 @@ +const query = require("../"); +const { expect } = require("chai"); + +describe("Query Scanning", () => { + before(async () => { + await query.parse("SELECT 1"); + }); + + describe("Sync Scanning", () => { + it("should return a scan result with version and tokens", () => { + const result = query.scanSync("SELECT 1"); + + expect(result).to.be.an("object"); + expect(result).to.have.property("version"); + expect(result).to.have.property("tokens"); + expect(result.version).to.be.a("number"); + expect(result.tokens).to.be.an("array"); + }); + + it("should scan a simple SELECT query correctly", () => { + const result = query.scanSync("SELECT 1"); + + expect(result.tokens).to.have.lengthOf(2); + + // First token should be SELECT + const selectToken = result.tokens[0]; + expect(selectToken.text).to.eq("SELECT"); + expect(selectToken.start).to.eq(0); + expect(selectToken.end).to.eq(6); + expect(selectToken.tokenName).to.eq("UNKNOWN"); // SELECT is mapped as UNKNOWN in our simplified mapping + expect(selectToken.keywordName).to.eq("RESERVED_KEYWORD"); + + // Second token should be 1 + const numberToken = result.tokens[1]; + expect(numberToken.text).to.eq("1"); + expect(numberToken.start).to.eq(7); + expect(numberToken.end).to.eq(8); + expect(numberToken.tokenName).to.eq("ICONST"); + expect(numberToken.keywordName).to.eq("NO_KEYWORD"); + }); + + it("should scan tokens with correct positions", () => { + const sql = "SELECT * FROM users"; + const result = query.scanSync(sql); + + expect(result.tokens).to.have.lengthOf(4); + + // Verify each 
token position matches the original SQL + result.tokens.forEach(token => { + const actualText = sql.substring(token.start, token.end); + expect(token.text).to.eq(actualText); + }); + }); + + it("should identify different token types", () => { + const result = query.scanSync("SELECT 'string', 123, 3.14, $1 FROM users"); + + const tokenTypes = result.tokens.map(t => t.tokenName); + expect(tokenTypes).to.include("SCONST"); // String constant + expect(tokenTypes).to.include("ICONST"); // Integer constant + expect(tokenTypes).to.include("FCONST"); // Float constant + expect(tokenTypes).to.include("PARAM"); // Parameter marker + // Note: keywords like FROM may be tokenized as UNKNOWN in our simplified mapping + expect(tokenTypes).to.include("UNKNOWN"); // Keywords and identifiers + }); + + it("should identify operators and punctuation", () => { + const result = query.scanSync("SELECT * FROM users WHERE id = 1"); + + const operators = result.tokens.filter(t => + t.tokenName.startsWith("ASCII_") || t.text === "=" + ); + + expect(operators).to.have.length.greaterThan(0); + expect(operators.some(t => t.text === "*")).to.be.true; + expect(operators.some(t => t.text === "=")).to.be.true; + }); + + it("should classify keyword types correctly", () => { + const result = query.scanSync("SELECT COUNT(*) FROM users WHERE active = true"); + + const reservedKeywords = result.tokens.filter(t => + t.keywordName === "RESERVED_KEYWORD" + ); + const unreservedKeywords = result.tokens.filter(t => + t.keywordName === "UNRESERVED_KEYWORD" + ); + + expect(reservedKeywords.length).to.be.greaterThan(0); + // SELECT, FROM, WHERE should be reserved keywords + expect(reservedKeywords.some(t => t.text === "SELECT")).to.be.true; + expect(reservedKeywords.some(t => t.text === "FROM")).to.be.true; + expect(reservedKeywords.some(t => t.text === "WHERE")).to.be.true; + }); + + it("should handle complex queries with parameters", () => { + const result = query.scanSync("SELECT * FROM users WHERE id = $1 AND name = $2"); + + const params = result.tokens.filter(t => t.tokenName === "PARAM"); + expect(params).to.have.lengthOf(2); + expect(params[0].text).to.eq("$1"); + expect(params[1].text).to.eq("$2"); + }); + + it("should handle string escaping in JSON output", () => { + const result = query.scanSync("SELECT 'text with \"quotes\" and \\backslash'"); + + const stringToken = result.tokens.find(t => t.tokenName === "SCONST"); + expect(stringToken).to.exist; + expect(stringToken.text).to.include('"'); + expect(stringToken.text).to.include('\\'); + }); + + it("should scan INSERT statements", () => { + const result = query.scanSync("INSERT INTO table VALUES (1, 'text', 3.14)"); + + expect(result.tokens.some(t => t.text === "INSERT")).to.be.true; + expect(result.tokens.some(t => t.text === "INTO")).to.be.true; + expect(result.tokens.some(t => t.text === "VALUES")).to.be.true; + expect(result.tokens.some(t => t.tokenName === "ICONST")).to.be.true; + expect(result.tokens.some(t => t.tokenName === "SCONST")).to.be.true; + expect(result.tokens.some(t => t.tokenName === "FCONST")).to.be.true; + }); + + it("should scan UPDATE statements", () => { + const result = query.scanSync("UPDATE users SET name = 'John' WHERE id = 1"); + + expect(result.tokens.some(t => t.text === "UPDATE")).to.be.true; + expect(result.tokens.some(t => t.text === "SET")).to.be.true; + expect(result.tokens.some(t => t.text === "=")).to.be.true; + }); + + it("should scan DELETE statements", () => { + const result = query.scanSync("DELETE FROM users WHERE active = false"); 
+ + expect(result.tokens.some(t => t.text === "DELETE")).to.be.true; + expect(result.tokens.some(t => t.text === "FROM")).to.be.true; + expect(result.tokens.some(t => t.text === "WHERE")).to.be.true; + }); + + it("should handle empty or whitespace-only input", () => { + const result = query.scanSync(" "); + expect(result.tokens).to.have.lengthOf(0); + }); + + it("should handle unusual input gracefully", () => { + // The scanner is more permissive than the parser and may tokenize unusual input + const result = query.scanSync("$$$INVALID$$$"); + expect(result).to.be.an("object"); + expect(result.tokens).to.be.an("array"); + // Scanner may still produce tokens even for unusual input + }); + + it("should preserve original token order", () => { + const sql = "SELECT id, name FROM users ORDER BY name"; + const result = query.scanSync(sql); + + // Tokens should be in order of appearance + for (let i = 1; i < result.tokens.length; i++) { + expect(result.tokens[i].start).to.be.at.least(result.tokens[i-1].end); + } + }); + }); + + describe("Async Scanning", () => { + it("should return a promise resolving to same result as sync", async () => { + const testQuery = "SELECT * FROM users WHERE id = $1"; + const resultPromise = query.scan(testQuery); + const result = await resultPromise; + + expect(resultPromise).to.be.instanceof(Promise); + expect(result).to.deep.eq(query.scanSync(testQuery)); + }); + + it("should handle complex queries asynchronously", async () => { + const testQuery = "SELECT COUNT(*) as total FROM orders WHERE status = 'completed' AND created_at > '2023-01-01'"; + const result = await query.scan(testQuery); + + expect(result).to.be.an("object"); + expect(result.tokens).to.be.an("array"); + expect(result.tokens.length).to.be.greaterThan(10); + }); + + it("should handle unusual input asynchronously", async () => { + // Scanner is more permissive than parser + const result = await query.scan("$$$INVALID$$$"); + expect(result).to.be.an("object"); + expect(result.tokens).to.be.an("array"); + }); + }); + + describe("Edge Cases", () => { + it("should handle queries with comments", () => { + const result = query.scanSync("SELECT 1 -- this is a comment"); + + // Should have at least SELECT and 1 tokens + expect(result.tokens.length).to.be.at.least(2); + expect(result.tokens.some(t => t.text === "SELECT")).to.be.true; + expect(result.tokens.some(t => t.text === "1")).to.be.true; + }); + + it("should handle very long identifiers", () => { + const longIdentifier = "a".repeat(100); + const result = query.scanSync(`SELECT ${longIdentifier} FROM table`); + + const identToken = result.tokens.find(t => t.text === longIdentifier); + expect(identToken).to.exist; + expect(identToken.tokenName).to.eq("IDENT"); + }); + + it("should handle special PostgreSQL operators", () => { + const result = query.scanSync("SELECT id::text FROM users"); + + expect(result.tokens.some(t => t.text === "::")).to.be.true; + const typecastToken = result.tokens.find(t => t.text === "::"); + expect(typecastToken?.tokenName).to.eq("TYPECAST"); + }); + + it("should provide consistent version information", () => { + const result1 = query.scanSync("SELECT 1"); + const result2 = query.scanSync("INSERT INTO table VALUES (1)"); + + expect(result1.version).to.eq(result2.version); + expect(result1.version).to.be.a("number"); + expect(result1.version).to.be.greaterThan(0); + }); + }); +}); \ No newline at end of file diff --git a/wasm/index.cjs b/wasm/index.cjs index 626939f8..5e435155 100644 --- a/wasm/index.cjs +++ b/wasm/index.cjs @@ 
-17,13 +17,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.normalize = exports.fingerprint = exports.parsePlPgSQL = exports.deparse = exports.parse = void 0;
+exports.scan = exports.normalize = exports.fingerprint = exports.parsePlPgSQL = exports.deparse = exports.parse = void 0;
 exports.loadModule = loadModule;
 exports.parseSync = parseSync;
 exports.deparseSync = deparseSync;
 exports.parsePlPgSQLSync = parsePlPgSQLSync;
 exports.fingerprintSync = fingerprintSync;
 exports.normalizeSync = normalizeSync;
+exports.scanSync = scanSync;
 __exportStar(require("@pgsql/types"), exports);
 // @ts-ignore
 const libpg_query_js_1 = __importDefault(require("./libpg-query.js"));
@@ -267,3 +268,42 @@ function normalizeSync(query) {
         }
     }
 }
+exports.scan = awaitInit(async (query) => {
+    const queryPtr = stringToPtr(query);
+    let resultPtr = 0;
+    try {
+        resultPtr = wasmModule._wasm_scan(queryPtr);
+        const resultStr = ptrToString(resultPtr);
+        if (resultStr.startsWith('syntax error') || resultStr.startsWith('deparse error') || resultStr.includes('ERROR')) {
+            throw new Error(resultStr);
+        }
+        return JSON.parse(resultStr);
+    }
+    finally {
+        wasmModule._free(queryPtr);
+        if (resultPtr) {
+            wasmModule._wasm_free_string(resultPtr);
+        }
+    }
+});
+function scanSync(query) {
+    if (!wasmModule) {
+        throw new Error('WASM module not initialized. Call loadModule() first.');
+    }
+    const queryPtr = stringToPtr(query);
+    let resultPtr = 0;
+    try {
+        resultPtr = wasmModule._wasm_scan(queryPtr);
+        const resultStr = ptrToString(resultPtr);
+        if (resultStr.startsWith('syntax error') || resultStr.startsWith('deparse error') || resultStr.includes('ERROR')) {
+            throw new Error(resultStr);
+        }
+        return JSON.parse(resultStr);
+    }
+    finally {
+        wasmModule._free(queryPtr);
+        if (resultPtr) {
+            wasmModule._wasm_free_string(resultPtr);
+        }
+    }
+}
diff --git a/wasm/index.d.ts b/wasm/index.d.ts
index 83b385fa..22b11125 100644
--- a/wasm/index.d.ts
+++ b/wasm/index.d.ts
@@ -1,5 +1,18 @@
 import { ParseResult } from "@pgsql/types";
 export * from "@pgsql/types";
+export interface ScanToken {
+    start: number;
+    end: number;
+    text: string;
+    tokenType: number;
+    tokenName: string;
+    keywordKind: number;
+    keywordName: string;
+}
+export interface ScanResult {
+    version: number;
+    tokens: ScanToken[];
+}
 export declare function loadModule(): Promise<void>;
 export declare const parse: (query: string) => Promise<ParseResult>;
 export declare const deparse: (parseTree: ParseResult) => Promise<string>;
@@ -11,3 +24,5 @@ export declare function deparseSync(parseTree: ParseResult): string;
 export declare function parsePlPgSQLSync(query: string): ParseResult;
 export declare function fingerprintSync(query: string): string;
 export declare function normalizeSync(query: string): string;
+export declare const scan: (query: string) => Promise<ScanResult>;
+export declare function scanSync(query: string): ScanResult;
diff --git a/wasm/index.js b/wasm/index.js
index 754ea962..065c4f29 100644
--- a/wasm/index.js
+++ b/wasm/index.js
@@ -241,3 +241,42 @@ export function normalizeSync(query) {
         }
     }
 }
+export const scan = awaitInit(async (query) => {
+    const queryPtr = stringToPtr(query);
+    let resultPtr = 0;
+    try {
+        resultPtr = wasmModule._wasm_scan(queryPtr);
+        const resultStr = ptrToString(resultPtr);
+        if (resultStr.startsWith('syntax error') || resultStr.startsWith('deparse error') || resultStr.includes('ERROR')) {
+            throw new 
Error(resultStr); + } + return JSON.parse(resultStr); + } + finally { + wasmModule._free(queryPtr); + if (resultPtr) { + wasmModule._wasm_free_string(resultPtr); + } + } +}); +export function scanSync(query) { + if (!wasmModule) { + throw new Error('WASM module not initialized. Call loadModule() first.'); + } + const queryPtr = stringToPtr(query); + let resultPtr = 0; + try { + resultPtr = wasmModule._wasm_scan(queryPtr); + const resultStr = ptrToString(resultPtr); + if (resultStr.startsWith('syntax error') || resultStr.startsWith('deparse error') || resultStr.includes('ERROR')) { + throw new Error(resultStr); + } + return JSON.parse(resultStr); + } + finally { + wasmModule._free(queryPtr); + if (resultPtr) { + wasmModule._wasm_free_string(resultPtr); + } + } +}