Commit 5af266b

Merge pull request #11 from exasol-labs/issue_10_infer_schema_parquet
Infer schema on import of Parquet files
2 parents: f7e56e9 + 045b90b

17 files changed: +1523 −213 lines

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+# Changelog
+
+## 0.5.0
+
+- Schema inference for Parquet imports
+
+## 0.4.0
+
+- Parallel CSV and Parquet file imports
+
+## <=0.3.2
+
+- ADBC driver implementation
+- Import/export capability via HTTP tunneling
+- Arrow type mapping for Exasol types

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "exarrow-rs"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2021"
 license = "MIT"
 authors = ["Exasol Labs"]

benches/rust/benchmark.rs

Lines changed: 4 additions & 4 deletions
@@ -5,7 +5,7 @@
 
 use std::env;
 use std::fs;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::time::Instant;
 
 use clap::{Parser, ValueEnum};
@@ -123,7 +123,7 @@ async fn truncate_table(conn: &mut Connection) -> Result<(), Box<dyn std::error:
 
 async fn import_csv(
     conn: &mut Connection,
-    file_path: &PathBuf,
+    file_path: &Path,
 ) -> Result<(i64, f64), Box<dyn std::error::Error>> {
     truncate_table(conn).await?;
 
@@ -140,7 +140,7 @@ async fn import_csv(
 
 async fn import_parquet(
     conn: &mut Connection,
-    file_path: &PathBuf,
+    file_path: &Path,
 ) -> Result<(i64, f64), Box<dyn std::error::Error>> {
     truncate_table(conn).await?;
 
@@ -217,7 +217,7 @@ async fn select_to_polars(
 async fn run_import_benchmark(
     conn: &mut Connection,
     operation: &Operation,
-    file_path: &PathBuf,
+    file_path: &Path,
     iterations: usize,
     warmup: usize,
     file_size_mb: f64,

benches/rust/generate_data.rs

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ fn generate_batch(rng: &mut StdRng, start_id: i64, count: usize) -> RecordBatch
     let ages: Vec<i32> = (0..count).map(|_| rng.gen_range(18..80)).collect();
 
     let salaries: Vec<i128> = (0..count)
-        .map(|_| rng.gen_range(30_000_00i128..500_000_00i128)) // cents
+        .map(|_| rng.gen_range(3_000_000_i128..50_000_000_i128)) // cents
         .collect();
 
     let timestamps: Vec<i64> = (0..count)

specs/import-export/spec.md

Lines changed: 103 additions & 1 deletion
@@ -374,4 +374,106 @@ operations immediately upon first failure to prevent partial data imports.

The existing scenario is unchanged apart from a newly added trailing newline:

- **WHEN** any Parquet file fails to convert to CSV
- **THEN** system SHALL abort all other conversion tasks immediately
- **AND** system SHALL return error indicating which file failed conversion

The remainder of the hunk appends the following new requirements:
### Requirement: Arrow Schema Inference from Parquet Files

The system SHALL support inferring Arrow schemas from Parquet file metadata without reading the full data.

#### Scenario: Infer schema from single Parquet file

- **WHEN** user requests schema inference from a single Parquet file
- **THEN** system SHALL read only the Parquet metadata (not data)
- **AND** system SHALL return the Arrow schema with field names and types
- **AND** system SHALL include nullability information for each field
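As a minimal sketch (not the exarrow-rs API), metadata-only inference with the `parquet` crate can look like this; only the file footer is parsed, and each resulting Arrow field carries its name, type, and nullability flag:

```rust
use std::fs::File;
use std::path::Path;

use arrow_schema::Schema;
use parquet::arrow::parquet_to_arrow_schema;
use parquet::file::reader::{FileReader, SerializedFileReader};

/// Hypothetical helper: derive an Arrow schema from Parquet metadata alone.
fn infer_schema(path: &Path) -> Result<Schema, Box<dyn std::error::Error>> {
    let file = File::open(path)?;
    // Constructing the reader parses only the footer, not the row groups.
    let reader = SerializedFileReader::new(file)?;
    let file_meta = reader.metadata().file_metadata();
    // Convert the Parquet schema descriptor (honoring any embedded Arrow
    // schema in the key-value metadata) into an Arrow schema.
    let schema = parquet_to_arrow_schema(
        file_meta.schema_descr(),
        file_meta.key_value_metadata(),
    )?;
    Ok(schema)
}
```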
#### Scenario: Infer union schema from multiple Parquet files

- **WHEN** user requests schema inference from multiple Parquet files
- **THEN** system SHALL read metadata from all files
- **AND** system SHALL compute a union schema that accommodates all files
- **AND** system SHALL widen types when fields have different types across files
- **AND** type widening SHALL follow these rules:
  - Identical types remain unchanged
  - DECIMAL types widen to max(precision), max(scale)
  - VARCHAR types widen to max(size)
  - DECIMAL + DOUBLE widens to DOUBLE
  - Incompatible types fall back to VARCHAR(2000000)
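A sketch of these widening rules over a hypothetical `ExasolType` enum (the crate's real representation may differ); note the VARCHAR(2000000) fallback for incompatible pairs:

```rust
/// Hypothetical Exasol-side type, sized per the rules above.
#[derive(Clone, Copy, PartialEq, Debug)]
enum ExasolType {
    Boolean,
    Varchar(u32),     // length in characters
    Decimal(u8, u8),  // (precision, scale)
    Double,
    Date,
    Timestamp,        // "TIMESTAMP"
    TimestampLocalTz, // "TIMESTAMP WITH LOCAL TIME ZONE"
}

/// Fallback size for incompatible type pairs.
const VARCHAR_MAX: u32 = 2_000_000;

fn widen(a: ExasolType, b: ExasolType) -> ExasolType {
    use ExasolType::*;
    match (a, b) {
        // Identical types remain unchanged.
        _ if a == b => a,
        // DECIMAL widens to max(precision), max(scale).
        (Decimal(p1, s1), Decimal(p2, s2)) => Decimal(p1.max(p2), s1.max(s2)),
        // VARCHAR widens to max(size).
        (Varchar(n1), Varchar(n2)) => Varchar(n1.max(n2)),
        // DECIMAL + DOUBLE widens to DOUBLE.
        (Decimal(..), Double) | (Double, Decimal(..)) => Double,
        // Incompatible types fall back to the widest VARCHAR.
        _ => Varchar(VARCHAR_MAX),
    }
}
```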
#### Scenario: Schema inference error handling

- **WHEN** schema inference encounters an error
- **THEN** system SHALL return SchemaInferenceError with file path context
- **AND** system SHALL indicate whether the error was in reading metadata or type conversion
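One plausible shape for such an error (variant and field names are illustrative): it carries the offending path in both variants and distinguishes the metadata-read stage from the type-conversion stage:

```rust
use std::path::PathBuf;

/// Hypothetical error type matching the scenario above.
#[derive(Debug)]
enum SchemaInferenceError {
    /// Failure while reading the Parquet footer/metadata.
    MetadataRead { path: PathBuf, detail: String },
    /// Metadata was readable, but a column type could not be converted.
    TypeConversion { path: PathBuf, column: String, detail: String },
}

impl std::fmt::Display for SchemaInferenceError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::MetadataRead { path, detail } => {
                write!(f, "reading metadata from {}: {detail}", path.display())
            }
            Self::TypeConversion { path, column, detail } => {
                write!(f, "converting column {column} in {}: {detail}", path.display())
            }
        }
    }
}

impl std::error::Error for SchemaInferenceError {}
```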
### Requirement: Arrow to Exasol DDL Generation

The system SHALL support generating Exasol CREATE TABLE DDL statements from inferred schemas.

#### Scenario: Column name handling with Quoted mode

- **WHEN** generating DDL with Quoted column name mode
- **THEN** column names SHALL be wrapped in double quotes
- **AND** internal double quotes in names SHALL be escaped by doubling
- **AND** original column names SHALL be preserved exactly
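Quoted mode reduces to a one-liner; a sketch:

```rust
/// Quote an identifier exactly as given, doubling any embedded quotes.
fn quote_identifier(name: &str) -> String {
    format!("\"{}\"", name.replace('"', "\"\""))
}

// quote_identifier(r#"my "odd" col"#) yields "my ""odd"" col" (outer quotes included).
```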
#### Scenario: Column name handling with Sanitize mode

- **WHEN** generating DDL with Sanitize column name mode
- **THEN** column names SHALL be converted to uppercase
- **AND** invalid identifier characters SHALL be replaced with underscore
- **AND** names starting with digits SHALL be prefixed with underscore
- **AND** Exasol reserved words SHALL be quoted
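A sketch of Sanitize mode under the rules above; the reserved-word list here is a tiny illustrative subset, not Exasol's full list:

```rust
/// Hypothetical sanitizer: uppercase, underscore invalid characters,
/// prefix leading digits, and quote reserved words.
fn sanitize_identifier(name: &str) -> String {
    // Illustrative subset only; Exasol's reserved-word list is much longer.
    const RESERVED: &[&str] = &["SELECT", "TABLE", "ORDER", "GROUP", "USER"];

    let mut out: String = name
        .chars()
        .map(|c| {
            let c = c.to_ascii_uppercase();
            // Keep valid identifier characters, replace everything else.
            if c.is_ascii_alphanumeric() || c == '_' { c } else { '_' }
        })
        .collect();
    if out.chars().next().is_some_and(|c| c.is_ascii_digit()) {
        out.insert(0, '_'); // names starting with digits get a prefix
    }
    if RESERVED.contains(&out.as_str()) {
        out = format!("\"{out}\""); // reserved words are quoted
    }
    out
}
```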
#### Scenario: DDL type generation

- **WHEN** generating DDL column types
- **THEN** ExasolType SHALL be converted to valid DDL syntax
- **AND** BOOLEAN SHALL generate "BOOLEAN"
- **AND** VARCHAR(n) SHALL generate "VARCHAR(n)"
- **AND** DECIMAL(p,s) SHALL generate "DECIMAL(p,s)"
- **AND** DOUBLE SHALL generate "DOUBLE"
- **AND** DATE SHALL generate "DATE"
- **AND** TIMESTAMP SHALL generate "TIMESTAMP" or "TIMESTAMP WITH LOCAL TIME ZONE"
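Rendering follows directly from the list above; this sketch reuses the hypothetical `ExasolType` from the widening example:

```rust
fn type_to_ddl(ty: ExasolType) -> String {
    use ExasolType::*;
    match ty {
        Boolean => "BOOLEAN".to_string(),
        Varchar(n) => format!("VARCHAR({n})"),
        Decimal(p, s) => format!("DECIMAL({p},{s})"),
        Double => "DOUBLE".to_string(),
        Date => "DATE".to_string(),
        Timestamp => "TIMESTAMP".to_string(),
        TimestampLocalTz => "TIMESTAMP WITH LOCAL TIME ZONE".to_string(),
    }
}
```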
#### Scenario: Complete DDL statement generation

- **WHEN** generating CREATE TABLE DDL
- **THEN** output SHALL include "CREATE TABLE schema.table (" prefix
- **AND** output SHALL include column definitions separated by commas
- **AND** output SHALL include closing ");"
- **AND** schema prefix SHALL be optional (omit if not provided)
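Putting the pieces together (hypothetical helper; column names are assumed to be already quoted or sanitized):

```rust
/// Assemble a CREATE TABLE statement; the schema prefix is optional.
fn build_create_table(
    schema: Option<&str>,
    table: &str,
    columns: &[(String, ExasolType)],
) -> String {
    let target = match schema {
        Some(s) => format!("{s}.{table}"),
        None => table.to_string(),
    };
    let cols: Vec<String> = columns
        .iter()
        .map(|(name, ty)| format!("{name} {}", type_to_ddl(*ty)))
        .collect();
    format!("CREATE TABLE {target} ({});", cols.join(", "))
}
```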
### Requirement: Auto Table Creation for Parquet Import

The system SHALL support automatically creating target tables before Parquet import when enabled.

#### Scenario: Auto-create table option enabled

- **WHEN** importing Parquet with create_table_if_not_exists=true
- **AND** target table does not exist
- **THEN** system SHALL infer schema from Parquet file(s)
- **AND** system SHALL generate CREATE TABLE DDL
- **AND** system SHALL execute DDL before IMPORT statement
- **AND** import SHALL proceed normally after table creation
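The decision logic behind this and the two scenarios that follow fits in a few branches. In this sketch the existence check and DDL execution are passed in as closures so it stays independent of the real connection type; all names are illustrative:

```rust
/// Returns Ok(true) if a table was created, Ok(false) if creation was
/// skipped (option disabled, or table already present).
fn ensure_table<E, R>(
    create_table_if_not_exists: bool,
    table_exists: E,
    ddl: &str,
    mut run_ddl: R,
) -> Result<bool, Box<dyn std::error::Error>>
where
    E: FnOnce() -> Result<bool, Box<dyn std::error::Error>>,
    R: FnMut(&str) -> Result<(), Box<dyn std::error::Error>>,
{
    // Default path: no inference, no DDL; the table is assumed to exist.
    if !create_table_if_not_exists {
        return Ok(false);
    }
    // Table already present: skip DDL, import against the existing schema.
    if table_exists()? {
        return Ok(false);
    }
    // Create the table, then let the IMPORT statement proceed normally.
    run_ddl(ddl)?;
    Ok(true)
}
```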
#### Scenario: Auto-create with existing table

- **WHEN** importing Parquet with create_table_if_not_exists=true
- **AND** target table already exists
- **THEN** system SHALL skip DDL execution
- **AND** import SHALL proceed normally using existing table schema

#### Scenario: Auto-create option disabled (default)

- **WHEN** importing Parquet with create_table_if_not_exists=false (default)
- **THEN** system SHALL NOT attempt schema inference
- **AND** system SHALL NOT execute any CREATE TABLE DDL
- **AND** import SHALL assume table already exists

#### Scenario: Multi-file auto-create

- **WHEN** importing multiple Parquet files with create_table_if_not_exists=true
- **THEN** system SHALL compute union schema from all files
- **AND** system SHALL create table with widened types
- **AND** all files SHALL be importable into the created table
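For the multi-file case, the union schema is a fold of `widen` over per-file schemas; a sketch using the hypothetical types from the earlier examples (the handling of fields present in only some files is my assumption, not stated by the spec):

```rust
/// Merge per-file (name, type) schemas into one union schema, widening
/// types whenever the same field appears with a different type.
fn union_schema(per_file: &[Vec<(String, ExasolType)>]) -> Vec<(String, ExasolType)> {
    let mut merged: Vec<(String, ExasolType)> = Vec::new();
    for schema in per_file {
        for (name, ty) in schema {
            match merged.iter_mut().find(|(n, _)| n == name) {
                Some((_, existing)) => *existing = widen(*existing, *ty),
                None => merged.push((name.clone(), *ty)), // new field: take as-is
            }
        }
    }
    merged
}
```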

specs/type-mapping/spec.md

Lines changed: 38 additions & 18 deletions
@@ -155,30 +155,50 @@ The system SHALL preserve type metadata in Arrow schemas.
 - **THEN** it SHALL provide access to original Exasol type names
 - **AND** it SHALL expose type mapping used for each column
 
-### Requirement: Exasol Type Limit Documentation
+### Requirement: Exasol Data Type Boundaries
 
-The system SHALL document Exasol's actual data type limits as defined in official documentation.
+The system SHALL enforce Exasol's documented data type limits when generating DDL or validating type mappings.
 
-#### Scenario: DECIMAL limits
+#### Scenario: VARCHAR type boundaries
 
-- **WHEN** documenting DECIMAL type limits
-- **THEN** documentation SHALL state precision range is 1-36 digits
-- **AND** documentation SHALL note this differs from Arrow Decimal128's 38-digit limit
+- **WHEN** mapping to Exasol VARCHAR
+- **THEN** the maximum length SHALL be 2,000,000 characters
+- **AND** values exceeding this limit SHALL be truncated or rejected based on configuration
 
-#### Scenario: TIMESTAMP limits
+#### Scenario: CHAR type boundaries
 
-- **WHEN** documenting TIMESTAMP type limits
-- **THEN** documentation SHALL state fractional seconds precision range is 0-9
-- **AND** documentation SHALL explain the mapping to Arrow TimeUnit
+- **WHEN** mapping to Exasol CHAR
+- **THEN** the maximum length SHALL be 2,000 characters
+- **AND** CHAR is fixed-width with space padding
 
-#### Scenario: String type limits
+#### Scenario: DECIMAL type boundaries
 
-- **WHEN** documenting string type limits
-- **THEN** documentation SHALL note VARCHAR maximum practical size
-- **AND** documentation SHALL note CHAR fixed-size semantics
+- **WHEN** mapping to Exasol DECIMAL(p, s)
+- **THEN** precision SHALL be in range 1-36
+- **AND** scale SHALL be in range 0-36
+- **AND** scale SHALL NOT exceed precision
 
-#### Scenario: INTERVAL limits
+#### Scenario: TIMESTAMP type boundaries
 
-- **WHEN** documenting INTERVAL type limits
-- **THEN** documentation SHALL state INTERVAL DAY TO SECOND precision range is 0-9 for fractional seconds
-- **AND** documentation SHALL note fixed 8-byte storage for both interval types
+- **WHEN** mapping to Exasol TIMESTAMP
+- **THEN** fractional seconds precision SHALL be in range 0-9
+- **AND** TIMESTAMP WITH LOCAL TIME ZONE SHALL be used for timezone-aware timestamps
+
+#### Scenario: Integer type mappings for DDL generation
+
+- **WHEN** mapping Arrow integer types to Exasol DDL
+- **THEN** Int8, Int16, Int32 SHALL map to DECIMAL(18,0)
+- **AND** Int64 SHALL map to DECIMAL(36,0)
+- **AND** UInt8, UInt16, UInt32 SHALL map to DECIMAL(18,0)
+- **AND** UInt64 SHALL map to DECIMAL(36,0)
+
+#### Scenario: Floating point type mappings for DDL generation
+
+- **WHEN** mapping Arrow floating point types to Exasol DDL
+- **THEN** Float32 and Float64 SHALL map to DOUBLE
+
+#### Scenario: INTERVAL type boundaries
+
+- **WHEN** mapping to Exasol INTERVAL types
+- **THEN** INTERVAL DAY TO SECOND fractional precision SHALL be in range 0-9
+- **AND** both INTERVAL types use fixed 8-byte storage
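A sketch of the integer and floating-point rows added above, mapping `arrow_schema::DataType` onto the hypothetical `ExasolType` from the import-export examples (the catch-all arm is my assumption, mirroring the widening fallback):

```rust
use arrow_schema::DataType;

fn arrow_to_exasol(dt: &DataType) -> ExasolType {
    match dt {
        // Integers narrower than 64 bits fit in DECIMAL(18,0).
        DataType::Int8 | DataType::Int16 | DataType::Int32 => ExasolType::Decimal(18, 0),
        DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => ExasolType::Decimal(18, 0),
        // 64-bit integers get the full DECIMAL(36,0) range.
        DataType::Int64 | DataType::UInt64 => ExasolType::Decimal(36, 0),
        // Both float widths map to Exasol DOUBLE.
        DataType::Float32 | DataType::Float64 => ExasolType::Double,
        DataType::Boolean => ExasolType::Boolean,
        // Remaining Arrow types are elided in this sketch; fall back to
        // the widest VARCHAR.
        _ => ExasolType::Varchar(2_000_000),
    }
}
```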
