Skip to content

Commit 8b20315

Browse files
committed
Unify Arrow-based type inference for CSV and Parquet
1 parent 70865ac commit 8b20315

File tree

10 files changed

+857
-900
lines changed

10 files changed

+857
-900
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ authors = [
2020
]
2121

2222
# Core runtime dependencies needed for the tool to function.
23+
# Note: pandas is not a direct dependency — it is available transitively via wfdb.
2324
dependencies = [
2425
"typer >= 0.9.0",
25-
"mlcroissant >= 1.0.0",
26-
"pandas >= 1.3.0",
26+
"mlcroissant >= 1.0.20",
2727
"rich >= 13.0.0",
2828
"wfdb >= 4.0.0",
2929
"pyarrow >= 15.0.0",
Lines changed: 67 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
"""CSV file handler for tabular data processing."""
22

33
from pathlib import Path
4-
import pandas as pd
4+
5+
import pyarrow as pa
6+
import pyarrow.csv as pa_csv
57

68
from croissant_maker.handlers.base_handler import FileTypeHandler
7-
from croissant_maker.handlers.utils import analyze_data_sample, compute_file_hash
9+
from croissant_maker.handlers.utils import (
10+
compute_file_hash,
11+
infer_column_types_from_arrow_schema,
12+
)
813

914

1015
class CSVHandler(FileTypeHandler):
@@ -16,11 +21,27 @@ class CSVHandler(FileTypeHandler):
1621
- Gzip-compressed CSV files (.csv.gz)
1722
- Bzip2-compressed CSV files (.csv.bz2)
1823
- XZ-compressed CSV files (.csv.xz)
19-
- Automatic column type detection using pandas
24+
- Automatic column type detection using PyArrow
2025
- SHA256 hash computation for file integrity
21-
- Sample data extraction for preview
26+
27+
Uses PyArrow's CSV reader which:
28+
- Auto-detects compressed formats from filename extension
29+
- Infers precise types (timestamp[s], date32, int64, float64, etc.)
30+
- Reads multi-threaded by default for performance
2231
"""
2332

33+
# Common timestamp formats for medical/clinical data beyond ISO-8601.
34+
# PyArrow uses ISO8601 by default; these cover additional patterns found
35+
# in datasets like MIMIC, eICU, and OMOP.
36+
_TIMESTAMP_PARSERS = [
37+
pa_csv.ISO8601,
38+
"%Y-%m-%d %H:%M:%S",
39+
"%m/%d/%Y",
40+
"%d/%m/%Y",
41+
"%m/%d/%Y %H:%M:%S",
42+
"%Y-%m-%dT%H:%M:%S",
43+
]
44+
2445
def can_handle(self, file_path: Path) -> bool:
2546
"""
2647
Check if the file is a CSV or compressed CSV file.
@@ -43,19 +64,17 @@ def extract_metadata(self, file_path: Path) -> dict:
4364
"""
4465
Extract comprehensive metadata from a CSV file.
4566
46-
Reads a sample of the CSV file to infer column types, extracts
47-
file statistics, computes integrity hashes, and prepares all
48-
metadata needed for Croissant generation.
67+
Uses PyArrow to read the CSV with automatic type inference,
68+
including timestamp detection and precise numeric types.
4969
5070
Args:
5171
file_path: Path to the CSV file
5272
5373
Returns:
5474
Dictionary containing:
5575
- Basic file info (path, name, size, hash)
56-
- Format information (encoding, compression)
76+
- Format information (encoding)
5777
- Data structure (columns, types, row count)
58-
- Sample data for preview
5978
6079
Raises:
6180
ValueError: If the CSV file cannot be read or processed
@@ -64,45 +83,46 @@ def extract_metadata(self, file_path: Path) -> dict:
6483
if not file_path.exists():
6584
raise FileNotFoundError(f"CSV file not found: {file_path}")
6685

86+
# Parse CSV — only this call needs error translation
6787
try:
68-
# Read a sample for type inference (1000 rows for good accuracy)
69-
df = pd.read_csv(file_path, nrows=1000)
70-
71-
if df.empty:
72-
raise ValueError(f"CSV file is empty: {file_path}")
73-
74-
# Extract file properties
75-
file_size = file_path.stat().st_size
76-
sha256_hash = compute_file_hash(file_path)
77-
78-
# Analyze the data structure and types
79-
data_analysis = analyze_data_sample(df)
80-
81-
# Determine encoding format based on file extension
82-
name_lower = file_path.name.lower()
83-
if name_lower.endswith(".csv.gz"):
84-
encoding_format = "application/gzip"
85-
elif name_lower.endswith(".csv.bz2"):
86-
encoding_format = "application/x-bzip2"
87-
elif name_lower.endswith(".csv.xz"):
88-
encoding_format = "application/x-xz"
89-
else:
90-
encoding_format = "text/csv"
91-
92-
return {
93-
"file_path": str(file_path),
94-
"file_name": file_path.name,
95-
"file_size": file_size,
96-
"sha256": sha256_hash,
97-
"encoding_format": encoding_format,
98-
**data_analysis, # Includes column_types, num_rows, columns, sample_data
99-
}
100-
101-
except pd.errors.EmptyDataError:
102-
raise ValueError(f"CSV file contains no data: {file_path}")
103-
except pd.errors.ParserError as e:
88+
convert_options = pa_csv.ConvertOptions(
89+
timestamp_parsers=self._TIMESTAMP_PARSERS,
90+
)
91+
table = pa_csv.read_csv(str(file_path), convert_options=convert_options)
92+
except pa.lib.ArrowInvalid as e:
10493
raise ValueError(f"Failed to parse CSV file {file_path}: {e}")
10594
except UnicodeDecodeError as e:
10695
raise ValueError(f"Encoding error in CSV file {file_path}: {e}")
107-
except Exception as e:
108-
raise ValueError(f"Failed to process CSV file {file_path}: {e}")
96+
97+
if table.num_rows == 0:
98+
raise ValueError(f"CSV file is empty: {file_path}")
99+
100+
# Infer types from the Arrow schema (shared with Parquet handler)
101+
column_types = infer_column_types_from_arrow_schema(table.schema)
102+
103+
# Extract file properties
104+
file_size = file_path.stat().st_size
105+
sha256_hash = compute_file_hash(file_path)
106+
107+
# Determine encoding format based on file extension
108+
name_lower = file_path.name.lower()
109+
if name_lower.endswith(".csv.gz"):
110+
encoding_format = "application/gzip"
111+
elif name_lower.endswith(".csv.bz2"):
112+
encoding_format = "application/x-bzip2"
113+
elif name_lower.endswith(".csv.xz"):
114+
encoding_format = "application/x-xz"
115+
else:
116+
encoding_format = "text/csv"
117+
118+
return {
119+
"file_path": str(file_path),
120+
"file_name": file_path.name,
121+
"file_size": file_size,
122+
"sha256": sha256_hash,
123+
"encoding_format": encoding_format,
124+
"column_types": column_types,
125+
"num_rows": table.num_rows,
126+
"num_columns": table.num_columns,
127+
"columns": table.column_names,
128+
}
Lines changed: 10 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,22 @@
11
"""Parquet file handler for tabular event streams (e.g., MEDS)."""
22

33
from pathlib import Path
4-
from typing import Dict
54

6-
from croissant_maker.handlers.base_handler import FileTypeHandler
7-
from croissant_maker.handlers.utils import compute_file_hash
85
from pyarrow.parquet import ParquetFile
9-
import pyarrow.types as patypes
6+
7+
from croissant_maker.handlers.base_handler import FileTypeHandler
8+
from croissant_maker.handlers.utils import (
9+
compute_file_hash,
10+
infer_column_types_from_arrow_schema,
11+
)
1012

1113

1214
class ParquetHandler(FileTypeHandler):
1315
"""
1416
Handler for Parquet files (.parquet) with schema-based type inference.
1517
1618
- Uses pyarrow to read schema and row count without loading full data
17-
- Emits Croissant-compatible column types
19+
- Emits Croissant-compatible column types via shared infer_column_types_from_arrow_schema()
1820
- Computes SHA256 for reproducibility
1921
- Keeps memory usage minimal (schema-only)
2022
"""
@@ -32,13 +34,9 @@ def extract_metadata(self, file_path: Path) -> dict:
3234
schema = pq.schema_arrow
3335
num_rows = pq.metadata.num_rows if pq.metadata is not None else 0
3436

35-
column_types: Dict[str, str] = {}
36-
columns = []
37-
for field in schema:
38-
columns.append(field.name)
39-
column_types[field.name] = self._map_arrow_type_to_croissant(
40-
field.type, patypes
41-
)
37+
# Use the shared Arrow type mapper (same as CSV handler)
38+
column_types = infer_column_types_from_arrow_schema(schema)
39+
columns = [field.name for field in schema]
4240

4341
file_size = file_path.stat().st_size
4442
sha256_hash = compute_file_hash(file_path)
@@ -56,26 +54,3 @@ def extract_metadata(self, file_path: Path) -> dict:
5654
}
5755
except Exception as e:
5856
raise ValueError(f"Failed to process Parquet file {file_path}: {e}") from e
59-
60-
@staticmethod
61-
def _map_arrow_type_to_croissant(arrow_type, patypes) -> str:
62-
"""Map pyarrow types to Croissant schema.org data types."""
63-
try:
64-
if patypes.is_integer(arrow_type):
65-
return "sc:Integer"
66-
if patypes.is_floating(arrow_type) or patypes.is_decimal(arrow_type):
67-
return "sc:Float"
68-
if patypes.is_boolean(arrow_type):
69-
return "sc:Boolean"
70-
if patypes.is_timestamp(arrow_type):
71-
return "sc:Date"
72-
if patypes.is_date(arrow_type):
73-
return "sc:Date"
74-
if patypes.is_string(arrow_type) or patypes.is_large_string(arrow_type):
75-
return "sc:Text"
76-
if patypes.is_binary(arrow_type) or patypes.is_large_binary(arrow_type):
77-
return "sc:Text"
78-
except Exception:
79-
# Fallback to text for any exotic or extension types
80-
pass
81-
return "sc:Text"

0 commit comments

Comments (0)