Skip to content

Commit 5d45115

Browse files
feat: PostgresCatalog and PostgresTable followups (#5508)
## Changes Made From user feedback: - Enable "vector" extension by default, if available - Make Row Level Security opt-out, following Supabase behaviour - Add docs - Cast `vector` to `embedding` correctly on `read_sql()`
1 parent 670eecc commit 5d45115

File tree

6 files changed

+418
-28
lines changed

6 files changed

+418
-28
lines changed

daft/catalog/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -313,21 +313,21 @@ def from_glue(
313313
raise ImportError("AWS Glue support not installed: pip install -U 'daft[aws]'")
314314

315315
@staticmethod
316-
def from_postgres(connection_string: str, extensions: list[str] | None = None) -> Catalog:
316+
def from_postgres(connection_string: str, extensions: list[str] | None = ["vector"]) -> Catalog:
317317
"""Create a Daft Catalog from a PostgreSQL connection string.
318318
319-
Note::
320-
This is an experimental feature and the API may change in the future.
321-
322319
Args:
323320
connection_string (str): a PostgreSQL connection string
324321
extensions (list[str], optional): List of PostgreSQL extensions to create if they don't exist.
325322
For each extension, "CREATE EXTENSION IF NOT EXISTS <extension>" will be executed.
326-
Defaults to None (no extensions).
323+
Defaults to ["vector"] (pgvector extension, if available).
327324
328325
Returns:
329326
Catalog: a new Catalog instance to a PostgreSQL database.
330327
328+
Warning:
329+
This feature is early in development and will likely experience API changes.
330+
331331
Examples:
332332
>>> catalog = Catalog.from_postgres("postgresql://user:password@host:port/database")
333333
>>> catalog = Catalog.from_postgres(

daft/catalog/__postgres.py

Lines changed: 150 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,47 @@
1313
from daft.datatype import DataType
1414
from daft.expressions import col
1515
from daft.io._sql import read_sql
16+
from daft.logical.schema import Field
1617

1718

1819
@contextmanager
19-
def postgres_connection(connection_string: str, extensions: list[str] | None = None) -> psycopg.Connection.connect:
20+
def postgres_connection(connection_string: str, extensions: list[str] | None) -> psycopg.Connection.connect:
2021
"""Context manager that provides a PostgreSQL connection with specified extensions setup.
2122
2223
Args:
2324
connection_string: PostgreSQL connection string
24-
extensions: List of extension names to create if they don't exist. For each extension,
25-
"CREATE EXTENSION IF NOT EXISTS <extension>" will be executed.
25+
extensions: List of extension names to create if they don't exist and are available.
26+
For each extension, availability is checked in pg_available_extensions before
27+
attempting "CREATE EXTENSION IF NOT EXISTS <extension>".
2628
"""
2729
with psycopg.connect(connection_string) as conn:
2830
if extensions:
29-
for extension in extensions:
30-
conn.execute(
31-
psycopg.sql.SQL("CREATE EXTENSION IF NOT EXISTS {}").format(psycopg.sql.Identifier(extension))
32-
)
33-
# Special handling for vector extension - register pgvector types.
34-
if "vector" in extensions:
35-
register_vector(conn)
31+
with conn.cursor() as cur:
32+
for extension in extensions:
33+
# Check if extension is available before attempting to create it
34+
cur.execute(
35+
psycopg.sql.SQL("SELECT EXISTS(SELECT 1 FROM pg_available_extensions WHERE name = {})").format(
36+
psycopg.sql.Literal(extension)
37+
)
38+
)
39+
result = cur.fetchone()
40+
is_available = result[0] if result else False
41+
42+
if is_available:
43+
cur.execute(
44+
psycopg.sql.SQL("CREATE EXTENSION IF NOT EXISTS {}").format(
45+
psycopg.sql.Identifier(extension)
46+
)
47+
)
48+
49+
# Register pgvector type if it was successfully created
50+
if "vector" in extensions:
51+
cur.execute(psycopg.sql.SQL("SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector')"))
52+
result = cur.fetchone()
53+
vector_installed = result[0] if result else False
54+
55+
if vector_installed:
56+
register_vector(conn)
3657
yield conn
3758

3859

@@ -187,7 +208,7 @@ def __init__(self) -> None:
187208
raise RuntimeError("PostgresCatalog.__init__ is not supported, please use `Catalog.from_postgres` instead.")
188209

189210
@staticmethod
190-
def from_uri(uri: str, extensions: list[str] | None = None, **options: str | None) -> PostgresCatalog:
211+
def from_uri(uri: str, extensions: list[str] | None, **options: str | None) -> PostgresCatalog:
191212
"""Create a PostgresCatalog from a connection string."""
192213
validate_connection_string(uri)
193214
c = PostgresCatalog.__new__(PostgresCatalog)
@@ -236,7 +257,7 @@ def _create_table(
236257
Args:
237258
identifier (Identifier): The identifier of the table to create.
238259
schema (Schema): The schema of the table to create.
239-
properties (Properties): The properties of the table to create. One supported property is "enable_rls" (bool), which enables Row Level Security by default. See: https://www.postgresql.org/docs/current/ddl-rowsecurity.html
260+
properties (Properties): The properties of the table to create. One supported property is "enable_rls" (bool), which enables Row Level Security. This property is set to True by default. See: https://www.postgresql.org/docs/current/ddl-rowsecurity.html
240261
partition_fields (list[PartitionField]): The partition fields of the table to create.
241262
242263
Returns:
@@ -280,7 +301,7 @@ def _create_table(
280301
)
281302
)
282303

283-
if properties and properties.get("enable_rls", False):
304+
if properties is None or properties.get("enable_rls", True):
284305
cur.execute(
285306
psycopg.sql.SQL("ALTER TABLE {} ENABLE ROW LEVEL SECURITY").format(quoted_full_table)
286307
)
@@ -480,7 +501,113 @@ def name(self) -> str:
480501

481502
def schema(self) -> Schema:
482503
"""Returns the table's schema."""
483-
return self.read().schema()
504+
connection_string, identifier = self._inner
505+
506+
if len(identifier) == 1:
507+
# When no schema is specified, PostgreSQL uses the schema search path to select the schema to use.
508+
# Since this is user-configurable, we simply pass along the single identifier to PostgreSQL.
509+
# See: https://www.postgresql.org/docs/current/ddl-schemas.html#DDL-SCHEMAS-PATH
510+
schema_name = None
511+
table_name = identifier[0]
512+
elif len(identifier) == 2:
513+
schema_name = identifier[0]
514+
table_name = identifier[1]
515+
else:
516+
raise ValueError(f"Invalid table identifier: {identifier}")
517+
518+
# Query the database schema to get column information
519+
with postgres_connection(connection_string, self._extensions) as conn:
520+
with conn.cursor() as cur:
521+
if schema_name:
522+
cur.execute(
523+
psycopg.sql.SQL("""
524+
SELECT
525+
c.column_name,
526+
c.data_type,
527+
c.udt_name,
528+
CASE
529+
WHEN c.data_type = 'USER-DEFINED' AND c.udt_name = 'vector'
530+
THEN a.atttypmod
531+
ELSE NULL
532+
END as vector_dimension
533+
FROM information_schema.columns c
534+
JOIN pg_class cls ON cls.relname = c.table_name
535+
JOIN pg_namespace nsp ON nsp.oid = cls.relnamespace AND nsp.nspname = c.table_schema
536+
LEFT JOIN pg_attribute a ON a.attrelid = cls.oid AND a.attname = c.column_name
537+
WHERE c.table_schema = {} AND c.table_name = {}
538+
ORDER BY c.ordinal_position
539+
""").format(psycopg.sql.Literal(schema_name), psycopg.sql.Literal(table_name)),
540+
)
541+
else:
542+
cur.execute(
543+
psycopg.sql.SQL("""
544+
SELECT
545+
c.column_name,
546+
c.data_type,
547+
c.udt_name,
548+
CASE
549+
WHEN c.data_type = 'USER-DEFINED' AND c.udt_name = 'vector'
550+
THEN a.atttypmod
551+
ELSE NULL
552+
END as vector_dimension
553+
FROM information_schema.columns c
554+
JOIN pg_class cls ON cls.relname = c.table_name
555+
LEFT JOIN pg_attribute a ON a.attrelid = cls.oid AND a.attname = c.column_name
556+
WHERE c.table_name = {}
557+
ORDER BY c.ordinal_position
558+
""").format(psycopg.sql.Literal(table_name)),
559+
)
560+
561+
columns = cur.fetchall()
562+
563+
# If no columns found, fall back to data-based inference
564+
if not columns:
565+
return self.read().schema()
566+
567+
# Build schema from database metadata
568+
fields = []
569+
for column_name, data_type, udt_name, vector_dimension in columns:
570+
if data_type == "USER-DEFINED" and udt_name == "vector":
571+
# This is a pgvector column, convert to embedding type
572+
# vector_dimension from atttypmod contains the dimension information
573+
# For pgvector, atttypmod stores the dimension directly
574+
dimension = vector_dimension if vector_dimension and vector_dimension > 0 else 0
575+
576+
if dimension > 0:
577+
fields.append(Field.create(column_name, DataType.embedding(DataType.float32(), dimension)))
578+
else:
579+
# Fallback to list if we can't determine dimension
580+
fields.append(Field.create(column_name, DataType.list(DataType.float32())))
581+
else:
582+
# For non-vector columns, try direct PostgreSQL type mapping first
583+
try:
584+
# Attempt to map PostgreSQL type directly to Daft type
585+
inferred_dtype = DataType.from_sql(data_type)
586+
fields.append(Field.create(column_name, inferred_dtype))
587+
except Exception:
588+
# Fall back to data-based inference for unmappable types
589+
# This is inefficient but ensures we get the correct types
590+
if schema_name:
591+
single_col_query = psycopg.sql.SQL("SELECT {} FROM {}.{} LIMIT 1").format(
592+
psycopg.sql.Identifier(column_name),
593+
psycopg.sql.Identifier(schema_name),
594+
psycopg.sql.Identifier(table_name),
595+
)
596+
else:
597+
single_col_query = psycopg.sql.SQL("SELECT {} FROM {} LIMIT 1").format(
598+
psycopg.sql.Identifier(column_name),
599+
psycopg.sql.Identifier(table_name),
600+
)
601+
602+
try:
603+
single_col_df = read_sql(single_col_query.as_string(), connection_string)
604+
inferred_dtype = single_col_df.schema()[column_name].dtype
605+
fields.append(Field.create(column_name, inferred_dtype))
606+
except Exception:
607+
# If inference fails, use string as fallback
608+
fields.append(Field.create(column_name, DataType.string()))
609+
610+
return Schema._from_fields(fields)
484611

485612
@staticmethod
486613
def _from_obj(obj: object) -> PostgresTable:
@@ -510,12 +637,20 @@ def read(
510637

511638
query = psycopg.sql.SQL("SELECT * FROM {}").format(quoted_full_table)
512639

513-
return read_sql(
640+
df = read_sql(
514641
query.as_string(),
515642
connection_string,
516643
**options,
517644
)
518645

646+
# Cast any vector columns that were read as lists to embeddings
647+
schema = self.schema() # Use our custom schema method
648+
for field in schema:
649+
if field.dtype.is_embedding():
650+
df = df.with_column(field.name, df[field.name].cast(field.dtype))
651+
652+
return df
653+
519654
def append(self, df: DataFrame, **options: Any) -> None:
520655
"""Append the DataFrame to the table."""
521656
connection_string, identifier = self._inner

docs/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
* [Delta Lake](connectors/delta_lake.md)
3636
* [Lance](connectors/lance.md)
3737
* [Hugging Face Datasets](connectors/huggingface.md)
38+
* [Postgres](connectors/postgres.md)
3839
* [S3](connectors/aws.md)
3940
* [SQL Databases](connectors/sql.md)
4041
* [Turbopuffer](connectors/turbopuffer.md)

docs/connectors/index.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,14 @@ See also [Lance](lance.md) for detailed integration.
7373
| [`write_parquet`][daft.dataframe.DataFrame.write_parquet] | Write a DataFrame to Parquet files |
7474

7575

76+
## PostgreSQL
77+
78+
| Function | Description |
79+
|---------------------------------------------------------|---------------------------------------------------|
80+
| [`Catalog.from_postgres`][daft.catalog.Catalog.from_postgres] | Create a catalog from a PostgreSQL database |
81+
82+
See also [PostgreSQL](postgres.md) for detailed integration.
83+
7684
## SQL
7785

7886
| Function | Description |

0 commit comments

Comments
 (0)