
Commit 132cd2b

Merge branch 'branch-25.10' of github.com:rapidsai/cudf into branch-25.10
2 parents: 1581500 + 3ae9ff8

16 files changed: +302, -140 lines


python/cudf/cudf/pandas/_wrappers/pandas.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -1132,7 +1132,8 @@ def _find_user_frame():
     frame = inspect.currentframe()
     while frame:
         modname = frame.f_globals.get("__name__", "")
-        if modname == "__main__" or not modname.startswith("cudf."):
+        # TODO: Remove "nvtx." entry once we cross nvtx-0.2.11 as minimum version
+        if modname == "__main__" or not modname.startswith(("cudf.", "nvtx.")):
             return frame
         frame = frame.f_back
     raise RuntimeError("Could not find the user's frame.")
```
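For context, the helper walks the Python call stack until it leaves wrapper code. A minimal standalone sketch of that technique (standard library only; the names printed depend on the call site):

```python
import inspect

def calling_module_names() -> list[str]:
    """Collect the module name of every frame on the current call stack."""
    names = []
    frame = inspect.currentframe()
    while frame:
        # Each frame exposes its module name via f_globals["__name__"];
        # this is what lets wrapper frames from other packages (here: nvtx)
        # be filtered out by prefix.
        names.append(frame.f_globals.get("__name__", ""))
        frame = frame.f_back
    return names

print(calling_module_names())  # e.g. ["__main__"] when run as a script
```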

python/cudf_polars/cudf_polars/containers/dataframe.py

Lines changed: 14 additions & 5 deletions

```diff
@@ -29,17 +29,26 @@
 def _create_polars_column_metadata(
     name: str, dtype: PolarsDataType
 ) -> plc.interop.ColumnMetadata:
-    """Create ColumnMetadata preserving pl.Struct field names."""
+    """Create ColumnMetadata preserving dtype attributes not supported by libcudf."""
+    children_meta = []
+    timezone = ""
+    precision: int | None = None
+
     if isinstance(dtype, pl.Struct):
         children_meta = [
             _create_polars_column_metadata(field.name, field.dtype)
             for field in dtype.fields
         ]
-    else:
-        children_meta = []
-    timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None
+    elif isinstance(dtype, pl.Datetime):
+        timezone = dtype.time_zone or timezone
+    elif isinstance(dtype, pl.Decimal):
+        precision = dtype.precision
+
     return plc.interop.ColumnMetadata(
-        name=name, timezone=timezone or "", children_meta=children_meta
+        name=name,
+        timezone=timezone,
+        precision=precision,
+        children_meta=children_meta,
     )
```
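A hedged usage sketch of the updated helper (constructor keywords are taken from the diff above; `_create_polars_column_metadata` is module-internal, and the example dtypes are illustrative):

```python
import polars as pl

# Decimal: precision travels in the metadata, since the libcudf
# DECIMAL128 type id only carries the scale.
_create_polars_column_metadata("amount", pl.Decimal(precision=10, scale=2))

# Datetime: the timezone string is preserved; naive datetimes keep "".
_create_polars_column_metadata("ts", pl.Datetime("us", "UTC"))
```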

python/cudf_polars/cudf_polars/containers/datatype.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -81,6 +81,8 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType:
         assert_never(dtype.time_unit)
     elif isinstance(dtype, pl.String):
         return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Decimal):
+        return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale)
     elif isinstance(dtype, pl.Null):
         # TODO: Hopefully
         return plc.DataType(plc.TypeId.EMPTY)
```
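The sign flip follows libcudf's fixed_point convention (value = mantissa × 10^scale), whereas Polars counts scale as digits after the decimal point. A quick sanity check of the mapping in plain Python:

```python
polars_scale = 2                 # pl.Decimal(scale=2): two fractional digits
libcudf_scale = -polars_scale    # fixed_point scale for the same values
mantissa = 123                   # integer representation of 1.23 at scale 2
value = mantissa * 10 ** libcudf_scale  # 123 * 10**-2 == 1.23
```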

python/cudf_polars/cudf_polars/dsl/utils/aggregations.py

Lines changed: 33 additions & 14 deletions

```diff
@@ -163,12 +163,30 @@ def decompose_single_agg(
     is_median = agg.name == "median"
     is_quantile = agg.name == "quantile"

+    # quantile agg on decimal: unsupported -> keep dtype Decimal
+    # mean/median on decimal: Polars returns float -> pre-cast
+    decimal_unsupported = False
+    if plc.traits.is_fixed_point(child_dtype):
+        if is_quantile:
+            decimal_unsupported = True
+        elif agg.name in {"mean", "median"}:
+            tid = agg.dtype.plc.id()
+            if tid in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}:
+                cast_to = (
+                    DataType(pl.Float64)
+                    if tid == plc.TypeId.FLOAT64
+                    else DataType(pl.Float32)
+                )
+                child = expr.Cast(cast_to, child)
+                child_dtype = child.dtype.plc
+
     is_group_quantile_supported = plc.traits.is_integral(
         child_dtype
     ) or plc.traits.is_floating_point(child_dtype)

     unsupported = (
-        (is_median or is_quantile) and not is_group_quantile_supported
+        decimal_unsupported
+        or ((is_median or is_quantile) and not is_group_quantile_supported)
     ) or (not plc.aggregation.is_valid_aggregation(child_dtype, req))
     if unsupported:
         return [], named_expr.reconstruct(expr.Literal(child.dtype, None))
@@ -177,19 +195,12 @@ def decompose_single_agg(
     # The aggregation is just reconstructed with the new
     # (potentially masked) child. This is safe because we recursed
     # to ensure there are no nested aggregations.
-        return (
-            [(named_expr.reconstruct(agg.reconstruct([child])), True)],
-            named_expr.reconstruct(expr.Col(agg.dtype, name)),
-        )
-    elif agg.name in ("mean", "median", "quantile", "std", "var"):
-        # libcudf promotes these to float64; but polars
-        # keeps Float32, so cast back in post-processing.
-        named = expr.NamedExpr(name, agg)
-        post_col: expr.Expr = expr.Col(DataType(pl.Float64()), name)
-        if agg.dtype.plc.id() == plc.TypeId.FLOAT32:
-            post_col = expr.Cast(agg.dtype, post_col)
-        return [(named, True)], expr.NamedExpr(name, post_col)
-    elif agg.name == "sum":
+
+    # rebuild the agg with the transformed child
+    new_children = [child] if not is_quantile else [child, agg.children[1]]
+    named_expr = named_expr.reconstruct(agg.reconstruct(new_children))
+
+    if agg.name == "sum":
         col = (
             expr.Cast(agg.dtype, expr.Col(DataType(pl.datatypes.Int64()), name))
             if (
@@ -235,6 +246,14 @@ def decompose_single_agg(
         return [(named_expr, True), (win_len, True)], expr.NamedExpr(
             name, post_ternary_expr
         )
+    elif agg.name in {"mean", "median", "quantile", "std", "var"}:
+        post_agg_col: expr.Expr = expr.Col(
+            DataType(pl.Float64()), name
+        )  # libcudf promotes to float64
+        if agg.dtype.plc.id() == plc.TypeId.FLOAT32:
+            # Cast back to float32 to match Polars
+            post_agg_col = expr.Cast(agg.dtype, post_agg_col)
+        return [(named_expr, True)], named_expr.reconstruct(post_agg_col)
     else:
         return [(named_expr, True)], named_expr.reconstruct(
             expr.Col(agg.dtype, name)
```
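From the user's side, the dtype rules encoded above follow Polars semantics: mean/median of a Decimal column comes back as a float, which is why the engine now pre-casts. A hedged illustration (example data only):

```python
import polars as pl

lf = pl.LazyFrame({"k": [1, 1, 2], "x": [1, 2, 3]}).with_columns(
    pl.col("x").cast(pl.Decimal(9, 2))
)
out = lf.group_by("k").agg(pl.col("x").mean()).collect()
print(out.schema)  # "x" is a float dtype, matching the pre-cast above;
                   # quantile on Decimal remains unsupported and is replaced
                   # with a null literal of the original dtype.
```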

python/cudf_polars/cudf_polars/experimental/base.py

Lines changed: 76 additions & 13 deletions

```diff
@@ -90,12 +90,12 @@ class UniqueStats:

 class DataSourceInfo:
     """
-    Datasource information.
+    Table data source information.

     Notes
     -----
     This class should be sub-classed for specific
-    datasource types (e.g. Parquet, DataFrame, etc.).
+    data source types (e.g. Parquet, DataFrame, etc.).
     The required properties/methods enable lazy
     sampling of the underlying datasource.
     """
@@ -117,6 +117,70 @@ def add_unique_stats_column(self, column: str) -> None:
         """Add a column needing unique-value information."""


+class ColumnSourceInfo:
+    """
+    Source column information.
+
+    Parameters
+    ----------
+    table_source_info
+        Table data source information.
+    column_name
+        Column name in the data source.
+
+    Notes
+    -----
+    This is a thin wrapper around DataSourceInfo that provides
+    direct access to column-specific information.
+    """
+
+    __slots__ = ("_allow_unique_sampling", "column_name", "table_source_info")
+    table_source_info: DataSourceInfo
+    column_name: str
+    _allow_unique_sampling: bool
+
+    def __init__(self, table_source_info: DataSourceInfo, column_name: str) -> None:
+        self.table_source_info = table_source_info
+        self.column_name = column_name
+        self._allow_unique_sampling = False
+
+    @property
+    def row_count(self) -> ColumnStat[int]:
+        """Data source row-count estimate."""
+        return self.table_source_info.row_count
+
+    def unique_stats(self, *, force: bool = False) -> UniqueStats:
+        """
+        Return unique-value statistics for a column.
+
+        Parameters
+        ----------
+        force
+            If True, return unique-value statistics even if the column
+            wasn't marked as needing unique-value information.
+        """
+        return (
+            self.table_source_info.unique_stats(self.column_name)
+            # Avoid sampling unique-stats if this column
+            # wasn't marked as needing unique-stats.
+            if force or self._allow_unique_sampling
+            else UniqueStats()
+        )
+
+    @property
+    def storage_size(self) -> ColumnStat[int]:
+        """Return the average column size for a single file."""
+        return self.table_source_info.storage_size(self.column_name)
+
+    def add_unique_stats_column(self, column: str | None = None) -> None:
+        """Add a column needing unique-value information."""
+        if column in (None, self.column_name):
+            self._allow_unique_sampling = True
+        return self.table_source_info.add_unique_stats_column(
+            column or self.column_name
+        )
+
+
 class ColumnStats:
     """
     Column statistics.
@@ -128,34 +192,29 @@ class ColumnStats:
     children
         Child ColumnStats objects.
     source_info
-        Datasource information.
-    source_name
-        Source-column name.
+        Column source information.
     unique_stats
         Unique-value statistics.
     """

-    __slots__ = ("children", "name", "source_info", "source_name", "unique_stats")
+    __slots__ = ("children", "name", "source_info", "unique_stats")

     name: str
     children: tuple[ColumnStats, ...]
-    source_info: DataSourceInfo
-    source_name: str
+    source_info: ColumnSourceInfo
     unique_stats: UniqueStats

     def __init__(
         self,
         name: str,
         *,
         children: tuple[ColumnStats, ...] = (),
-        source_info: DataSourceInfo | None = None,
-        source_name: str | None = None,
+        source_info: ColumnSourceInfo | None = None,
         unique_stats: UniqueStats | None = None,
     ) -> None:
         self.name = name
         self.children = children
-        self.source_info = source_info or DataSourceInfo()
-        self.source_name = source_name or name
+        self.source_info = source_info or ColumnSourceInfo(DataSourceInfo(), name)
         self.unique_stats = unique_stats or UniqueStats()

     def new_parent(
@@ -184,7 +243,6 @@ def new_parent(
             children=(self,),
             # Want to reference the same DataSourceInfo
             source_info=self.source_info,
-            source_name=self.source_name,
             # Want fresh UniqueStats so we can mutate in place
             unique_stats=UniqueStats(),
         )
@@ -195,6 +253,11 @@ class StatsCollector:

     __slots__ = ("column_stats", "row_count")

+    row_count: dict[IR, ColumnStat[int]]
+    """Estimated row count for each IR node."""
+    column_stats: dict[IR, dict[str, ColumnStats]]
+    """Column statistics for each IR node."""
+
     def __init__(self) -> None:
         self.row_count: dict[IR, ColumnStat[int]] = {}
         self.column_stats: dict[IR, dict[str, ColumnStats]] = {}
```
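A short usage sketch of the sampling gate in `ColumnSourceInfo.unique_stats` (assumes `source` is some concrete `DataSourceInfo` subclass, since the base class is meant to be sub-classed):

```python
col_info = ColumnSourceInfo(source, "price")

col_info.unique_stats()             # UniqueStats(): not marked, so no sampling
col_info.add_unique_stats_column()  # mark "price" for unique-value sampling
col_info.unique_stats()             # now delegates to source.unique_stats("price")
col_info.unique_stats(force=True)   # samples regardless of the marking
```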

python/cudf_polars/cudf_polars/experimental/io.py

Lines changed: 7 additions & 8 deletions

```diff
@@ -19,6 +19,7 @@

 from cudf_polars.dsl.ir import IR, DataFrameScan, Empty, Scan, Sink, Union
 from cudf_polars.experimental.base import (
+    ColumnSourceInfo,
     ColumnStat,
     ColumnStats,
     DataSourceInfo,
@@ -118,8 +119,8 @@ def from_scan(ir: Scan, config_options: ConfigOptions) -> ScanPartitionPlan:
     blocksize: int = config_options.executor.target_partition_size
     column_stats = _extract_scan_stats(ir, config_options)
     column_sizes: list[int] = []
-    for name, cs in column_stats.items():
-        storage_size = cs.source_info.storage_size(name)
+    for cs in column_stats.values():
+        storage_size = cs.source_info.storage_size
         if storage_size.value is not None:
             column_sizes.append(storage_size.value)
@@ -821,16 +822,15 @@ def _extract_scan_stats(
 ) -> dict[str, ColumnStats]:
     """Extract base ColumnStats for a Scan node."""
     if ir.typ == "parquet":
-        source_info = _sample_pq_stats(
+        table_source_info = _sample_pq_stats(
             tuple(ir.paths),
             config_options.parquet_options.max_footer_samples,
             config_options.parquet_options.max_row_group_samples,
         )
         return {
             name: ColumnStats(
                 name=name,
-                source_info=source_info,
-                source_name=name,
+                source_info=ColumnSourceInfo(table_source_info, name),
             )
             for name in ir.schema
         }
@@ -879,12 +879,11 @@ def unique_stats(self, column: str) -> UniqueStats:

 def _extract_dataframescan_stats(ir: DataFrameScan) -> dict[str, ColumnStats]:
     """Extract base ColumnStats for a DataFrameScan node."""
-    source_info = DataFrameSourceInfo(ir.df)
+    table_source_info = DataFrameSourceInfo(ir.df)
     return {
         name: ColumnStats(
             name=name,
-            source_info=source_info,
-            source_name=name,
+            source_info=ColumnSourceInfo(table_source_info, name),
         )
         for name in ir.schema
     }
```

python/cudf_polars/cudf_polars/experimental/statistics.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -66,9 +66,8 @@ def _update_unique_stats_columns(
         if (
             name not in unique_fraction
             and (column_stats := child_column_stats.get(name)) is not None
-            and (source_stats := column_stats.source_info) is not None
         ):
-            source_stats.add_unique_stats_column(column_stats.source_name or name)
+            column_stats.source_info.add_unique_stats_column()


 @initialize_column_stats.register(IR)
```

python/cudf_polars/cudf_polars/testing/plugin.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -174,6 +174,8 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899",
 }
@@ -191,6 +193,7 @@ def pytest_configure(config: pytest.Config) -> None:
     # Tests performance difference of CPU engine
     "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine",
     "tests/unit/operations/namespaces/list/test_list.py::test_list_struct_field_perf": "Tests CPU Engine perf",
+    "tests/benchmark/test_with_columns.py::test_with_columns_quadratic_19503": "Tests performance bug in CPU engine",
     # The test may segfault with the legacy streaming engine. We should
     # remove this skip when all polars tests use the new streaming engine.
     "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
```

python/cudf_polars/docs/overview.md

Lines changed: 4 additions & 1 deletion

```diff
@@ -417,8 +417,11 @@
   datasource (e.g. a Parquet dataset or in-memory `DataFrame`).
   **aggregated** column sampling via sub-classing. For example,
   the `ParquetSourceInfo` sub-class uses caching to avoid
   redundant file-system access.
+- `ColumnSourceInfo`: This class wraps a `DataSourceInfo` object.
+  Since `DataSourceInfo` tracks information for an entire table, we use
+  `ColumnSourceInfo` to provide a single-column view of the object.
 - `ColumnStats`: This class is used to group together the "base"
-  `DataSourceInfo` reference and the current `UniqueStats` estimates
+  `ColumnSourceInfo` reference and the local `UniqueStats` estimates
   for a specific IR + column combination. We bundle these references
   together to simplify the design and maintenance of `StatsCollector`.
   **NOTE:** The current `UniqueStats` estimates are not yet populated.
```
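Putting the three classes together, a hedged sketch of how they compose (the default in `ColumnStats.__init__` builds exactly this chain; real code would pass a concrete `DataSourceInfo` subclass):

```python
table_info = DataSourceInfo()                  # table-level source information
col_info = ColumnSourceInfo(table_info, "a")   # single-column view of the table
stats = ColumnStats(name="a", source_info=col_info)

stats.source_info.row_count  # delegates through to the table-level estimate
```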
