@@ -90,12 +90,12 @@ class UniqueStats:
9090
9191class DataSourceInfo :
9292 """
93- Datasource information.
93+ Table data source information.
9494
9595 Notes
9696 -----
9797 This class should be sub-classed for specific
98- datasource types (e.g. Parquet, DataFrame, etc.).
98+ data source types (e.g. Parquet, DataFrame, etc.).
9999 The required properties/methods enable lazy
100100 sampling of the underlying datasource.
101101 """
@@ -117,6 +117,70 @@ def add_unique_stats_column(self, column: str) -> None:
117117 """Add a column needing unique-value information."""
118118
119119
class ColumnSourceInfo:
    """
    Source column information.

    Parameters
    ----------
    table_source_info
        Table data source information.
    column_name
        Column name in the data source.

    Notes
    -----
    This is a thin wrapper around DataSourceInfo that provides
    direct access to column-specific information. Unique-value
    statistics are only forwarded from the table source once this
    column has been marked with ``add_unique_stats_column`` (or when
    the caller passes ``force=True`` to ``unique_stats``).
    """

    __slots__ = ("_allow_unique_sampling", "column_name", "table_source_info")
    table_source_info: DataSourceInfo
    column_name: str
    _allow_unique_sampling: bool

    def __init__(self, table_source_info: DataSourceInfo, column_name: str) -> None:
        self.table_source_info = table_source_info
        self.column_name = column_name
        # Unique-value sampling stays disabled until explicitly requested
        # through add_unique_stats_column.
        self._allow_unique_sampling = False

    @property
    def row_count(self) -> ColumnStat[int]:
        """Data source row-count estimate."""
        return self.table_source_info.row_count

    def unique_stats(self, *, force: bool = False) -> UniqueStats:
        """
        Return unique-value statistics for a column.

        Parameters
        ----------
        force
            If True, return unique-value statistics even if the column
            wasn't marked as needing unique-value information.

        Returns
        -------
        The table source's unique-value statistics for this column, or
        a default-constructed UniqueStats when sampling is not allowed.
        """
        # Avoid sampling unique-stats if this column
        # wasn't marked as needing unique-stats.
        if force or self._allow_unique_sampling:
            return self.table_source_info.unique_stats(self.column_name)
        return UniqueStats()

    @property
    def storage_size(self) -> ColumnStat[int]:
        """Return the average column size for a single file."""
        return self.table_source_info.storage_size(self.column_name)

    def add_unique_stats_column(self, column: str | None = None) -> None:
        """
        Add a column needing unique-value information.

        Parameters
        ----------
        column
            Name of the column to register with the table source.
            Defaults to this object's own column when None.
        """
        # Select the default with an identity check: an explicit (even
        # empty-string) column name must be forwarded unchanged rather
        # than silently replaced by this wrapper's own column.
        target = self.column_name if column is None else column
        if target == self.column_name:
            self._allow_unique_sampling = True
        self.table_source_info.add_unique_stats_column(target)
183+
120184class ColumnStats :
121185 """
122186 Column statistics.
@@ -128,34 +192,29 @@ class ColumnStats:
128192 children
129193 Child ColumnStats objects.
130194 source_info
131- Datasource information.
132- source_name
133- Source-column name.
195+ Column source information.
134196 unique_stats
135197 Unique-value statistics.
136198 """
137199
138- __slots__ = ("children" , "name" , "source_info" , "source_name" , " unique_stats" )
200+ __slots__ = ("children" , "name" , "source_info" , "unique_stats" )
139201
140202 name : str
141203 children : tuple [ColumnStats , ...]
142- source_info : DataSourceInfo
143- source_name : str
204+ source_info : ColumnSourceInfo
144205 unique_stats : UniqueStats
145206
146207 def __init__ (
147208 self ,
148209 name : str ,
149210 * ,
150211 children : tuple [ColumnStats , ...] = (),
151- source_info : DataSourceInfo | None = None ,
152- source_name : str | None = None ,
212+ source_info : ColumnSourceInfo | None = None ,
153213 unique_stats : UniqueStats | None = None ,
154214 ) -> None :
155215 self .name = name
156216 self .children = children
157- self .source_info = source_info or DataSourceInfo ()
158- self .source_name = source_name or name
217+ self .source_info = source_info or ColumnSourceInfo (DataSourceInfo (), name )
159218 self .unique_stats = unique_stats or UniqueStats ()
160219
161220 def new_parent (
@@ -184,7 +243,6 @@ def new_parent(
184243 children = (self ,),
185244 # Want to reference the same DataSourceInfo
186245 source_info = self .source_info ,
187- source_name = self .source_name ,
188246 # Want fresh UniqueStats so we can mutate in place
189247 unique_stats = UniqueStats (),
190248 )
@@ -195,6 +253,11 @@ class StatsCollector:
195253
196254 __slots__ = ("column_stats" , "row_count" )
197255
256+ row_count : dict [IR , ColumnStat [int ]]
257+ """Estimated row count for each IR node."""
258+ column_stats : dict [IR , dict [str , ColumnStats ]]
259+ """Column statistics for each IR node."""
260+
198261 def __init__ (self ) -> None :
199262 self .row_count : dict [IR , ColumnStat [int ]] = {}
200263 self .column_stats : dict [IR , dict [str , ColumnStats ]] = {}
0 commit comments