diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index e7629ace..3d62f10d 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -378,7 +378,7 @@ def get_subcolumns(self, nested_columns="all") -> list[str]: return subcols @deprecated( - version="0.6.0", reason="`add_nested` will be removed in version 0.7.0, " "use `join_nested` instead." + version="0.6.0", reason="`add_nested` will be removed in version 0.7.0, use `join_nested` instead." ) def add_nested( self, @@ -1828,9 +1828,7 @@ def sort_values( return None return new_df - @deprecated( - version="0.6.0", reason="`reduce` will be removed in version 0.7.0, " "use `map_rows` instead." - ) + @deprecated(version="0.6.0", reason="`reduce` will be removed in version 0.7.0, use `map_rows` instead.") def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs) -> NestedFrame: # type: ignore[override] """ Takes a function and applies it to each top-level row of the NestedFrame. diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 750e616f..c5f2ed7f 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -271,29 +271,58 @@ def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **k def _validate_structs_from_schema(data, columns=None, filesystem=None): - """Validate that nested columns are structs""" - if columns is not None: - schema = pq.read_schema(data, filesystem=filesystem) - for col in columns: - # check if column is a partial load of a nested structure - if "." in col: - # first check if column exists as a top-level column - if col in schema.names: - continue - # if not, inspect the base column name type - else: - if col.split(".")[0] in schema.names: - # check if the column is a list-struct - col_type = schema.field(col.split(".")[0]).type - if not pa.types.is_struct(col_type): - base_col = col.split(".")[0] - raise ValueError( - f"The provided column '{col}' signals to partially load a nested structure, " - f"but the nested structure '{base_col}' is not a struct. " - "Partial loading of nested structures is only supported for struct of list " - f"columns. To resolve this, fully load the column '{base_col}' " - f"instead of partially loading it and perform column selection afterwards." - ) + """Validate that columns specified for partial loading are valid struct types. + + This function validates that when attempting to partially load nested columns + from a Parquet file (e.g., loading "nested.a" instead of the full "nested" + column), the base column being partially loaded is actually a struct type. + Partial loading of nested structures is only supported for struct of list + columns, not for list of struct columns. + + Parameters + ---------- + data : str, Path, UPath, or file-like object + Path to the parquet file or file-like object to inspect. + columns : list of str, optional + List of column names to validate. If None, no validation is performed. + Columns containing "." are checked to ensure their base column name + (the part before the ".") refers to a struct type in the schema. + filesystem : pyarrow.fs.FileSystem, optional + PyArrow filesystem object to use when reading the schema. If None, + the default filesystem for the given path is used. + + Raises + ------ + ValueError + If a column in the partial load format (e.g., "nested.a") is specified + but the base column ("nested") is not a struct type. This indicates + the data structure doesn't support partial loading as nested-pandas + requires struct of list columns, not list of struct columns. + """ + if columns is None: + return + schema = pq.read_schema(data, filesystem=filesystem) + for col in columns: + # check if column is a partial load of a nested structure + if "." not in col: + continue + # check if column exists as a top-level column + if col in schema.names: + continue + # if not, inspect the base column name type + base_col = col.split(".")[0] + if base_col not in schema.names: + continue + # check if the base column is a list-struct + col_type = schema.field(base_col).type + if not pa.types.is_struct(col_type): + raise ValueError( + f"The provided column '{col}' signals to partially load a nested structure, " + f"but the nested structure '{base_col}' is not a struct. " + "Partial loading of nested structures is only supported for struct of list " + f"columns. To resolve this, fully load the column '{base_col}' " + f"instead of partially loading it and perform column selection afterwards." + ) def _is_local_dir(upath: UPath, is_dir: bool | None) -> bool: diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 25511455..b78faed6 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -150,9 +150,7 @@ def flat_length(self) -> int: return self._series.array.flat_length @property - @deprecated( - version="0.6.0", reason="`fields` will be removed in version 0.7.0, " "use `columns` instead." - ) + @deprecated(version="0.6.0", reason="`fields` will be removed in version 0.7.0, use `columns` instead.") def fields(self) -> list[str]: """Names of the nested columns""" return self.columns @@ -171,7 +169,7 @@ def flat_index(self) -> pd.Index: return flat_index @deprecated( - version="0.6.0", reason="`with_field` will be removed in version 0.7.0, " "use `set_column` instead." + version="0.6.0", reason="`with_field` will be removed in version 0.7.0, use `set_column` instead." ) def with_field(self, field: str, value: ArrayLike) -> NestedSeries: """Set the field from flat-array of values and return a new series @@ -241,7 +239,7 @@ def set_column(self, column: str, value: ArrayLike) -> NestedSeries: @deprecated( version="0.6.0", - reason="`with_flat_field` will be removed in version 0.7.0, " "use `set_flat_column` instead.", + reason="`with_flat_field` will be removed in version 0.7.0, use `set_flat_column` instead.", ) def with_flat_field(self, field: str, value: ArrayLike) -> NestedSeries: """Set the field from flat-array of values and return a new series @@ -311,7 +309,7 @@ def set_flat_column(self, column: str, value: ArrayLike) -> NestedSeries: @deprecated( version="0.6.0", - reason="`with_list_field` will be removed in version 0.7.0, " "use `set_list_column` instead.", + reason="`with_list_field` will be removed in version 0.7.0, use `set_list_column` instead.", ) def with_list_field(self, field: str, value: ArrayLike) -> NestedSeries: """Set the field from list-array of values and return a new series @@ -385,7 +383,7 @@ def set_list_column(self, column: str, value: ArrayLike) -> NestedSeries: @deprecated( version="0.6.0", - reason="`with_filled_field` will be removed in version 0.7.0, " "use `set_filled_column` instead.", + reason="`with_filled_field` will be removed in version 0.7.0, use `set_filled_column` instead.", ) def with_filled_field(self, field: str, value: ArrayLike) -> NestedSeries: """Set the field by repeating values and return a new series @@ -464,7 +462,7 @@ def set_filled_column(self, column: str, value: ArrayLike) -> NestedSeries: return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name) @deprecated( - version="0.6.0", reason="`without_field` will be removed in version 0.7.0, " "use `drop` instead." + version="0.6.0", reason="`without_field` will be removed in version 0.7.0, use `drop` instead." ) def without_field(self, field: str | list[str]) -> NestedSeries: """Remove the field(s) from the series and return a new series @@ -533,9 +531,7 @@ def drop(self, column: str | list[str]) -> NestedSeries: new_array.pop_fields(column) return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name) - @deprecated( - version="0.6.0", reason="`query_flat` will be removed in version 0.7.0, " "use `query` instead." - ) + @deprecated(version="0.6.0", reason="`query_flat` will be removed in version 0.7.0, use `query` instead.") def query_flat(self, query: str) -> NestedSeries: """Query the flat arrays with a boolean expression @@ -610,7 +606,7 @@ def query(self, query: str) -> NestedSeries: @deprecated( version="0.6.0", - reason="`get_flat_index` will be removed in version 0.7.0, " "use the `flat_index` property instead.", + reason="`get_flat_index` will be removed in version 0.7.0, use the `flat_index` property instead.", ) def get_flat_index(self) -> pd.Index: """Index of the flat arrays @@ -635,7 +631,7 @@ def get_flat_index(self) -> pd.Index: @deprecated( version="0.6.0", - reason="`get_flat_series` will be removed in version 0.7.0, " "use `to_flat()[column]` instead.", + reason="`get_flat_series` will be removed in version 0.7.0, use `to_flat()[column]` instead.", ) def get_flat_series(self, field: str) -> pd.Series: """Get the flat-array field as a pd.Series @@ -692,7 +688,7 @@ def get_flat_series(self, field: str) -> pd.Series: @deprecated( version="0.6.0", - reason="`get_list_series` will be removed in version 0.7.0, " "use `to_lists()[column]` instead.", + reason="`get_list_series` will be removed in version 0.7.0, use `to_lists()[column]` instead.", ) def get_list_series(self, field: str) -> pd.Series: """Get the list-array field as a Series diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index 120f16b8..0802c300 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -216,7 +216,7 @@ def _struct_list_pa_dtype(self) -> pa.StructType: @classmethod @deprecated( version="0.6.0", - reason="`from_fields` will be removed in version 0.7.0, " "use `from_columns` instead.", + reason="`from_fields` will be removed in version 0.7.0, use `from_columns` instead.", ) def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 """Make NestedDtype from a mapping of field names and list item types. @@ -304,7 +304,7 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListT @property @deprecated( - version="0.6.0", reason="`fields` will be removed in version 0.7.0, " "use `column_dtypes` instead." + version="0.6.0", reason="`fields` will be removed in version 0.7.0, use `column_dtypes` instead." ) def fields(self) -> dict[str, pa.DataType]: """The mapping of field names and their item types.""" @@ -363,7 +363,7 @@ def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype: @deprecated( version="0.6.0", - reason="`field_dtype` will be removed in version 0.7.0, " "use `_struct_list_pa_dtype` instead.", + reason="`field_dtype` will be removed in version 0.7.0, use `_struct_list_pa_dtype` instead.", ) def field_dtype(self, field: str) -> pd.ArrowDtype | Self: # type: ignore[name-defined] # noqa: F821 """Pandas dtype of a field, pd.ArrowDType or NestedDtype. @@ -404,7 +404,7 @@ def column_dtype(self, column: str) -> pd.ArrowDtype | Self: # type: ignore[nam @property @deprecated( version="0.6.0", - reason="`field_dtypes` will be removed in version 0.7.0, " "use `_struct_list_pa_dtype` instead.", + reason="`field_dtypes` will be removed in version 0.7.0, use `_struct_list_pa_dtype` instead.", ) def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]: # type: ignore[name-defined] # noqa: F821 """Pandas dtypes of this dtype's fields.""" diff --git a/src/nested_pandas/series/nestedseries.py b/src/nested_pandas/series/nestedseries.py index 4d9b5c21..439addf4 100644 --- a/src/nested_pandas/series/nestedseries.py +++ b/src/nested_pandas/series/nestedseries.py @@ -96,9 +96,7 @@ def __setitem__(self, key, value): return super().__setitem__(key, value) @nested_only - @deprecated( - version="0.6.0", reason="`to_flat` will be removed in version 0.7.0, " "use `explode` instead." - ) + @deprecated(version="0.6.0", reason="`to_flat` will be removed in version 0.7.0, use `explode` instead.") def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame: """Convert nested series into dataframe of flat arrays.