Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def get_subcolumns(self, nested_columns="all") -> list[str]:
return subcols

@deprecated(
version="0.6.0", reason="`add_nested` will be removed in version 0.7.0, " "use `join_nested` instead."
version="0.6.0", reason="`add_nested` will be removed in version 0.7.0, use `join_nested` instead."
)
def add_nested(
self,
Expand Down Expand Up @@ -1828,9 +1828,7 @@ def sort_values(
return None
return new_df

@deprecated(
version="0.6.0", reason="`reduce` will be removed in version 0.7.0, " "use `map_rows` instead."
)
@deprecated(version="0.6.0", reason="`reduce` will be removed in version 0.7.0, use `map_rows` instead.")
def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs) -> NestedFrame: # type: ignore[override]
"""
Takes a function and applies it to each top-level row of the NestedFrame.
Expand Down
75 changes: 52 additions & 23 deletions src/nested_pandas/nestedframe/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,29 +271,58 @@ def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **k


def _validate_structs_from_schema(data, columns=None, filesystem=None):
"""Validate that nested columns are structs"""
if columns is not None:
schema = pq.read_schema(data, filesystem=filesystem)
for col in columns:
# check if column is a partial load of a nested structure
if "." in col:
# first check if column exists as a top-level column
if col in schema.names:
continue
# if not, inspect the base column name type
else:
if col.split(".")[0] in schema.names:
# check if the column is a list-struct
col_type = schema.field(col.split(".")[0]).type
if not pa.types.is_struct(col_type):
base_col = col.split(".")[0]
raise ValueError(
f"The provided column '{col}' signals to partially load a nested structure, "
f"but the nested structure '{base_col}' is not a struct. "
"Partial loading of nested structures is only supported for struct of list "
f"columns. To resolve this, fully load the column '{base_col}' "
f"instead of partially loading it and perform column selection afterwards."
)
"""Validate that columns specified for partial loading are valid struct types.

This function validates that when attempting to partially load nested columns
from a Parquet file (e.g., loading "nested.a" instead of the full "nested"
column), the base column being partially loaded is actually a struct type.
Partial loading of nested structures is only supported for struct of list
columns, not for list of struct columns.

Parameters
----------
data : str, Path, UPath, or file-like object
Path to the parquet file or file-like object to inspect.
columns : list of str, optional
List of column names to validate. If None, no validation is performed.
Columns containing "." are checked to ensure their base column name
(the part before the ".") refers to a struct type in the schema.
filesystem : pyarrow.fs.FileSystem, optional
PyArrow filesystem object to use when reading the schema. If None,
the default filesystem for the given path is used.

Raises
------
ValueError
If a column in the partial load format (e.g., "nested.a") is specified
but the base column ("nested") is not a struct type. This indicates
the data structure doesn't support partial loading as nested-pandas
requires struct of list columns, not list of struct columns.
"""
if columns is None:
return
schema = pq.read_schema(data, filesystem=filesystem)
for col in columns:
# check if column is a partial load of a nested structure
if "." not in col:
continue
# check if column exists as a top-level column
if col in schema.names:
continue
# if not, inspect the base column name type
base_col = col.split(".")[0]
if base_col not in schema.names:
continue
# check if the base column is a list-struct
col_type = schema.field(base_col).type
if not pa.types.is_struct(col_type):
raise ValueError(
f"The provided column '{col}' signals to partially load a nested structure, "
f"but the nested structure '{base_col}' is not a struct. "
"Partial loading of nested structures is only supported for struct of list "
f"columns. To resolve this, fully load the column '{base_col}' "
f"instead of partially loading it and perform column selection afterwards."
)


def _is_local_dir(upath: UPath, is_dir: bool | None) -> bool:
Expand Down
24 changes: 10 additions & 14 deletions src/nested_pandas/series/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,7 @@ def flat_length(self) -> int:
return self._series.array.flat_length

@property
@deprecated(
version="0.6.0", reason="`fields` will be removed in version 0.7.0, " "use `columns` instead."
)
@deprecated(version="0.6.0", reason="`fields` will be removed in version 0.7.0, use `columns` instead.")
def fields(self) -> list[str]:
"""Names of the nested columns"""
return self.columns
Expand All @@ -171,7 +169,7 @@ def flat_index(self) -> pd.Index:
return flat_index

@deprecated(
version="0.6.0", reason="`with_field` will be removed in version 0.7.0, " "use `set_column` instead."
version="0.6.0", reason="`with_field` will be removed in version 0.7.0, use `set_column` instead."
)
def with_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field from flat-array of values and return a new series
Expand Down Expand Up @@ -241,7 +239,7 @@ def set_column(self, column: str, value: ArrayLike) -> NestedSeries:

@deprecated(
version="0.6.0",
reason="`with_flat_field` will be removed in version 0.7.0, " "use `set_flat_column` instead.",
reason="`with_flat_field` will be removed in version 0.7.0, use `set_flat_column` instead.",
)
def with_flat_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field from flat-array of values and return a new series
Expand Down Expand Up @@ -311,7 +309,7 @@ def set_flat_column(self, column: str, value: ArrayLike) -> NestedSeries:

@deprecated(
version="0.6.0",
reason="`with_list_field` will be removed in version 0.7.0, " "use `set_list_column` instead.",
reason="`with_list_field` will be removed in version 0.7.0, use `set_list_column` instead.",
)
def with_list_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field from list-array of values and return a new series
Expand Down Expand Up @@ -385,7 +383,7 @@ def set_list_column(self, column: str, value: ArrayLike) -> NestedSeries:

@deprecated(
version="0.6.0",
reason="`with_filled_field` will be removed in version 0.7.0, " "use `set_filled_column` instead.",
reason="`with_filled_field` will be removed in version 0.7.0, use `set_filled_column` instead.",
)
def with_filled_field(self, field: str, value: ArrayLike) -> NestedSeries:
"""Set the field by repeating values and return a new series
Expand Down Expand Up @@ -464,7 +462,7 @@ def set_filled_column(self, column: str, value: ArrayLike) -> NestedSeries:
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)

@deprecated(
version="0.6.0", reason="`without_field` will be removed in version 0.7.0, " "use `drop` instead."
version="0.6.0", reason="`without_field` will be removed in version 0.7.0, use `drop` instead."
)
def without_field(self, field: str | list[str]) -> NestedSeries:
"""Remove the field(s) from the series and return a new series
Expand Down Expand Up @@ -533,9 +531,7 @@ def drop(self, column: str | list[str]) -> NestedSeries:
new_array.pop_fields(column)
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)

@deprecated(
version="0.6.0", reason="`query_flat` will be removed in version 0.7.0, " "use `query` instead."
)
@deprecated(version="0.6.0", reason="`query_flat` will be removed in version 0.7.0, use `query` instead.")
def query_flat(self, query: str) -> NestedSeries:
"""Query the flat arrays with a boolean expression

Expand Down Expand Up @@ -610,7 +606,7 @@ def query(self, query: str) -> NestedSeries:

@deprecated(
version="0.6.0",
reason="`get_flat_index` will be removed in version 0.7.0, " "use the `flat_index` property instead.",
reason="`get_flat_index` will be removed in version 0.7.0, use the `flat_index` property instead.",
)
def get_flat_index(self) -> pd.Index:
"""Index of the flat arrays
Expand All @@ -635,7 +631,7 @@ def get_flat_index(self) -> pd.Index:

@deprecated(
version="0.6.0",
reason="`get_flat_series` will be removed in version 0.7.0, " "use `to_flat()[column]` instead.",
reason="`get_flat_series` will be removed in version 0.7.0, use `to_flat()[column]` instead.",
)
def get_flat_series(self, field: str) -> pd.Series:
"""Get the flat-array field as a pd.Series
Expand Down Expand Up @@ -692,7 +688,7 @@ def get_flat_series(self, field: str) -> pd.Series:

@deprecated(
version="0.6.0",
reason="`get_list_series` will be removed in version 0.7.0, " "use `to_lists()[column]` instead.",
reason="`get_list_series` will be removed in version 0.7.0, use `to_lists()[column]` instead.",
)
def get_list_series(self, field: str) -> pd.Series:
"""Get the list-array field as a Series
Expand Down
8 changes: 4 additions & 4 deletions src/nested_pandas/series/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def _struct_list_pa_dtype(self) -> pa.StructType:
@classmethod
@deprecated(
version="0.6.0",
reason="`from_fields` will be removed in version 0.7.0, " "use `from_columns` instead.",
reason="`from_fields` will be removed in version 0.7.0, use `from_columns` instead.",
)
def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821
"""Make NestedDtype from a mapping of field names and list item types.
Expand Down Expand Up @@ -304,7 +304,7 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.ListT

@property
@deprecated(
version="0.6.0", reason="`fields` will be removed in version 0.7.0, " "use `column_dtypes` instead."
version="0.6.0", reason="`fields` will be removed in version 0.7.0, use `column_dtypes` instead."
)
def fields(self) -> dict[str, pa.DataType]:
"""The mapping of field names and their item types."""
Expand Down Expand Up @@ -363,7 +363,7 @@ def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype:

@deprecated(
version="0.6.0",
reason="`field_dtype` will be removed in version 0.7.0, " "use `_struct_list_pa_dtype` instead.",
reason="`field_dtype` will be removed in version 0.7.0, use `_struct_list_pa_dtype` instead.",
)
def field_dtype(self, field: str) -> pd.ArrowDtype | Self: # type: ignore[name-defined] # noqa: F821
"""Pandas dtype of a field, pd.ArrowDType or NestedDtype.
Expand Down Expand Up @@ -404,7 +404,7 @@ def column_dtype(self, column: str) -> pd.ArrowDtype | Self: # type: ignore[nam
@property
@deprecated(
version="0.6.0",
reason="`field_dtypes` will be removed in version 0.7.0, " "use `_struct_list_pa_dtype` instead.",
reason="`field_dtypes` will be removed in version 0.7.0, use `_struct_list_pa_dtype` instead.",
)
def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]: # type: ignore[name-defined] # noqa: F821
"""Pandas dtypes of this dtype's fields."""
Expand Down
4 changes: 1 addition & 3 deletions src/nested_pandas/series/nestedseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,7 @@ def __setitem__(self, key, value):
return super().__setitem__(key, value)

@nested_only
@deprecated(
version="0.6.0", reason="`to_flat` will be removed in version 0.7.0, " "use `explode` instead."
)
@deprecated(version="0.6.0", reason="`to_flat` will be removed in version 0.7.0, use `explode` instead.")
def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
"""Convert nested series into dataframe of flat arrays.

Expand Down