Skip to content

Commit ba0d23b

Browse files
authored
feat: support multi-column assignment for DataFrame (#2028)
* feat: support multi-column assignment for DataFrame
* fix lint
* fix mypy
* fix Sequence type checking bug
1 parent 2c72c56 commit ba0d23b

File tree

3 files changed

+131
-5
lines changed

3 files changed

+131
-5
lines changed

bigframes/dataframe.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import traceback
2727
import typing
2828
from typing import (
29+
Any,
2930
Callable,
3031
Dict,
3132
Hashable,
@@ -91,6 +92,7 @@
9192
import bigframes.session
9293

9394
SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable]
95+
# Value accepted when several columns are assigned at once: a DataFrame, or a
# sequence holding one scalar/callable per target column. The inner union is
# spelled with ``Union`` (like the neighbouring aliases) because this alias is
# evaluated at import time, where ``int | float`` requires Python 3.10+.
MultiItemValue = Union["DataFrame", Sequence[Union[int, float, str, Callable]]]
9496

9597
LevelType = typing.Hashable
9698
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
@@ -884,8 +886,13 @@ def __delitem__(self, key: str):
884886
df = self.drop(columns=[key])
885887
self._set_block(df._get_block())
886888

887-
def __setitem__(self, key: str, value: SingleItemValue):
888-
df = self._assign_single_item(key, value)
889+
def __setitem__(
    self, key: str | list[str], value: SingleItemValue | MultiItemValue
):
    # A list key targets several columns at once; any other key is handled
    # as a single column label.
    if not isinstance(key, list):
        updated = self._assign_single_item(key, value)
    else:
        updated = self._assign_multi_items(key, value)

    # Adopt the block of the newly built frame so the assignment is visible
    # on this object.
    new_block = updated._get_block()
    self._set_block(new_block)
890897

891898
__setitem__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__setitem__)
@@ -2212,7 +2219,7 @@ def assign(self, **kwargs) -> DataFrame:
22122219
def _assign_single_item(
22132220
self,
22142221
k: str,
2215-
v: SingleItemValue,
2222+
v: SingleItemValue | MultiItemValue,
22162223
) -> DataFrame:
22172224
if isinstance(v, bigframes.series.Series):
22182225
return self._assign_series_join_on_index(k, v)
@@ -2230,7 +2237,33 @@ def _assign_single_item(
22302237
elif utils.is_list_like(v):
22312238
return self._assign_single_item_listlike(k, v)
22322239
else:
2233-
return self._assign_scalar(k, v)
2240+
return self._assign_scalar(k, v) # type: ignore
2241+
2242+
def _assign_multi_items(
    self,
    k: list[str],
    v: SingleItemValue | MultiItemValue,
) -> DataFrame:
    """Assign values to several columns, one single-column assignment at a time.

    Args:
        k:
            Labels of the target columns, in assignment order.
        v:
            Source of the values. A DataFrame contributes its columns
            positionally. A sequence other than ``str``/``bytes``
            contributes one element per target column. Anything else is
            treated as a scalar and assigned to every target column.

    Returns:
        DataFrame: The frame obtained after applying every single-column
        assignment in order; ``self`` unchanged when ``k`` is empty.

    Raises:
        ValueError: If ``v`` is a Series, or if the number of value sources
            differs from the number of target columns.
    """
    value_sources: Sequence[Any]
    if isinstance(v, DataFrame):
        value_sources = [v[col] for col in v.columns]
    elif isinstance(v, bigframes.series.Series):
        # For behavior consistency with Pandas.
        raise ValueError("Columns must be same length as key")
    elif isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
        # str and bytes are Sequences too, but they are scalar values here:
        # splitting them per character would assign the wrong data (or raise
        # on a length mismatch) instead of broadcasting the value.
        value_sources = v
    else:
        # We assign the same scalar value to all target columns.
        value_sources = [v] * len(k)

    if len(value_sources) != len(k):
        raise ValueError("Columns must be same length as key")

    # Repeatedly assign columns in order. Starting from ``self`` keeps an
    # empty key list a no-op instead of failing on ``k[0]``.
    result = self
    for target, source in zip(k, value_sources):
        result = result._assign_single_item(target, source)
    return result
22342267

22352268
def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame:
22362269
given_rows = len(v)

tests/system/small/test_dataframe.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,67 @@ def test_assign_new_column_w_setitem_list_error(scalars_dfs):
11381138
bf_df["new_col"] = [1, 2, 3]
11391139

11401140

1141+
@pytest.mark.parametrize(
    ("key", "value"),
    [
        pytest.param(
            ["int64_col", "int64_too"],
            1,
            id="scalar_to_existing_column",
        ),
        pytest.param(
            ["int64_col", "int64_too"],
            [1, 2],
            id="sequence_to_existing_column",
        ),
        pytest.param(
            ["int64_col", "new_col"],
            [1, 2],
            id="sequence_to_partial_new_column",
        ),
        pytest.param(
            ["new_col", "new_col_too"],
            [1, 2],
            id="sequence_to_full_new_column",
        ),
    ],
)
def test_setitem_multicolumn_with_literals(scalars_dfs, key, value):
    """Multi-column item assignment of literal values matches pandas."""
    bf_source, pd_source = scalars_dfs
    actual = bf_source.copy()
    expected = pd_source.copy()

    # Apply the identical assignment to the frame under test, then to the
    # pandas reference frame.
    actual[key] = value
    expected[key] = value

    pd.testing.assert_frame_equal(expected, actual.to_pandas(), check_dtype=False)
1165+
1166+
1167+
def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs):
    """Assigning one literal to two target columns raises ValueError."""
    bf_source, _ = scalars_dfs
    frame = bf_source.copy()
    target_columns = ["int64_col", "int64_too"]
    too_few_values = [1]

    with pytest.raises(ValueError):
        frame[target_columns] = too_few_values
1173+
1174+
1175+
def test_setitem_multicolumn_with_dataframes(scalars_dfs):
    """Multi-column item assignment from a DataFrame matches pandas."""
    bf_source, pd_source = scalars_dfs
    actual = bf_source.copy()
    expected = pd_source.copy()

    # The right-hand side lists the columns in swapped order relative to the
    # assignment targets.
    bf_halved = actual[["int64_too", "int64_col"]] / 2
    actual[["int64_col", "int64_too"]] = bf_halved
    pd_halved = expected[["int64_too", "int64_col"]] / 2
    expected[["int64_col", "int64_too"]] = pd_halved

    pd.testing.assert_frame_equal(expected, actual.to_pandas(), check_dtype=False)
1184+
1185+
1186+
def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs):
    """Assigning a single Series to two target columns raises ValueError."""
    bf_source, _ = scalars_dfs
    frame = bf_source.copy()
    target_columns = ["int64_col", "int64_too"]

    with pytest.raises(ValueError):
        # The right-hand side is built inside the context, as a single
        # column selected by label and halved.
        frame[target_columns] = frame["int64_col"] / 2
1192+
1193+
1194+
def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs):
    """Assigning a two-column DataFrame to one target column raises ValueError."""
    bf_source, _ = scalars_dfs
    frame = bf_source.copy()
    single_target = ["int64_col"]

    with pytest.raises(ValueError):
        # The right-hand side is built inside the context and has two
        # columns for the one target.
        frame[single_target] = frame[["int64_col", "int64_too"]] / 2
1200+
1201+
11411202
def test_assign_existing_column(scalars_dfs):
11421203
scalars_df, scalars_pandas_df = scalars_dfs
11431204
kwargs = {"int64_col": 2}

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7626,11 +7626,43 @@ def __setitem__(self, key, value):
76267626
<BLANKLINE>
76277627
[3 rows x 5 columns]
76287628
7629+
You can assign a scalar to multiple columns:
7630+
7631+
>>> df[["age", "new_age"]] = 25
7632+
>>> df
7633+
name age location country new_age
7634+
0 alpha 25 WA USA 25
7635+
1 beta 25 NY USA 25
7636+
2 gamma 25 CA USA 25
7637+
<BLANKLINE>
7638+
[3 rows x 5 columns]
7639+
7640+
You can use a sequence of scalars for assignment of multiple columns:
7641+
7642+
>>> df[["age", "is_happy"]] = [20, True]
7643+
>>> df
7644+
name age location country new_age is_happy
7645+
0 alpha 20 WA USA 25 True
7646+
1 beta 20 NY USA 25 True
7647+
2 gamma 20 CA USA 25 True
7648+
<BLANKLINE>
7649+
[3 rows x 6 columns]
7650+
7651+
You can use a dataframe for assignment of multiple columns:
7652+
>>> df[["age", "new_age"]] = df[["new_age", "age"]]
7653+
>>> df
7654+
name age location country new_age is_happy
7655+
0 alpha 25 WA USA 20 True
7656+
1 beta 25 NY USA 20 True
7657+
2 gamma 25 CA USA 20 True
7658+
<BLANKLINE>
7659+
[3 rows x 6 columns]
7660+
76297661
Args:
76307662
key (column index):
76317663
It can be a new column to be inserted, or an existing column to
76327664
be modified. A list of column labels assigns to several columns at once.
7633-
value (scalar or Series):
7665+
value (scalar, Sequence, DataFrame, or Series):
76347666
Value to be assigned to the column or columns.
76357667
"""
76367668
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)