Skip to content

Commit 91fcf17

Browse files
authored
Merge branch 'main' into callable-series-mask
2 parents f7f6a59 + d442f41 commit 91fcf17

File tree

16 files changed

+511
-18
lines changed

16 files changed

+511
-18
lines changed

bigframes/core/blocks.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -387,12 +387,21 @@ def reversed(self) -> Block:
387387
index_labels=self.index.names,
388388
)
389389

390-
def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
390+
def reset_index(
391+
self,
392+
level: LevelsType = None,
393+
drop: bool = True,
394+
*,
395+
col_level: Union[str, int] = 0,
396+
col_fill: typing.Hashable = "",
397+
allow_duplicates: bool = False,
398+
) -> Block:
391399
"""Reset the index of the block, promoting the old index to a value column.
392400
393401
Arguments:
394402
level: the label or index level of the index levels to remove.
395403
name: this is the column id for the new value id derived from the old index
404+
allow_duplicates:
396405
397406
Returns:
398407
A new Block because dropping index columns can break references
@@ -438,6 +447,11 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
438447
)
439448
else:
440449
# Add index names to column index
450+
col_level_n = (
451+
col_level
452+
if isinstance(col_level, int)
453+
else self.column_labels.names.index(col_level)
454+
)
441455
column_labels_modified = self.column_labels
442456
for position, level_id in enumerate(level_ids):
443457
label = self.col_id_to_index_name[level_id]
@@ -447,11 +461,15 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
447461
else:
448462
label = f"level_{self.index_columns.index(level_id)}"
449463

450-
if label in self.column_labels:
464+
if (not allow_duplicates) and (label in self.column_labels):
451465
raise ValueError(f"cannot insert {label}, already exists")
466+
452467
if isinstance(self.column_labels, pd.MultiIndex):
453468
nlevels = self.column_labels.nlevels
454-
label = tuple(label if i == 0 else "" for i in range(nlevels))
469+
label = tuple(
470+
label if i == col_level_n else col_fill for i in range(nlevels)
471+
)
472+
455473
# Create index copy with label inserted
456474
# See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html
457475
column_labels_modified = column_labels_modified.insert(position, label)

bigframes/core/compile/sqlglot/expressions/binary_compiler.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,14 +140,37 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
140140

141141
@BINARY_OP_REGISTRATION.register(ops.ge_op)
142142
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
143-
return sge.GTE(this=left.expr, expression=right.expr)
143+
left_expr = _coerce_bool_to_int(left)
144+
right_expr = _coerce_bool_to_int(right)
145+
return sge.GTE(this=left_expr, expression=right_expr)
146+
147+
148+
@BINARY_OP_REGISTRATION.register(ops.gt_op)
149+
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
150+
left_expr = _coerce_bool_to_int(left)
151+
right_expr = _coerce_bool_to_int(right)
152+
return sge.GT(this=left_expr, expression=right_expr)
144153

145154

146155
@BINARY_OP_REGISTRATION.register(ops.JSONSet)
147156
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
148157
return sge.func("JSON_SET", left.expr, sge.convert(op.json_path), right.expr)
149158

150159

160+
@BINARY_OP_REGISTRATION.register(ops.lt_op)
161+
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
162+
left_expr = _coerce_bool_to_int(left)
163+
right_expr = _coerce_bool_to_int(right)
164+
return sge.LT(this=left_expr, expression=right_expr)
165+
166+
167+
@BINARY_OP_REGISTRATION.register(ops.le_op)
168+
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
169+
left_expr = _coerce_bool_to_int(left)
170+
right_expr = _coerce_bool_to_int(right)
171+
return sge.LTE(this=left_expr, expression=right_expr)
172+
173+
151174
@BINARY_OP_REGISTRATION.register(ops.mul_op)
152175
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
153176
left_expr = _coerce_bool_to_int(left)
@@ -170,6 +193,11 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
170193
return sge.NEQ(this=left_expr, expression=right_expr)
171194

172195

196+
@BINARY_OP_REGISTRATION.register(ops.obj_make_ref_op)
197+
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
198+
return sge.func("OBJ.MAKE_REF", left.expr, right.expr)
199+
200+
173201
@BINARY_OP_REGISTRATION.register(ops.sub_op)
174202
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
175203
if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype):
@@ -202,11 +230,6 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
202230
)
203231

204232

205-
@BINARY_OP_REGISTRATION.register(ops.obj_make_ref_op)
206-
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
207-
return sge.func("OBJ.MAKE_REF", left.expr, right.expr)
208-
209-
210233
def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression:
211234
"""Coerce boolean expression to integer."""
212235
if typed_expr.dtype == dtypes.BOOL_DTYPE:

bigframes/dataframe.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2321,6 +2321,10 @@ def reset_index(
23212321
level: blocks.LevelsType = ...,
23222322
drop: bool = ...,
23232323
inplace: Literal[False] = ...,
2324+
col_level: Union[int, str] = ...,
2325+
col_fill: Hashable = ...,
2326+
allow_duplicates: Optional[bool] = ...,
2327+
names: Union[None, Hashable, Sequence[Hashable]] = ...,
23242328
) -> DataFrame:
23252329
...
23262330

@@ -2330,19 +2334,56 @@ def reset_index(
23302334
level: blocks.LevelsType = ...,
23312335
drop: bool = ...,
23322336
inplace: Literal[True] = ...,
2337+
col_level: Union[int, str] = ...,
2338+
col_fill: Hashable = ...,
2339+
allow_duplicates: Optional[bool] = ...,
2340+
names: Union[None, Hashable, Sequence[Hashable]] = ...,
23332341
) -> None:
23342342
...
23352343

23362344
@overload
23372345
def reset_index(
2338-
self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = ...
2346+
self,
2347+
level: blocks.LevelsType = None,
2348+
drop: bool = False,
2349+
inplace: bool = ...,
2350+
col_level: Union[int, str] = ...,
2351+
col_fill: Hashable = ...,
2352+
allow_duplicates: Optional[bool] = ...,
2353+
names: Union[None, Hashable, Sequence[Hashable]] = ...,
23392354
) -> Optional[DataFrame]:
23402355
...
23412356

23422357
def reset_index(
2343-
self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = False
2358+
self,
2359+
level: blocks.LevelsType = None,
2360+
drop: bool = False,
2361+
inplace: bool = False,
2362+
col_level: Union[int, str] = 0,
2363+
col_fill: Hashable = "",
2364+
allow_duplicates: Optional[bool] = None,
2365+
names: Union[None, Hashable, Sequence[Hashable]] = None,
23442366
) -> Optional[DataFrame]:
2345-
block = self._block.reset_index(level, drop)
2367+
block = self._block
2368+
if names:
2369+
if isinstance(names, blocks.Label) and not isinstance(names, tuple):
2370+
names = [names]
2371+
else:
2372+
names = list(names)
2373+
2374+
if len(names) != self.index.nlevels:
2375+
raise ValueError("'names' must be same length as levels")
2376+
2377+
block = block.with_index_labels(names)
2378+
if allow_duplicates is None:
2379+
allow_duplicates = False
2380+
block = block.reset_index(
2381+
level,
2382+
drop,
2383+
col_level=col_level,
2384+
col_fill=col_fill,
2385+
allow_duplicates=allow_duplicates,
2386+
)
23462387
if inplace:
23472388
self._set_block(block)
23482389
return None

bigframes/series.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ def reset_index(
414414
name: typing.Optional[str] = ...,
415415
drop: Literal[False] = ...,
416416
inplace: Literal[False] = ...,
417+
allow_duplicates: Optional[bool] = ...,
417418
) -> bigframes.dataframe.DataFrame:
418419
...
419420

@@ -425,6 +426,7 @@ def reset_index(
425426
name: typing.Optional[str] = ...,
426427
drop: Literal[True] = ...,
427428
inplace: Literal[False] = ...,
429+
allow_duplicates: Optional[bool] = ...,
428430
) -> Series:
429431
...
430432

@@ -436,6 +438,7 @@ def reset_index(
436438
name: typing.Optional[str] = ...,
437439
drop: bool = ...,
438440
inplace: Literal[True] = ...,
441+
allow_duplicates: Optional[bool] = ...,
439442
) -> None:
440443
...
441444

@@ -447,8 +450,11 @@ def reset_index(
447450
name: typing.Optional[str] = None,
448451
drop: bool = False,
449452
inplace: bool = False,
453+
allow_duplicates: Optional[bool] = None,
450454
) -> bigframes.dataframe.DataFrame | Series | None:
451-
block = self._block.reset_index(level, drop)
455+
if allow_duplicates is None:
456+
allow_duplicates = False
457+
block = self._block.reset_index(level, drop, allow_duplicates=allow_duplicates)
452458
if drop:
453459
if inplace:
454460
self._set_block(block)

tests/system/small/engines/test_comparison_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def apply_op_pairwise(
4848
return new_arr
4949

5050

51-
@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
51+
@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True)
5252
@pytest.mark.parametrize(
5353
"op",
5454
[

tests/system/small/test_dataframe.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2085,6 +2085,32 @@ def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop):
20852085
pandas.testing.assert_frame_equal(bf_result, pd_result)
20862086

20872087

2088+
def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index):
2089+
scalars_df_index = scalars_df_index.copy()
2090+
scalars_df_index.index.name = "int64_col"
2091+
df = scalars_df_index.reset_index(allow_duplicates=True, drop=False)
2092+
assert df.index.name is None
2093+
2094+
bf_result = df.to_pandas()
2095+
2096+
scalars_pandas_df_index = scalars_pandas_df_index.copy()
2097+
scalars_pandas_df_index.index.name = "int64_col"
2098+
pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False)
2099+
2100+
# Pandas uses int64 instead of Int64 (nullable) dtype.
2101+
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
2102+
2103+
# reset_index should maintain the original ordering.
2104+
pandas.testing.assert_frame_equal(bf_result, pd_result)
2105+
2106+
2107+
def test_reset_index_duplicates_error(scalars_df_index):
2108+
scalars_df_index = scalars_df_index.copy()
2109+
scalars_df_index.index.name = "int64_col"
2110+
with pytest.raises(ValueError):
2111+
scalars_df_index.reset_index(allow_duplicates=False, drop=False)
2112+
2113+
20882114
@pytest.mark.parametrize(
20892115
("drop",),
20902116
((True,), (False,)),

tests/system/small/test_multiindex.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -929,16 +929,30 @@ def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index):
929929
pandas.testing.assert_frame_equal(bf_result, pd_result)
930930

931931

932-
def test_column_multi_index_reset_index(scalars_df_index, scalars_pandas_df_index):
932+
@pytest.mark.parametrize(
933+
("names", "col_fill", "col_level"),
934+
[
935+
(None, "", "l2"),
936+
(("new_name"), "fill", 1),
937+
("new_name", "fill", 0),
938+
],
939+
)
940+
def test_column_multi_index_reset_index(
941+
scalars_df_index, scalars_pandas_df_index, names, col_fill, col_level
942+
):
933943
columns = ["int64_too", "int64_col", "float64_col"]
934-
multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
944+
multi_columns = pandas.MultiIndex.from_tuples(
945+
zip(["a", "b", "a"], ["a", "b", "b"]), names=["l1", "l2"]
946+
)
935947
bf_df = scalars_df_index[columns].copy()
936948
bf_df.columns = multi_columns
937949
pd_df = scalars_pandas_df_index[columns].copy()
938950
pd_df.columns = multi_columns
939951

940-
bf_result = bf_df.reset_index().to_pandas()
941-
pd_result = pd_df.reset_index()
952+
bf_result = bf_df.reset_index(
953+
names=names, col_fill=col_fill, col_level=col_level
954+
).to_pandas()
955+
pd_result = pd_df.reset_index(names=names, col_fill=col_fill, col_level=col_level)
942956

943957
# Pandas uses int64 instead of Int64 (nullable) dtype.
944958
pd_result.index = pd_result.index.astype(pandas.Int64Dtype())

tests/system/small/test_series.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1339,6 +1339,32 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index):
13391339
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
13401340

13411341

1342+
def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index):
1343+
bf_series = scalars_df_index["int64_col"].copy()
1344+
bf_series.index.name = "int64_col"
1345+
df = bf_series.reset_index(allow_duplicates=True, drop=False)
1346+
assert df.index.name is None
1347+
1348+
bf_result = df.to_pandas()
1349+
1350+
pd_series = scalars_pandas_df_index["int64_col"].copy()
1351+
pd_series.index.name = "int64_col"
1352+
pd_result = pd_series.reset_index(allow_duplicates=True, drop=False)
1353+
1354+
# Pandas uses int64 instead of Int64 (nullable) dtype.
1355+
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
1356+
1357+
# reset_index should maintain the original ordering.
1358+
pd.testing.assert_frame_equal(bf_result, pd_result)
1359+
1360+
1361+
def test_series_reset_index_duplicates_error(scalars_df_index):
1362+
scalars_df_index = scalars_df_index["int64_col"].copy()
1363+
scalars_df_index.index.name = "int64_col"
1364+
with pytest.raises(ValueError):
1365+
scalars_df_index.reset_index(allow_duplicates=False, drop=False)
1366+
1367+
13421368
def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index):
13431369
bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"]
13441370
bf_result.reset_index(drop=True, inplace=True)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`bool_col` AS `bfcol_0`,
4+
`int64_col` AS `bfcol_1`,
5+
`rowindex` AS `bfcol_2`
6+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
7+
), `bfcte_1` AS (
8+
SELECT
9+
*,
10+
`bfcol_2` AS `bfcol_6`,
11+
`bfcol_1` AS `bfcol_7`,
12+
`bfcol_0` AS `bfcol_8`,
13+
`bfcol_1` >= `bfcol_1` AS `bfcol_9`
14+
FROM `bfcte_0`
15+
), `bfcte_2` AS (
16+
SELECT
17+
*,
18+
`bfcol_6` AS `bfcol_14`,
19+
`bfcol_7` AS `bfcol_15`,
20+
`bfcol_8` AS `bfcol_16`,
21+
`bfcol_9` AS `bfcol_17`,
22+
`bfcol_7` >= 1 AS `bfcol_18`
23+
FROM `bfcte_1`
24+
), `bfcte_3` AS (
25+
SELECT
26+
*,
27+
`bfcol_14` AS `bfcol_24`,
28+
`bfcol_15` AS `bfcol_25`,
29+
`bfcol_16` AS `bfcol_26`,
30+
`bfcol_17` AS `bfcol_27`,
31+
`bfcol_18` AS `bfcol_28`,
32+
`bfcol_15` >= CAST(`bfcol_16` AS INT64) AS `bfcol_29`
33+
FROM `bfcte_2`
34+
), `bfcte_4` AS (
35+
SELECT
36+
*,
37+
`bfcol_24` AS `bfcol_36`,
38+
`bfcol_25` AS `bfcol_37`,
39+
`bfcol_26` AS `bfcol_38`,
40+
`bfcol_27` AS `bfcol_39`,
41+
`bfcol_28` AS `bfcol_40`,
42+
`bfcol_29` AS `bfcol_41`,
43+
CAST(`bfcol_26` AS INT64) >= `bfcol_25` AS `bfcol_42`
44+
FROM `bfcte_3`
45+
)
46+
SELECT
47+
`bfcol_36` AS `rowindex`,
48+
`bfcol_37` AS `int64_col`,
49+
`bfcol_38` AS `bool_col`,
50+
`bfcol_39` AS `int_ge_int`,
51+
`bfcol_40` AS `int_ge_1`,
52+
`bfcol_41` AS `int_ge_bool`,
53+
`bfcol_42` AS `bool_ge_int`
54+
FROM `bfcte_4`

0 commit comments

Comments
 (0)