Skip to content

Commit d638f7c

Browse files
authored
refactor: consolidate reshaping functions under the reshape package (#1194)
* consolidate reshaping functions under the reshape package * format code * fix import * fix import * lint code
1 parent 9dcf1aa commit d638f7c

File tree

9 files changed

+260
-332
lines changed

9 files changed

+260
-332
lines changed

bigframes/core/reshape/__init__.py

Lines changed: 0 additions & 187 deletions
Original file line numberDiff line numberDiff line change
@@ -11,190 +11,3 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
from __future__ import annotations
15-
16-
import typing
17-
from typing import Iterable, Literal, Optional, Union
18-
19-
import bigframes_vendored.constants as constants
20-
import pandas as pd
21-
22-
import bigframes.core.expression as ex
23-
import bigframes.core.ordering as order
24-
import bigframes.core.utils as utils
25-
import bigframes.core.window_spec as window_specs
26-
import bigframes.dataframe
27-
import bigframes.operations as ops
28-
import bigframes.operations.aggregations as agg_ops
29-
import bigframes.series
30-
31-
32-
@typing.overload
def concat(
    objs: Iterable[bigframes.series.Series],
    *,
    axis: typing.Literal["index", 0] = ...,
    join=...,
    ignore_index=...,
) -> bigframes.series.Series:
    ...


@typing.overload
def concat(
    objs: Iterable[bigframes.dataframe.DataFrame],
    *,
    axis: typing.Literal["index", 0] = ...,
    join=...,
    ignore_index=...,
) -> bigframes.dataframe.DataFrame:
    ...


@typing.overload
def concat(
    objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]],
    *,
    axis: typing.Literal["columns", 1],
    join=...,
    ignore_index=...,
) -> bigframes.dataframe.DataFrame:
    ...


@typing.overload
def concat(
    objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]],
    *,
    axis=...,
    join=...,
    ignore_index=...,
) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]:
    ...


def concat(
    objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]],
    *,
    axis: typing.Union[str, int] = 0,
    join: Literal["inner", "outer"] = "outer",
    ignore_index: bool = False,
) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]:
    """Concatenate Series and/or DataFrame objects along an axis.

    Args:
        objs: The objects to combine. Any iterable is accepted, including
            one-shot iterators such as generators.
        axis: Axis to concatenate along; resolved with
            ``utils.get_axis_number``.
        join: How the blocks are combined; forwarded as ``how=``.
        ignore_index: Forwarded to the block-level concat on the axis-0 path.
            It is not consulted on the other path.

    Returns:
        On axis 0, a Series when none of the inputs is a DataFrame, otherwise
        a DataFrame. On the other axis, always a DataFrame.

    Raises:
        ValueError: If ``objs`` yields no items.
    """
    axis_n = utils.get_axis_number(axis)

    # `objs` is only promised to be an Iterable but is walked several times
    # below. Materialize it once so a generator is not exhausted by the first
    # pass (which previously surfaced as an IndexError on `blocks[0]`).
    items = list(objs)
    if not items:
        # Same error pandas raises for an empty input.
        raise ValueError("No objects to concatenate")

    if axis_n == 0:
        contains_dataframes = any(
            isinstance(x, bigframes.dataframe.DataFrame) for x in items
        )
        if not contains_dataframes:
            # Special case, all series, so align everything into single column
            # even if labels don't match.
            series = typing.cast(typing.List[bigframes.series.Series], items)
            names = {s.name for s in series}
            # For series case, labels are stripped if they don't all match.
            if len(names) > 1:
                blocks = [s._block.with_column_labels([None]) for s in series]
            else:
                blocks = [s._block for s in series]
            block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index)
            return bigframes.series.Series(block)
        blocks = [obj._block for obj in items]
        block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index)
        return bigframes.dataframe.DataFrame(block)
    else:
        # Note: does not validate inputs.
        block_list = [obj._block for obj in items]
        block = block_list[0]
        # Fold the remaining blocks in, left to right.
        for rblock in block_list[1:]:
            block, _ = block.join(rblock, how=join)
        return bigframes.dataframe.DataFrame(block)
109-
110-
111-
def cut(
    x: bigframes.series.Series,
    bins: Union[
        int,
        pd.IntervalIndex,
        Iterable,
    ],
    *,
    labels: Union[Iterable[str], bool, None] = None,
) -> bigframes.series.Series:
    """Bin the values of ``x`` into intervals.

    Args:
        x: The series whose values are binned.
        bins: One of:
            * a positive ``int``, passed through to ``CutOp`` unchanged;
            * a ``pd.IntervalIndex``;
            * an iterable of ``(left, right)`` tuples;
            * an iterable of at least two numeric break points.
            Iterables (other than ``IntervalIndex``) may be one-shot iterators.
        labels: Only ``None`` and ``False`` are accepted.

    Returns:
        The result of applying ``CutOp`` to ``x`` over an unbound window.

    Raises:
        ValueError: If ``bins`` is a non-positive int, an empty iterable, a
            numeric iterable with fewer than two items, an iterable whose
            first item is neither a tuple nor a number, or describes
            overlapping intervals.
        NotImplementedError: If ``labels`` is neither ``None`` nor ``False``.
    """
    if isinstance(bins, int) and bins <= 0:
        raise ValueError("`bins` should be a positive integer.")

    if isinstance(bins, Iterable):
        if isinstance(bins, pd.IntervalIndex):
            as_index: pd.IntervalIndex = bins
            bins = tuple(
                (interval.left.item(), interval.right.item()) for interval in bins
            )
        else:
            # Materialize exactly once. Calling list(bins) repeatedly drains a
            # one-shot iterator on the first call, so later calls would see an
            # empty sequence and fail with IndexError.
            bins_list = list(bins)
            if not bins_list:
                raise ValueError("`bins` iterable should have at least one item")

            # The first element decides how the whole iterable is interpreted.
            first = bins_list[0]
            if isinstance(first, tuple):
                as_index = pd.IntervalIndex.from_tuples(bins_list)
                bins = tuple(bins_list)
            elif pd.api.types.is_number(first):
                if len(bins_list) < 2:
                    raise ValueError(
                        "`bins` iterable of numeric breaks should have"
                        " at least two items"
                    )
                as_index = pd.IntervalIndex.from_breaks(bins_list)
                # Keep the breaks' own type when they all share it; otherwise
                # coerce everything to float.
                single_type = all(isinstance(n, type(first)) for n in bins_list)
                numeric_type = type(first) if single_type else float
                # Adjacent break points become (left, right) pairs.
                bins = tuple(
                    (numeric_type(left), numeric_type(right))
                    for left, right in zip(bins_list, bins_list[1:])
                )
            else:
                raise ValueError("`bins` iterable should contain tuples or numerics")

        if as_index.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    if labels is not None and labels is not False:
        raise NotImplementedError(
            "The 'labels' parameter must be either False or None. "
            "Please provide a valid value for 'labels'."
        )

    return x._apply_window_op(
        agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound()
    )
164-
165-
166-
def qcut(
    x: bigframes.series.Series,
    q: typing.Union[int, typing.Sequence[float]],
    *,
    labels: Optional[bool] = None,
    duplicates: typing.Literal["drop", "error"] = "error",
) -> bigframes.series.Series:
    """Quantile-based binning of ``x``.

    Args:
        x: The series to bin.
        q: A positive integer, or a list-like that is converted to a tuple
            before being handed to ``QcutOp``.
        labels: Must be ``False``; any other value is rejected.
        duplicates: Must be ``"drop"``; any other value is rejected.

    Returns:
        A new Series holding the ``QcutOp`` result under the original column
        label.

    Raises:
        ValueError: If ``q`` is a non-positive integer.
        NotImplementedError: If ``labels`` is not ``False`` or ``duplicates``
            is not ``"drop"``.
    """
    if isinstance(q, int) and q <= 0:
        raise ValueError("`q` should be a positive integer.")
    if utils.is_list_like(q):
        q = tuple(q)

    if labels is not False:
        raise NotImplementedError(
            f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )
    if duplicates != "drop":
        raise NotImplementedError(
            f"Only duplicates='drop' is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
        )

    value_col = x._value_column
    source_block = x._block
    original_label = source_block.col_id_to_label[value_col]

    # Flag column from notnull_op; it is used both as the window grouping key
    # and as the condition of the final where_op.
    flagged_block, notnull_id = source_block.apply_unary_op(value_col, ops.notnull_op)

    quantile_op = agg_ops.QcutOp(q)  # type: ignore
    window = window_specs.unbound(
        grouping_keys=(notnull_id,),
        ordering=(order.ascending_over(value_col),),
    )
    binned_block, bucket_id = flagged_block.apply_window_op(
        value_col,
        quantile_op,
        window_spec=window,
    )

    # NOTE(review): presumably this keeps the bucket where the flag is true
    # and yields NULL elsewhere -- confirm against where_op's argument order.
    masked_expr = ops.where_op.as_expr(bucket_id, notnull_id, ex.const(None))
    final_block, final_id = binned_block.project_expr(masked_expr, label=original_label)
    return bigframes.series.Series(final_block.select_column(final_id))
Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023 Google LLC
1+
# Copyright 2024 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -12,10 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""Helpers to join ArrayValue objects."""
15+
from bigframes.core.reshape.concat import concat
16+
from bigframes.core.reshape.merge import merge
17+
from bigframes.core.reshape.tile import cut, qcut
1618

17-
from bigframes.core.joins.merge import merge
18-
19-
__all__ = [
20-
"merge",
21-
]
19+
__all__ = ["concat", "cut", "qcut", "merge"]

bigframes/core/reshape/concat.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import typing
18+
from typing import Iterable, Literal, Union
19+
20+
import bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat
21+
22+
import bigframes.core.utils as utils
23+
import bigframes.dataframe
24+
import bigframes.series
25+
26+
27+
@typing.overload
def concat(
    objs: Iterable[bigframes.series.Series],
    *,
    axis: typing.Literal["index", 0] = ...,
    join=...,
    ignore_index=...,
) -> bigframes.series.Series:
    ...


@typing.overload
def concat(
    objs: Iterable[bigframes.dataframe.DataFrame],
    *,
    axis: typing.Literal["index", 0] = ...,
    join=...,
    ignore_index=...,
) -> bigframes.dataframe.DataFrame:
    ...


@typing.overload
def concat(
    objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]],
    *,
    axis: typing.Literal["columns", 1],
    join=...,
    ignore_index=...,
) -> bigframes.dataframe.DataFrame:
    ...


@typing.overload
def concat(
    objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]],
    *,
    axis=...,
    join=...,
    ignore_index=...,
) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]:
    ...


def concat(
    objs: Iterable[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]],
    *,
    axis: typing.Union[str, int] = 0,
    join: Literal["inner", "outer"] = "outer",
    ignore_index: bool = False,
) -> Union[bigframes.dataframe.DataFrame, bigframes.series.Series]:
    # No docstring here on purpose: `concat.__doc__` is assigned from the
    # vendored pandas docstring right after this definition.
    #
    # Axis 0: returns a Series when no input is a DataFrame, otherwise a
    # DataFrame. Any other axis: joins the blocks left to right and returns a
    # DataFrame; `ignore_index` is not consulted on that path.
    axis_n = utils.get_axis_number(axis)

    # `objs` is only promised to be an Iterable but is walked several times
    # below. Materialize it once so a generator is not exhausted by the first
    # pass (which previously surfaced as an IndexError on `blocks[0]`).
    items = list(objs)
    if not items:
        # Same error pandas raises for an empty input.
        raise ValueError("No objects to concatenate")

    if axis_n == 0:
        contains_dataframes = any(
            isinstance(x, bigframes.dataframe.DataFrame) for x in items
        )
        if not contains_dataframes:
            # Special case, all series, so align everything into single column
            # even if labels don't match.
            series = typing.cast(typing.List[bigframes.series.Series], items)
            names = {s.name for s in series}
            # For series case, labels are stripped if they don't all match.
            if len(names) > 1:
                blocks = [s._block.with_column_labels([None]) for s in series]
            else:
                blocks = [s._block for s in series]
            block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index)
            return bigframes.series.Series(block)
        blocks = [obj._block for obj in items]
        block = blocks[0].concat(blocks[1:], how=join, ignore_index=ignore_index)
        return bigframes.dataframe.DataFrame(block)
    else:
        # Note: does not validate inputs.
        block_list = [obj._block for obj in items]
        block = block_list[0]
        # Fold the remaining blocks in, left to right.
        for rblock in block_list[1:]:
            block, _ = block.join(rblock, how=join)
        return bigframes.dataframe.DataFrame(block)
104+
105+
106+
concat.__doc__ = vendored_pandas_concat.concat.__doc__

bigframes/core/joins/merge.py renamed to bigframes/core/reshape/merge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import typing
2222
from typing import Literal, Optional
2323

24+
import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge
25+
2426
# Avoid circular imports.
2527
if typing.TYPE_CHECKING:
2628
import bigframes.dataframe
@@ -58,6 +60,9 @@ def merge(
5860
)
5961

6062

63+
merge.__doc__ = vendored_pandas_merge.merge.__doc__
64+
65+
6166
def _validate_operand(
6267
obj: bigframes.dataframe.DataFrame | bigframes.series.Series,
6368
) -> bigframes.dataframe.DataFrame:

0 commit comments

Comments
 (0)