Skip to content

Commit 684b2a6

Browse files
authored
refactor: move get_dummies() to encoding.py (#1219)
* refactor: move get_dummies() to encoding.py * remove redundant import
1 parent 8b6a99b commit 684b2a6

File tree

3 files changed

+198
-181
lines changed

3 files changed

+198
-181
lines changed

bigframes/core/reshape/api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
# limitations under the License.
1414

1515
from bigframes.core.reshape.concat import concat
16+
from bigframes.core.reshape.encoding import get_dummies
1617
from bigframes.core.reshape.merge import merge
1718
from bigframes.core.reshape.tile import cut, qcut
1819

19-
__all__ = ["concat", "cut", "qcut", "merge"]
20+
__all__ = ["concat", "get_dummies", "merge", "cut", "qcut"]

bigframes/core/reshape/encoding.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import typing
16+
from typing import Any, List, Optional, Tuple, Union
17+
18+
import bigframes_vendored.constants as constants
19+
import bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding
20+
import pandas
21+
22+
from bigframes import operations
23+
from bigframes.core import blocks, expression
24+
from bigframes.dataframe import DataFrame
25+
from bigframes.series import Series
26+
27+
28+
def get_dummies(
    data: Union[DataFrame, Series],
    prefix: Union[List, dict, str, None] = None,
    prefix_sep: Union[List, dict, str, None] = "_",
    dummy_na: bool = False,
    columns: Optional[List] = None,
    drop_first: bool = False,
    dtype: Any = None,
) -> DataFrame:
    # Simplify input parameters into per-input-label lists; also raises
    # errors for invalid parameters.
    column_labels, prefixes, prefix_seps = _standardize_get_dummies_params(
        data, prefix, prefix_sep, columns, dtype
    )

    block = data._block

    # Nothing to encode (e.g. a DataFrame with no string columns, or an
    # explicit columns=[]): return the data unchanged. This also avoids a
    # ZeroDivisionError in the column-budget computation below.
    if not column_labels:
        return DataFrame(block)

    # Combine prefixes into a per-column-id list.
    full_columns_prefixes, columns_ids = _determine_get_dummies_columns_from_labels(
        data, column_labels, prefix is not None, prefixes, prefix_seps
    )

    # Run queries to compute unique values, budgeting the number of distinct
    # values per encoded column so the result stays under the BigQuery
    # column-count limit.
    max_unique_value = (
        blocks._BQ_MAX_COLUMNS - len(block.value_columns) - len(block.index_columns) - 1
    ) // len(column_labels)
    columns_values = [
        block._get_unique_values([col_id], max_unique_value) for col_id in columns_ids
    ]

    # For each dummified column, add the content of the output columns via
    # block operations.
    intermediate_col_ids = []
    for i in range(len(columns_values)):
        level = columns_values[i].get_level_values(0).sort_values().dropna()
        if drop_first:
            # Mirror pandas: drop the first category's indicator column.
            level = level[1:]
        column_label = full_columns_prefixes[i]
        column_id = columns_ids[i]
        block, new_intermediate_col_ids = _perform_get_dummies_block_operations(
            block, level, column_label, column_id, dummy_na
        )
        intermediate_col_ids.extend(new_intermediate_col_ids)

    # Drop dummified columns (and the intermediate columns we added).
    block = block.drop_columns(columns_ids + intermediate_col_ids)
    return DataFrame(block)


get_dummies.__doc__ = vendored_pandas_encoding.get_dummies.__doc__
76+
77+
78+
def _standardize_get_dummies_params(
    data: Union[DataFrame, Series],
    prefix: Union[List, dict, str, None],
    prefix_sep: Union[List, dict, str, None],
    columns: Optional[List],
    dtype: Any,
) -> Tuple[List, List[str], List[str]]:
    """Validate get_dummies arguments and expand them into per-label lists.

    Returns:
        (column_labels, prefixes, prefix_seps) — parallel lists with one
        entry per column label to be encoded.

    Raises:
        TypeError: if ``columns`` is not list-like, or a prefix kwarg has an
            unsupported type.
        NotImplementedError: for any non-boolean ``dtype``.
        ValueError: if a list-like prefix kwarg's length does not match the
            number of encoded columns.
    """
    block = data._block

    # A Series is always encoded as its single column, regardless of `columns`.
    if isinstance(data, Series):
        columns = [block.column_labels[0]]
    if columns is not None and not pandas.api.types.is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    # Only boolean output dtypes are supported so far.
    if dtype is not None and dtype not in [
        pandas.BooleanDtype,
        bool,
        "Boolean",
        "boolean",
        "bool",
    ]:
        raise NotImplementedError(
            f"Only Boolean dtype is currently supported. {constants.FEEDBACK_LINK}"
        )

    if columns is None:
        # Default to encoding every string-typed column, preserving order and
        # de-duplicating repeated labels.
        default_dummy_types = [pandas.StringDtype, "string[pyarrow]"]
        columns = []
        columns_set = set()
        for col_id in block.value_columns:
            label = block.col_id_to_label[col_id]
            if (
                label not in columns_set
                and block.expr.get_column_type(col_id) in default_dummy_types
            ):
                columns.append(label)
                columns_set.add(label)

    column_labels: List = typing.cast(List, columns)

    def parse_prefix_kwarg(kwarg, kwarg_name) -> Optional[List[str]]:
        # Expand a str/dict/list-like prefix kwarg to one entry per label.
        if kwarg is None:
            return None
        if isinstance(kwarg, str):
            return [kwarg] * len(column_labels)
        if isinstance(kwarg, dict):
            return [kwarg[column] for column in column_labels]
        kwarg = typing.cast(List, kwarg)
        if pandas.api.types.is_list_like(kwarg) and len(kwarg) != len(column_labels):
            raise ValueError(
                f"Length of '{kwarg_name}' ({len(kwarg)}) did not match "
                f"the length of the columns being encoded ({len(column_labels)})."
            )
        if pandas.api.types.is_list_like(kwarg):
            return list(map(str, kwarg))
        raise TypeError(f"{kwarg_name} kwarg must be a string, list, or dictionary")

    # Use an explicit None check (not `prefix_sep or "_"`) so that an
    # empty-string separator is honored rather than silently replaced by "_".
    prefix_seps = parse_prefix_kwarg(
        prefix_sep if prefix_sep is not None else "_", "prefix_sep"
    )
    prefix_seps = typing.cast(List, prefix_seps)
    prefixes = parse_prefix_kwarg(prefix, "prefix")
    if prefixes is None:
        # No prefix given: fall back to the column labels themselves.
        prefixes = column_labels
    prefixes = typing.cast(List, prefixes)

    return column_labels, prefixes, prefix_seps
142+
143+
144+
def _determine_get_dummies_columns_from_labels(
    data: Union[DataFrame, Series],
    column_labels: List,
    prefix_given: bool,
    prefixes: List[str],
    prefix_seps: List[str],
) -> Tuple[List[str], List[str]]:
    """Resolve the requested labels to concrete column ids and output prefixes.

    Returns:
        (columns_prefixes, columns_ids) — one entry per underlying column
        (a single label may match several columns), where each prefix is
        either "" or "<prefix><sep>" to prepend to the dummy value in the
        output column label.
    """
    block = data._block

    out_ids: List[str] = []
    out_prefixes: List[str] = []
    for i, label in enumerate(column_labels):
        # An unnamed column — or a Series encoded without an explicit
        # prefix — produces bare value labels with no prefix at all.
        bare = label is None or (isinstance(data, Series) and not prefix_given)
        combined_prefix = "" if bare else prefixes[i] + prefix_seps[i]

        for col_id in block.label_to_col_id[label]:
            out_ids.append(col_id)
            out_prefixes.append(combined_prefix)

    return out_prefixes, out_ids
165+
166+
167+
def _perform_get_dummies_block_operations(
    block: blocks.Block,
    level: pandas.Index,
    column_label: str,
    column_id: str,
    dummy_na: bool,
) -> Tuple[blocks.Block, List[str]]:
    """Append one boolean indicator column per value in ``level`` (plus an
    is-null indicator when ``dummy_na``), labeled "<column_label><value>".

    Returns:
        The updated block and the ids of the intermediate (pre-fillna)
        equality columns, which the caller is expected to drop.
    """
    intermediate_col_ids = []
    for value in level:
        # With an empty prefix, the output label is the bare value itself.
        out_label = value if column_label == "" else f"{column_label}{value}"

        # First compute `col == value` (NULL where the source is NULL)...
        eq_block, eq_id = block.project_expr(
            operations.eq_op.as_expr(column_id, expression.const(value))
        )
        intermediate_col_ids.append(eq_id)

        # ...then coalesce NULLs to False for the final indicator column.
        block, _ = eq_block.project_expr(
            operations.fillna_op.as_expr(eq_id, expression.const(False)),
            label=out_label,
        )

    if dummy_na:
        # Dummy column name for NA depends on how the dtype renders nulls.
        na_string = str(pandas.Index([None], dtype=level.dtype)[0])
        block, _ = block.apply_unary_op(
            column_id,
            operations.isnull_op,
            result_label=f"{column_label}{na_string}",
        )

    return block, intermediate_col_ids

0 commit comments

Comments
 (0)