Skip to content

Commit faa2738

Browse files
refactor: Move dataframe conversion function from ml package to core package (#1180)
* refactor: use core.convert for series conversions under the ml packages * fetch global session lazily * fix type error * remove unnecessary get_session call * code cleanup * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * code cleanup * add session check * check session before conversion * check individual types during conversion * add missing return statement * remove individul session check to relax the arg requirement --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent a970294 commit faa2738

File tree

20 files changed

+200
-116
lines changed

20 files changed

+200
-116
lines changed

bigframes/bigquery/_operations/search.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def vector_search(
223223
options = {} if options is None else options
224224
options["use_brute_force"] = use_brute_force
225225

226-
(query,) = utils.convert_to_dataframe(query)
226+
(query,) = utils.batch_convert_to_dataframe(query)
227227
sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)
228228

229229
sql = bigframes.core.sql.create_vector_search_sql(

bigframes/core/convert.py

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,29 @@
1717

1818
import pandas as pd
1919

20-
from bigframes.core import global_session
21-
import bigframes.core.indexes as index
22-
import bigframes.series as series
20+
from bigframes import dataframe, series, session
21+
from bigframes.core import global_session, indexes
2322

2423

25-
def is_series_convertible(obj) -> bool:
24+
def can_convert_to_series(obj) -> bool:
2625
if isinstance(obj, series.Series):
2726
return True
2827
if isinstance(obj, pd.Series):
2928
return True
30-
if isinstance(obj, index.Index):
29+
if isinstance(obj, indexes.Index):
3130
return True
3231
if isinstance(obj, pd.Index):
3332
return True
3433
if pd.api.types.is_list_like(obj):
3534
return True
36-
else:
37-
return False
35+
36+
return False
3837

3938

4039
def to_bf_series(
41-
obj, default_index: Optional[index.Index], session=None
40+
obj,
41+
default_index: Optional[indexes.Index],
42+
session: Optional[session.Session] = None,
4243
) -> series.Series:
4344
"""
4445
Convert a an object to a bigframes series
@@ -60,16 +61,16 @@ def to_bf_series(
6061

6162
if isinstance(obj, pd.Series):
6263
return series.Series(obj, session=session)
63-
if isinstance(obj, index.Index):
64+
if isinstance(obj, indexes.Index):
6465
return series.Series(obj, default_index, session=session)
6566
if isinstance(obj, pd.Index):
6667
return series.Series(obj, default_index, session=session)
6768
if pd.api.types.is_dict_like(obj):
6869
return series.Series(obj, session=session)
6970
if pd.api.types.is_list_like(obj):
7071
return series.Series(obj, default_index, session=session)
71-
else:
72-
raise TypeError(f"Cannot interpret {obj} as series.")
72+
73+
raise TypeError(f"Cannot interpret {obj} as series.")
7374

7475

7576
def to_pd_series(obj, default_index: pd.Index) -> pd.Series:
@@ -89,13 +90,42 @@ def to_pd_series(obj, default_index: pd.Index) -> pd.Series:
8990
return obj.to_pandas()
9091
if isinstance(obj, pd.Series):
9192
return obj
92-
if isinstance(obj, index.Index):
93+
if isinstance(obj, indexes.Index):
9394
return pd.Series(obj.to_pandas(), default_index)
9495
if isinstance(obj, pd.Index):
9596
return pd.Series(obj, default_index)
9697
if pd.api.types.is_dict_like(obj):
9798
return pd.Series(obj)
9899
if pd.api.types.is_list_like(obj):
99100
return pd.Series(obj, default_index)
100-
else:
101-
raise TypeError(f"Cannot interpret {obj} as series.")
101+
102+
raise TypeError(f"Cannot interpret {obj} as series.")
103+
104+
105+
def can_convert_to_dataframe(obj) -> bool:
106+
if can_convert_to_series(obj):
107+
return True
108+
109+
if isinstance(obj, dataframe.DataFrame) or isinstance(obj, pd.DataFrame):
110+
return True
111+
112+
return False
113+
114+
115+
def to_bf_dataframe(
116+
obj,
117+
default_index: Optional[indexes.Index],
118+
session: Optional[session.Session] = None,
119+
) -> dataframe.DataFrame:
120+
if isinstance(obj, dataframe.DataFrame):
121+
return obj
122+
123+
if isinstance(obj, pd.DataFrame):
124+
if session is None:
125+
session = global_session.get_global_session()
126+
return dataframe.DataFrame(obj, session=session)
127+
128+
if can_convert_to_series(obj):
129+
return to_bf_series(obj, default_index, session).to_frame()
130+
131+
raise TypeError(f"Cannot interpret {obj} as a dataframe.")

bigframes/ml/cluster.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def _fit(
106106
y=None, # ignored
107107
transforms: Optional[List[str]] = None,
108108
) -> KMeans:
109-
(X,) = utils.convert_to_dataframe(X)
109+
(X,) = utils.batch_convert_to_dataframe(X)
110110

111111
self._bqml_model = self._bqml_model_factory.create_model(
112112
X_train=X,
@@ -131,7 +131,7 @@ def predict(
131131
if not self._bqml_model:
132132
raise RuntimeError("A model must be fitted before predict")
133133

134-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
134+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
135135

136136
return self._bqml_model.predict(X)
137137

@@ -160,7 +160,7 @@ def detect_anomalies(
160160
if not self._bqml_model:
161161
raise RuntimeError("A model must be fitted before detect_anomalies")
162162

163-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
163+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
164164

165165
return self._bqml_model.detect_anomalies(
166166
X, options={"contamination": contamination}
@@ -191,6 +191,6 @@ def score(
191191
if not self._bqml_model:
192192
raise RuntimeError("A model must be fitted before score")
193193

194-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
194+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
195195

196196
return self._bqml_model.evaluate(X)

bigframes/ml/compose.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ def fit(
336336
X: utils.ArrayType,
337337
y=None, # ignored
338338
) -> ColumnTransformer:
339-
(X,) = utils.convert_to_dataframe(X)
339+
(X,) = utils.batch_convert_to_dataframe(X)
340340

341341
transform_sqls = self._compile_to_sql(X)
342342
self._bqml_model = self._bqml_model_factory.create_model(
@@ -352,7 +352,7 @@ def transform(self, X: utils.ArrayType) -> bpd.DataFrame:
352352
if not self._bqml_model:
353353
raise RuntimeError("Must be fitted before transform")
354354

355-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
355+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
356356

357357
df = self._bqml_model.transform(X)
358358
return typing.cast(

bigframes/ml/decomposition.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _fit(
8888
y=None,
8989
transforms: Optional[List[str]] = None,
9090
) -> PCA:
91-
(X,) = utils.convert_to_dataframe(X)
91+
(X,) = utils.batch_convert_to_dataframe(X)
9292

9393
# To mimic sklearn's behavior
9494
if self.n_components is None:
@@ -133,7 +133,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
133133
if not self._bqml_model:
134134
raise RuntimeError("A model must be fitted before predict")
135135

136-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
136+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
137137

138138
return self._bqml_model.predict(X)
139139

@@ -162,7 +162,7 @@ def detect_anomalies(
162162
if not self._bqml_model:
163163
raise RuntimeError("A model must be fitted before detect_anomalies")
164164

165-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
165+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
166166

167167
return self._bqml_model.detect_anomalies(
168168
X, options={"contamination": contamination}

bigframes/ml/ensemble.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,12 @@ def _fit(
148148
X_eval: Optional[utils.ArrayType] = None,
149149
y_eval: Optional[utils.ArrayType] = None,
150150
) -> XGBRegressor:
151-
X, y = utils.convert_to_dataframe(X, y)
151+
X, y = utils.batch_convert_to_dataframe(X, y)
152152

153153
bqml_options = self._bqml_options
154154

155155
if X_eval is not None and y_eval is not None:
156-
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
156+
X_eval, y_eval = utils.batch_convert_to_dataframe(X_eval, y_eval)
157157
X, y, bqml_options = utils.combine_training_and_evaluation_data(
158158
X, y, X_eval, y_eval, bqml_options
159159
)
@@ -172,7 +172,7 @@ def predict(
172172
) -> bpd.DataFrame:
173173
if not self._bqml_model:
174174
raise RuntimeError("A model must be fitted before predict")
175-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
175+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
176176

177177
return self._bqml_model.predict(X)
178178

@@ -184,7 +184,7 @@ def score(
184184
if not self._bqml_model:
185185
raise RuntimeError("A model must be fitted before score")
186186

187-
X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session)
187+
X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
188188

189189
input_data = (
190190
X.join(y, how="outer") if (X is not None) and (y is not None) else None
@@ -307,12 +307,12 @@ def _fit(
307307
X_eval: Optional[utils.ArrayType] = None,
308308
y_eval: Optional[utils.ArrayType] = None,
309309
) -> XGBClassifier:
310-
X, y = utils.convert_to_dataframe(X, y)
310+
X, y = utils.batch_convert_to_dataframe(X, y)
311311

312312
bqml_options = self._bqml_options
313313

314314
if X_eval is not None and y_eval is not None:
315-
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
315+
X_eval, y_eval = utils.batch_convert_to_dataframe(X_eval, y_eval)
316316
X, y, bqml_options = utils.combine_training_and_evaluation_data(
317317
X, y, X_eval, y_eval, bqml_options
318318
)
@@ -328,7 +328,7 @@ def _fit(
328328
def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
329329
if not self._bqml_model:
330330
raise RuntimeError("A model must be fitted before predict")
331-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
331+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
332332

333333
return self._bqml_model.predict(X)
334334

@@ -340,7 +340,7 @@ def score(
340340
if not self._bqml_model:
341341
raise RuntimeError("A model must be fitted before score")
342342

343-
X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session)
343+
X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
344344

345345
input_data = (
346346
X.join(y, how="outer") if (X is not None) and (y is not None) else None
@@ -453,12 +453,12 @@ def _fit(
453453
X_eval: Optional[utils.ArrayType] = None,
454454
y_eval: Optional[utils.ArrayType] = None,
455455
) -> RandomForestRegressor:
456-
X, y = utils.convert_to_dataframe(X, y)
456+
X, y = utils.batch_convert_to_dataframe(X, y)
457457

458458
bqml_options = self._bqml_options
459459

460460
if X_eval is not None and y_eval is not None:
461-
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
461+
X_eval, y_eval = utils.batch_convert_to_dataframe(X_eval, y_eval)
462462
X, y, bqml_options = utils.combine_training_and_evaluation_data(
463463
X, y, X_eval, y_eval, bqml_options
464464
)
@@ -477,7 +477,7 @@ def predict(
477477
) -> bpd.DataFrame:
478478
if not self._bqml_model:
479479
raise RuntimeError("A model must be fitted before predict")
480-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
480+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
481481

482482
return self._bqml_model.predict(X)
483483

@@ -506,7 +506,7 @@ def score(
506506
if not self._bqml_model:
507507
raise RuntimeError("A model must be fitted before score")
508508

509-
X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session)
509+
X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
510510

511511
input_data = (
512512
X.join(y, how="outer") if (X is not None) and (y is not None) else None
@@ -619,12 +619,12 @@ def _fit(
619619
X_eval: Optional[utils.ArrayType] = None,
620620
y_eval: Optional[utils.ArrayType] = None,
621621
) -> RandomForestClassifier:
622-
X, y = utils.convert_to_dataframe(X, y)
622+
X, y = utils.batch_convert_to_dataframe(X, y)
623623

624624
bqml_options = self._bqml_options
625625

626626
if X_eval is not None and y_eval is not None:
627-
X_eval, y_eval = utils.convert_to_dataframe(X_eval, y_eval)
627+
X_eval, y_eval = utils.batch_convert_to_dataframe(X_eval, y_eval)
628628
X, y, bqml_options = utils.combine_training_and_evaluation_data(
629629
X, y, X_eval, y_eval, bqml_options
630630
)
@@ -643,7 +643,7 @@ def predict(
643643
) -> bpd.DataFrame:
644644
if not self._bqml_model:
645645
raise RuntimeError("A model must be fitted before predict")
646-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
646+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
647647

648648
return self._bqml_model.predict(X)
649649

@@ -672,7 +672,7 @@ def score(
672672
if not self._bqml_model:
673673
raise RuntimeError("A model must be fitted before score")
674674

675-
X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session)
675+
X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
676676

677677
input_data = (
678678
X.join(y, how="outer") if (X is not None) and (y is not None) else None

bigframes/ml/forecasting.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def _fit(
206206
if y.columns.size != 1:
207207
raise ValueError("Time series data input y must only contain 1 column.")
208208

209-
X, y = utils.convert_to_dataframe(X, y)
209+
X, y = utils.batch_convert_to_dataframe(X, y)
210210

211211
self._bqml_model = self._bqml_model_factory.create_time_series_model(
212212
X,
@@ -298,7 +298,7 @@ def detect_anomalies(
298298
if not self._bqml_model:
299299
raise RuntimeError("A model must be fitted before detect_anomalies")
300300

301-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
301+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
302302

303303
return self._bqml_model.detect_anomalies(
304304
X, options={"anomaly_prob_threshold": anomaly_prob_threshold}
@@ -331,7 +331,7 @@ def score(
331331
"""
332332
if not self._bqml_model:
333333
raise RuntimeError("A model must be fitted before score")
334-
X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session)
334+
X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
335335

336336
input_data = X.join(y, how="outer")
337337
return self._bqml_model.evaluate(input_data)

bigframes/ml/imported.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
8080
self._bqml_model = self._create_bqml_model()
8181
self._bqml_model = cast(core.BqmlModel, self._bqml_model)
8282

83-
(X,) = utils.convert_to_dataframe(X)
83+
(X,) = utils.batch_convert_to_dataframe(X)
8484

8585
return self._bqml_model.predict(X)
8686

@@ -159,7 +159,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
159159
self._bqml_model = self._create_bqml_model()
160160
self._bqml_model = cast(core.BqmlModel, self._bqml_model)
161161

162-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
162+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
163163

164164
return self._bqml_model.predict(X)
165165

@@ -275,7 +275,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
275275
self._bqml_model = self._create_bqml_model()
276276
self._bqml_model = cast(core.BqmlModel, self._bqml_model)
277277

278-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
278+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
279279

280280
return self._bqml_model.predict(X)
281281

bigframes/ml/impute.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def fit(
8787
X: utils.ArrayType,
8888
y=None, # ignored
8989
) -> SimpleImputer:
90-
(X,) = utils.convert_to_dataframe(X)
90+
(X,) = utils.batch_convert_to_dataframe(X)
9191

9292
transform_sqls = self._compile_to_sql(X)
9393
self._bqml_model = self._bqml_model_factory.create_model(
@@ -103,7 +103,7 @@ def transform(self, X: utils.ArrayType) -> bpd.DataFrame:
103103
if not self._bqml_model:
104104
raise RuntimeError("Must be fitted before transform")
105105

106-
(X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session)
106+
(X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
107107

108108
df = self._bqml_model.transform(X)
109109
return typing.cast(

0 commit comments

Comments
 (0)