Skip to content

Commit 50648e4

Browse files
authored
docs: add ml.model_selection examples (#1238)
* docs: add ml.model_selection examples * fix
1 parent f7d52d9 commit 50648e4

File tree

3 files changed

+128
-0
lines changed

3 files changed

+128
-0
lines changed

bigframes/ml/model_selection.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
115115

116116
@log_adapter.class_logger
117117
class KFold(vendored_model_selection_split.KFold):
118+
__doc__ = inspect.getdoc(vendored_model_selection_split.KFold)
119+
118120
def __init__(self, n_splits: int = 5, *, random_state: Union[int, None] = None):
119121
if n_splits < 2:
120122
raise ValueError(f"n_splits must be at least 2. Got {n_splits}")

third_party/bigframes_vendored/sklearn/model_selection/_split.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,80 @@ class KFold(_BaseKFold):
6565
Each fold is then used once as a validation while the k - 1 remaining
6666
folds form the training set.
6767
68+
**Examples:**
69+
70+
>>> import bigframes.pandas as bpd
71+
>>> from bigframes.ml.model_selection import KFold
72+
>>> bpd.options.display.progress_bar = None
73+
>>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
74+
>>> y = bpd.DataFrame({"label": [1, 2, 3]})
75+
>>> kf = KFold(n_splits=3, random_state=42)
76+
>>> for i, (X_train, X_test, y_train, y_test) in enumerate(kf.split(X, y)):
77+
... print(f"Fold {i}:")
78+
... print(f" X_train: {X_train}")
79+
... print(f" X_test: {X_test}")
80+
... print(f" y_train: {y_train}")
81+
... print(f" y_test: {y_test}")
82+
...
83+
Fold 0:
84+
X_train: feat0 feat1
85+
1 3 4
86+
2 5 6
87+
<BLANKLINE>
88+
[2 rows x 2 columns]
89+
X_test: feat0 feat1
90+
0 1 2
91+
<BLANKLINE>
92+
[1 rows x 2 columns]
93+
y_train: label
94+
1 2
95+
2 3
96+
<BLANKLINE>
97+
[2 rows x 1 columns]
98+
y_test: label
99+
0 1
100+
<BLANKLINE>
101+
[1 rows x 1 columns]
102+
Fold 1:
103+
X_train: feat0 feat1
104+
0 1 2
105+
2 5 6
106+
<BLANKLINE>
107+
[2 rows x 2 columns]
108+
X_test: feat0 feat1
109+
1 3 4
110+
<BLANKLINE>
111+
[1 rows x 2 columns]
112+
y_train: label
113+
0 1
114+
2 3
115+
<BLANKLINE>
116+
[2 rows x 1 columns]
117+
y_test: label
118+
1 2
119+
<BLANKLINE>
120+
[1 rows x 1 columns]
121+
Fold 2:
122+
X_train: feat0 feat1
123+
0 1 2
124+
1 3 4
125+
<BLANKLINE>
126+
[2 rows x 2 columns]
127+
X_test: feat0 feat1
128+
2 5 6
129+
<BLANKLINE>
130+
[1 rows x 2 columns]
131+
y_train: label
132+
0 1
133+
1 2
134+
<BLANKLINE>
135+
[2 rows x 1 columns]
136+
y_test: label
137+
2 3
138+
<BLANKLINE>
139+
[1 rows x 1 columns]
140+
141+
68142
Args:
69143
n_splits (int):
70144
Number of folds. Must be at least 2. Defaults to 5.
@@ -84,6 +158,41 @@ def train_test_split(
84158
):
85159
"""Splits dataframes or series into random train and test subsets.
86160
161+
**Examples:**
162+
163+
>>> import bigframes.pandas as bpd
164+
>>> from bigframes.ml.model_selection import train_test_split
165+
>>> bpd.options.display.progress_bar = None
166+
>>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]})
167+
>>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]})
168+
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
169+
>>> X_train
170+
feat0 feat1
171+
0 0 1
172+
1 2 3
173+
4 8 9
174+
<BLANKLINE>
175+
[3 rows x 2 columns]
176+
>>> y_train
177+
label
178+
0 0
179+
1 1
180+
4 4
181+
<BLANKLINE>
182+
[3 rows x 1 columns]
183+
>>> X_test
184+
feat0 feat1
185+
2 4 5
186+
3 6 7
187+
<BLANKLINE>
188+
[2 rows x 2 columns]
189+
>>> y_test
190+
label
191+
2 2
192+
3 3
193+
<BLANKLINE>
194+
[2 rows x 1 columns]
195+
87196
Args:
88197
*arrays (bigframes.dataframe.DataFrame or bigframes.series.Series):
89198
A sequence of BigQuery DataFrames or Series that can be joined on

third_party/bigframes_vendored/sklearn/model_selection/_validation.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,23 @@
1414
def cross_validate(estimator, X, y=None, *, cv=None):
1515
"""Evaluate metric(s) by cross-validation and also record fit/score times.
1616
17+
**Examples:**
18+
19+
>>> import bigframes.pandas as bpd
20+
>>> from bigframes.ml.model_selection import cross_validate
21+
>>> from bigframes.ml.linear_model import LinearRegression
22+
>>> bpd.options.display.progress_bar = None
23+
>>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
24+
>>> y = bpd.DataFrame({"label": [1, 2, 3]})
25+
>>> model = LinearRegression()
26+
>>> scores = cross_validate(model, X, y, cv=3) # doctest: +SKIP
27+
>>> for score in scores["test_score"]: # doctest: +SKIP
28+
... print(score["mean_squared_error"][0])
29+
...
30+
5.218167286047954e-19
31+
2.726229944928669e-18
32+
1.6197635612324266e-17
33+
1734
Args:
1835
estimator:
1936
bigframes.ml model that implements fit().

0 commit comments

Comments
 (0)