docs: add ml.model_selection examples (#1238)

GarrettWu · web-flow · commit 50648e4d5d7c · 2024-12-23T13:04:03.000-08:00
* docs: add ml.model_selection examples

* fix
diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
@@ -115,6 +115,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
 
 @log_adapter.class_logger
 class KFold(vendored_model_selection_split.KFold):
+    __doc__ = inspect.getdoc(vendored_model_selection_split.KFold)
+
     def __init__(self, n_splits: int = 5, *, random_state: Union[int, None] = None):
         if n_splits < 2:
             raise ValueError(f"n_splits must be at least 2. Got {n_splits}")
diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py
@@ -65,6 +65,80 @@ class KFold(_BaseKFold):
     Each fold is then used once as a validation while the k - 1 remaining
     folds form the training set.
 
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.model_selection import KFold
+        >>> bpd.options.display.progress_bar = None
+        >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
+        >>> y = bpd.DataFrame({"label": [1, 2, 3]})
+        >>> kf = KFold(n_splits=3, random_state=42)
+        >>> for i, (X_train, X_test, y_train, y_test) in enumerate(kf.split(X, y)):
+        ...     print(f"Fold {i}:")
+        ...     print(f"  X_train: {X_train}")
+        ...     print(f"  X_test: {X_test}")
+        ...     print(f"  y_train: {y_train}")
+        ...     print(f"  y_test: {y_test}")
+        ...
+        Fold 0:
+          X_train:    feat0  feat1
+        1      3      4
+        2      5      6
+        <BLANKLINE>
+        [2 rows x 2 columns]
+          X_test:    feat0  feat1
+        0      1      2
+        <BLANKLINE>
+        [1 rows x 2 columns]
+          y_train:    label
+        1      2
+        2      3
+        <BLANKLINE>
+        [2 rows x 1 columns]
+          y_test:    label
+        0      1
+        <BLANKLINE>
+        [1 rows x 1 columns]
+        Fold 1:
+          X_train:    feat0  feat1
+        0      1      2
+        2      5      6
+        <BLANKLINE>
+        [2 rows x 2 columns]
+          X_test:    feat0  feat1
+        1      3      4
+        <BLANKLINE>
+        [1 rows x 2 columns]
+          y_train:    label
+        0      1
+        2      3
+        <BLANKLINE>
+        [2 rows x 1 columns]
+          y_test:    label
+        1      2
+        <BLANKLINE>
+        [1 rows x 1 columns]
+        Fold 2:
+          X_train:    feat0  feat1
+        0      1      2
+        1      3      4
+        <BLANKLINE>
+        [2 rows x 2 columns]
+          X_test:    feat0  feat1
+        2      5      6
+        <BLANKLINE>
+        [1 rows x 2 columns]
+          y_train:    label
+        0      1
+        1      2
+        <BLANKLINE>
+        [2 rows x 1 columns]
+          y_test:    label
+        2      3
+        <BLANKLINE>
+        [1 rows x 1 columns]
+
+
     Args:
         n_splits (int):
             Number of folds. Must be at least 2. Defaults to 5.
@@ -84,6 +158,41 @@ def train_test_split(
 ):
     """Splits dataframes or series into random train and test subsets.
 
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.model_selection import train_test_split
+        >>> bpd.options.display.progress_bar = None
+        >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]})
+        >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]})
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
+        >>> X_train
+           feat0  feat1
+        0      0      1
+        1      2      3
+        4      8      9
+        <BLANKLINE>
+        [3 rows x 2 columns]
+        >>> y_train
+           label
+        0      0
+        1      1
+        4      4
+        <BLANKLINE>
+        [3 rows x 1 columns]
+        >>> X_test
+           feat0  feat1
+        2      4      5
+        3      6      7
+        <BLANKLINE>
+        [2 rows x 2 columns]
+        >>> y_test
+           label
+        2      2
+        3      3
+        <BLANKLINE>
+        [2 rows x 1 columns]
+
     Args:
         *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series):
             A sequence of BigQuery DataFrames or Series that can be joined on
diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
@@ -14,6 +14,23 @@
 def cross_validate(estimator, X, y=None, *, cv=None):
     """Evaluate metric(s) by cross-validation and also record fit/score times.
 
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.model_selection import cross_validate, KFold
+        >>> from bigframes.ml.linear_model import LinearRegression
+        >>> bpd.options.display.progress_bar = None
+        >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
+        >>> y = bpd.DataFrame({"label": [1, 2, 3]})
+        >>> model = LinearRegression()
+        >>> scores = cross_validate(model, X, y, cv=3) # doctest: +SKIP
+        >>> for score in scores["test_score"]: # doctest: +SKIP
+        ...     print(score["mean_squared_error"][0])
+        ...
+        5.218167286047954e-19
+        2.726229944928669e-18
+        1.6197635612324266e-17
+
     Args:
         estimator:
             bigframes.ml model that implements fit().