|
19 | 19 | from typing import Dict, Iterable, List, Optional, Sequence, Union
|
20 | 20 | import warnings
|
21 | 21 |
|
22 |
| -import numpy as np |
23 |
| - |
24 | 22 | from bigframes import dtypes, exceptions, options
|
25 | 23 | from bigframes.core import guid, log_adapter
|
26 | 24 |
|
@@ -586,207 +584,6 @@ def search(
|
586 | 584 |
|
587 | 585 | return typing.cast(bigframes.dataframe.DataFrame, search_result)
|
588 | 586 |
|
589 |
| - def top_k( |
590 |
| - self, |
591 |
| - instruction: str, |
592 |
| - model, |
593 |
| - k: int = 10, |
594 |
| - ground_with_google_search: bool = False, |
595 |
| - ): |
596 |
| - """ |
597 |
| - Ranks each tuple and returns the k best according to the instruction. |
598 |
| -
|
599 |
| - This method employs a quick select algorithm to efficiently compare the pivot |
600 |
| - with all other items. By leveraging an LLM (Large Language Model), it then |
601 |
| - identifies the top 'k' best answers from these comparisons. |
602 |
| -
|
603 |
| - **Examples:** |
604 |
| -
|
605 |
| - >>> import bigframes.pandas as bpd |
606 |
| - >>> bpd.options.display.progress_bar = None |
607 |
| - >>> bpd.options.experiments.ai_operators = True |
608 |
| - >>> bpd.options.compute.ai_ops_confirmation_threshold = 25 |
609 |
| -
|
610 |
| - >>> import bigframes.ml.llm as llm |
611 |
| - >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001") |
612 |
| -
|
613 |
| - >>> df = bpd.DataFrame( |
614 |
| - ... { |
615 |
| - ... "Animals": ["Dog", "Bird", "Cat", "Horse"], |
616 |
| - ... "Sounds": ["Woof", "Chirp", "Meow", "Neigh"], |
617 |
| - ... }) |
618 |
| - >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2) |
619 |
| - Animals Sounds |
620 |
| - 0 Dog Woof |
621 |
| - 2 Cat Meow |
622 |
| - <BLANKLINE> |
623 |
| - [2 rows x 2 columns] |
624 |
| -
|
625 |
| - Args: |
626 |
| - instruction (str): |
627 |
| - An instruction on how to map the data. This value must contain |
628 |
| - column references by name enclosed in braces. |
629 |
| - For example, to reference a column named "Animals", use "{Animals}" in the |
630 |
| - instruction, like: "{Animals} are more popular as pets" |
631 |
| -
|
632 |
| - model (bigframes.ml.llm.GeminiTextGenerator): |
633 |
| - A GeminiTextGenerator provided by the Bigframes ML package. |
634 |
| -
|
635 |
| - k (int, default 10): |
636 |
| - The number of rows to return. |
637 |
| -
|
638 |
| - ground_with_google_search (bool, default False): |
639 |
| - Enables Grounding with Google Search for the GeminiTextGenerator model. |
640 |
| - When set to True, the model incorporates relevant information from Google |
641 |
| - Search results into its responses, enhancing their accuracy and factualness. |
642 |
| - Note: Using this feature may impact billing costs. Refer to the pricing |
643 |
| - page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models |
644 |
| - The default is `False`. |
645 |
| -
|
646 |
| - Returns: |
647 |
| - bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. |
648 |
| -
|
649 |
| - Raises: |
650 |
| - NotImplementedError: when the AI operator experiment is off. |
651 |
| - ValueError: when the instruction refers to a non-existing column, or when no |
652 |
| - columns are referred to. |
653 |
| - """ |
654 |
| - if not options.experiments.ai_operators: |
655 |
| - raise NotImplementedError() |
656 |
| - |
657 |
| - import bigframes.dataframe |
658 |
| - import bigframes.series |
659 |
| - |
660 |
| - self._validate_model(model) |
661 |
| - columns = self._parse_columns(instruction) |
662 |
| - for column in columns: |
663 |
| - if column not in self._df.columns: |
664 |
| - raise ValueError(f"Column {column} not found.") |
665 |
| - if len(columns) > 1: |
666 |
| - raise NotImplementedError("AI top K are limited to a single column.") |
667 |
| - |
668 |
| - if ground_with_google_search: |
669 |
| - msg = exceptions.format_message( |
670 |
| - "Enables Grounding with Google Search may impact billing cost. See pricing " |
671 |
| - "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" |
672 |
| - ) |
673 |
| - warnings.warn(msg, category=UserWarning) |
674 |
| - |
675 |
| - work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) |
676 |
| - self._confirm_operation(work_estimate) |
677 |
| - |
678 |
| - df: bigframes.dataframe.DataFrame = self._df[columns].copy() |
679 |
| - column = columns[0] |
680 |
| - if df[column].dtype != dtypes.STRING_DTYPE: |
681 |
| - df[column] = df[column].astype(dtypes.STRING_DTYPE) |
682 |
| - |
683 |
| - # `index` is reserved for the `reset_index` below. |
684 |
| - if column == "index": |
685 |
| - raise ValueError( |
686 |
| - "Column name 'index' is reserved. Please choose a different name." |
687 |
| - ) |
688 |
| - |
689 |
| - if k < 1: |
690 |
| - raise ValueError("k must be an integer greater than or equal to 1.") |
691 |
| - |
692 |
| - user_instruction = self._format_instruction(instruction, columns) |
693 |
| - |
694 |
| - n = df.shape[0] |
695 |
| - if k >= n: |
696 |
| - return df |
697 |
| - |
698 |
| - # Create a unique index and duplicate it as the "index" column. This workaround |
699 |
| - # is needed for the select search algorithm due to unimplemented bigFrame methods. |
700 |
| - df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() |
701 |
| - |
702 |
| - # Initialize a status column to track the selection status of each item. |
703 |
| - # - None: Unknown/not yet processed |
704 |
| - # - 1.0: Selected as part of the top-k items |
705 |
| - # - -1.0: Excluded from the top-k items |
706 |
| - status_column = guid.generate_guid("status") |
707 |
| - df[status_column] = bigframes.series.Series( |
708 |
| - None, dtype=dtypes.FLOAT_DTYPE, session=df._session |
709 |
| - ) |
710 |
| - |
711 |
| - num_selected = 0 |
712 |
| - while num_selected < k: |
713 |
| - df, num_new_selected = self._topk_partition( |
714 |
| - df, |
715 |
| - column, |
716 |
| - status_column, |
717 |
| - user_instruction, |
718 |
| - model, |
719 |
| - k - num_selected, |
720 |
| - ground_with_google_search, |
721 |
| - ) |
722 |
| - num_selected += num_new_selected |
723 |
| - |
724 |
| - result_df: bigframes.dataframe.DataFrame = self._df.copy() |
725 |
| - return result_df[df.set_index("old_index")[status_column] > 0.0] |
726 |
| - |
727 |
    @staticmethod
    def _topk_partition(
        df,
        column: str,
        status_column: str,
        user_instruction: str,
        model,
        k: int,
        ground_with_google_search: bool,
    ):
        """One quickselect partition pass for ``top_k``.

        Picks a random pivot row among the not-yet-classified rows, asks the
        LLM to compare the pivot against every other pending row, and records
        the outcome in ``status_column`` (1.0 = selected, -1.0 = excluded,
        None = still pending).

        Args:
            df: Working DataFrame produced by ``top_k``; must carry an
                ``"index"`` column (the duplicated unique index) and
                ``status_column``.
            column (str): Name of the single instruction column being ranked.
            status_column (str): Name of the selection-status column.
            user_instruction (str): The formatted ranking question.
            model: A GeminiTextGenerator used for the pairwise comparisons.
            k (int): Number of rows still to be selected in this pass.
            ground_with_google_search (bool): Forwarded to ``model.predict``.

        Returns:
            A tuple ``(df, num_newly_selected)`` where ``df`` has updated
            statuses and ``num_newly_selected`` is how many rows were marked
            selected by this pass (0 when the pivot partition overshoots k).
        """
        output_instruction = (
            "Given a question and two documents, choose the document that best answers "
            "the question. Respond with 'Document 1' or 'Document 2'. You must choose "
            "one, even if neither is ideal. "
        )

        # Random pivot selection for improved average quickselect performance.
        pending_df = df[df[status_column].isna()]
        pivot_iloc = np.random.randint(0, pending_df.shape[0])
        pivot_index = pending_df.iloc[pivot_iloc]["index"]
        pivot_df = pending_df[pending_df["index"] == pivot_index]

        # Build a prompt to compare the pivot item's relevance to other pending items.
        # Series + string concatenation broadcasts the shared prefix over every
        # pending row, yielding one prompt per comparison.
        prompt_s = pending_df[pending_df["index"] != pivot_index][column]
        prompt_s = (
            f"{output_instruction}\n\nQuestion: {user_instruction}\n"
            + f"\nDocument 1: {column} "
            + pivot_df.iloc[0][column]
            + f"\nDocument 2: {column} "
            + prompt_s  # type:ignore
        )

        import bigframes.dataframe

        predict_df = typing.cast(
            bigframes.dataframe.DataFrame,
            model.predict(
                prompt_s,
                temperature=0.0,
                ground_with_google_search=ground_with_google_search,
            ),
        )

        # A reply containing "2" means "Document 2" (the non-pivot row) beat
        # the pivot, i.e. that row ranks higher than the pivot.
        marks = predict_df["ml_generate_text_llm_result"].str.contains("2")
        more_relavant: bigframes.dataframe.DataFrame = df[marks]
        less_relavent: bigframes.dataframe.DataFrame = df[~marks]

        num_more_relavant = more_relavant.shape[0]
        if k < num_more_relavant:
            # Too many rows beat the pivot: everything at or below the pivot is
            # definitively out, but none of the winners can be confirmed yet —
            # the next pass recurses into the winners only.
            less_relavent[status_column] = -1.0
            pivot_df[status_column] = -1.0
            # combine_first fills only the previously-NA statuses, preserving
            # decisions made by earlier passes.
            df = df.combine_first(less_relavent).combine_first(pivot_df)
            return df, 0
        else:  # k >= num_more_relavant
            # Every row that beat the pivot fits within the remaining budget.
            more_relavant[status_column] = 1.0
            df = df.combine_first(more_relavant)
            if k >= num_more_relavant + 1:
                # Budget also covers the pivot itself.
                pivot_df[status_column] = 1.0
                df = df.combine_first(pivot_df)
                return df, num_more_relavant + 1
            else:
                # Budget is exactly exhausted by the winners; pivot stays
                # pending for the next pass.
                return df, num_more_relavant
| - |
790 | 587 | def sim_join(
|
791 | 588 | self,
|
792 | 589 | other,
|
|
0 commit comments