
Commit 1e8a2f1

chore!: Remove attach_logprobs parameter from AI operations (#1816)
* Refactor: Remove attach_logprobs parameter from AI operations

This commit removes the `attach_logprobs` parameter from the `filter`, `map`, `classify`, and `join` methods within the `AIAccessor` class in `bigframes/operations/ai.py`. The associated logic for calculating and attaching the 'logprob' column has also been removed from the `map` method.

System tests in `tests/system/large/operations/test_ai.py` that specifically tested the `attach_logprobs` functionality have been updated by:

- Removing the `attach_logprobs=True` argument from method calls.
- Removing assertions for the 'logprob' column.
- Renaming the test methods to reflect their updated scope (e.g., `test_filter_attach_logprob` to `test_filter_functionality_formerly_attach_logprob`).

The small system tests and experimental notebooks were not affected, as they did not use this parameter.

* polish tests

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent dc9eb27 commit 1e8a2f1
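Because the `!` in the commit title marks this as a breaking change, a minimal before/after sketch of the call-site impact may help. Everything here is illustrative rather than part of the commit: `gemini_flash_model` stands in for a configured `bigframes.ml.llm.GeminiTextGenerator` (the model name is an assumption), the example data mirrors the deleted filter test, and the experimental AI-operator options are assumed to be enabled as in the system tests.

    import bigframes.pandas as bpd
    from bigframes.ml import llm

    # Placeholder model; any Gemini text model accepted by the AI operators works here.
    gemini_flash_model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")

    df = bpd.DataFrame({"number_1": [1, 2], "number_2": [2, 1]})

    # Before this commit, an extra "logprob" column could be requested:
    #     df.ai.filter("{number_1} is greater than {number_2}", gemini_flash_model,
    #                  attach_logprobs=True)
    #
    # After this commit the keyword is gone, so passing it raises a TypeError;
    # the call simply returns the filtered rows with no "logprob" column attached.
    filtered = df.ai.filter("{number_1} is greater than {number_2}", gemini_flash_model)

The same removal applies to `map`, `classify`, and `join`.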

File tree: 2 files changed (+0, -116 lines)

bigframes/operations/ai.py

Lines changed: 0 additions & 38 deletions
@@ -41,7 +41,6 @@ def filter(
         instruction: str,
         model,
         ground_with_google_search: bool = False,
-        attach_logprobs: bool = False,
     ):
         """
         Filters the DataFrame with the semantics of the user instruction.
@@ -82,10 +81,6 @@ def filter(
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
-            attach_logprobs (bool, default False):
-                Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level
-                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0.
-
         Returns:
             bigframes.pandas.DataFrame: DataFrame filtered by the instruction.
 
@@ -103,7 +98,6 @@ def filter(
             model,
             output_schema,
             ground_with_google_search,
-            attach_logprobs,
         )
 
         return result[result[answer_col]].drop(answer_col, axis=1)
@@ -114,7 +108,6 @@ def map(
         model,
         output_schema: Dict[str, str] | None = None,
         ground_with_google_search: bool = False,
-        attach_logprobs=False,
     ):
         """
         Maps the DataFrame with the semantics of the user instruction. The name of the keys in the output_schema parameter carry
@@ -180,11 +173,6 @@ def map(
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
-            attach_logprobs (bool, default False):
-                Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level
-                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0.
-
-
         Returns:
             bigframes.pandas.DataFrame: DataFrame with attached mapping results.
 
@@ -258,19 +246,6 @@ def map(
 
         attach_columns = [results[col] for col, _ in output_schema.items()]
 
-        def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series:
-            from bigframes import bigquery as bbq
-
-            logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0]
-            logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype(
-                "Float64"
-            )
-            logprobs.name = "logprob"
-            return logprobs
-
-        if attach_logprobs:
-            attach_columns.append(extract_logprob(results["full_response"]))
-
         from bigframes.core.reshape.api import concat
 
         return concat([self._df, *attach_columns], axis=1)
@@ -282,7 +257,6 @@ def classify(
         labels: Sequence[str],
         output_column: str = "result",
         ground_with_google_search: bool = False,
-        attach_logprobs=False,
     ):
         """
         Classifies the rows of dataframes based on user instruction into the provided labels.
@@ -337,11 +311,6 @@ def classify(
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
-            attach_logprobs (bool, default False):
-                Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level
-                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0.
-
-
         Returns:
             bigframes.pandas.DataFrame: DataFrame with classification result.
 
@@ -367,7 +336,6 @@ def classify(
             model,
             output_schema={output_column: "string"},
             ground_with_google_search=ground_with_google_search,
-            attach_logprobs=attach_logprobs,
         )
 
     def join(
@@ -376,7 +344,6 @@ def join(
         instruction: str,
         model,
         ground_with_google_search: bool = False,
-        attach_logprobs=False,
     ):
         """
         Joines two dataframes by applying the instruction over each pair of rows from
@@ -428,10 +395,6 @@ def join(
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
-            attach_logprobs (bool, default False):
-                Controls whether to attach an additional "logprob" column for each result. Logprobs are float-point values reflecting the confidence level
-                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinite and 0.
-
         Returns:
             bigframes.pandas.DataFrame: The joined dataframe.
 
@@ -510,7 +473,6 @@ def join(
             instruction,
            model,
             ground_with_google_search=ground_with_google_search,
-            attach_logprobs=attach_logprobs,
         ).reset_index(drop=True)
 
     def search(
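Beyond the signature cleanup, the only functional code removed above is the `extract_logprob` helper. For reference, its JSON-path logic (copied verbatim from the deleted lines) can still be expressed as a standalone function if a workflow happens to have the raw `full_response` prediction column in hand; whether that column is exposed to callers after this change is an assumption, not something the diff establishes.

    import bigframes.series
    from bigframes import bigquery as bbq


    def extract_logprob(full_response: bigframes.series.Series) -> bigframes.series.Series:
        # Mirrors the helper deleted from AIAccessor.map: pulls the first candidate's
        # avg_logprobs out of the raw Gemini prediction JSON and names it "logprob".
        candidates = bbq.json_extract_array(full_response, "$.candidates").list[0]
        logprobs = bbq.json_extract(candidates, "$.avg_logprobs").astype("Float64")
        logprobs.name = "logprob"
        return logprobs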

tests/system/large/operations/test_ai.py

Lines changed: 0 additions & 78 deletions
@@ -66,31 +66,6 @@ def test_filter(session, gemini_flash_model):
     )
 
 
-def test_filter_attach_logprob(session, gemini_flash_model):
-    df = dataframe.DataFrame(
-        data={
-            "number_1": [1, 2],
-            "number_2": [2, 1],
-            "col": [0, 0],
-        },
-        session=session,
-    )
-
-    with bigframes.option_context(
-        AI_OP_EXP_OPTION,
-        True,
-        THRESHOLD_OPTION,
-        10,
-    ):
-        actual_df = df.ai.filter(
-            "{number_1} is greater than {number_2}",
-            gemini_flash_model,
-            attach_logprobs=True,
-        ).to_pandas()
-
-    assert "logprob" in actual_df.columns
-
-
 def test_filter_multi_model(session, gemini_flash_model):
     with bigframes.option_context(
         AI_OP_EXP_OPTION,
@@ -259,31 +234,6 @@ def test_map(session, gemini_flash_model, output_schema, output_col):
     )
 
 
-def test_map_attach_logprob(session, gemini_flash_model):
-    df = dataframe.DataFrame(
-        data={
-            "ingredient_1": ["Burger Bun", "Soy Bean"],
-            "ingredient_2": ["Beef Patty", "Bittern"],
-            "gluten-free": [True, True],
-        },
-        session=session,
-    )
-
-    with bigframes.option_context(
-        AI_OP_EXP_OPTION,
-        True,
-        THRESHOLD_OPTION,
-        10,
-    ):
-        actual_df = df.ai.map(
-            "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.",
-            gemini_flash_model,
-            attach_logprobs=True,
-        ).to_pandas()
-
-    assert "logprob" in actual_df.columns
-
-
 def test_map_multimodel(session, gemini_flash_model):
     with bigframes.option_context(
         AI_OP_EXP_OPTION,
@@ -478,34 +428,6 @@ def test_join(instruction, session, gemini_flash_model):
     )
 
 
-def test_join_attach_logprob(session, gemini_flash_model):
-    cities = dataframe.DataFrame(
-        data={
-            "city": ["Seattle", "Berlin"],
-        },
-        session=session,
-    )
-    countries = dataframe.DataFrame(
-        data={"country": ["USA", "UK", "Germany"]},
-        session=session,
-    )
-
-    with bigframes.option_context(
-        AI_OP_EXP_OPTION,
-        True,
-        THRESHOLD_OPTION,
-        10,
-    ):
-        actual_df = cities.ai.join(
-            countries,
-            "{city} is in {country}",
-            gemini_flash_model,
-            attach_logprobs=True,
-        ).to_pandas()
-
-    assert "logprob" in actual_df.columns
-
-
 @pytest.mark.parametrize(
     ("reply"),
     [
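The three deleted tests asserted only on the "logprob" column, so they were dropped outright rather than kept under the renamed form the commit message describes, and the unchanged `test_filter`, `test_map`, and `test_join` already cover the default path. Purely as an illustration of what the deleted filter test reduces to once the parameter and the logprob assertion are gone, a trimmed version could look like the sketch below; the test name is hypothetical, and the option constants and fixtures are the ones this module already uses.

    def test_filter_without_logprob(session, gemini_flash_model):
        df = dataframe.DataFrame(
            data={
                "number_1": [1, 2],
                "number_2": [2, 1],
                "col": [0, 0],
            },
            session=session,
        )

        with bigframes.option_context(
            AI_OP_EXP_OPTION,
            True,
            THRESHOLD_OPTION,
            10,
        ):
            actual_df = df.ai.filter(
                "{number_1} is greater than {number_2}",
                gemini_flash_model,
            ).to_pandas()

        # No logprob attachment anymore: only the original columns should come back.
        assert "logprob" not in actual_df.columns
        assert set(actual_df.columns) == {"number_1", "number_2", "col"}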

Comments (0)