chore: improve error messages for semantic operators (#1078)

chelsea-lin · web-flow · commit 9d6d9ddcf699 · 2024-10-11T17:22:28.000-05:00
* chore: improve error messages for semantic operators

* fix tests
diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py
@@ -104,6 +104,12 @@ def agg(
         for column in columns:
             if column not in self._df.columns:
                 raise ValueError(f"Column {column} not found.")
+            if self._df[column].dtype != dtypes.STRING_DTYPE:
+                raise TypeError(
+                    "Semantics aggregated column must be a string type, not "
+                    f"{self._df[column].dtype}"  # report the offending dtype
+                )
+
         if len(columns) > 1:
             raise NotImplementedError(
                 "Semantic aggregations are limited to a single column."
@@ -324,6 +330,11 @@ def filter(self, instruction: str, model):
         for column in columns:
             if column not in self._df.columns:
                 raise ValueError(f"Column {column} not found.")
+            if self._df[column].dtype != dtypes.STRING_DTYPE:
+                raise TypeError(
+                    "Semantics filter column must be a string type, not "
+                    f"{self._df[column].dtype}"  # report the offending dtype
+                )
 
         user_instruction = self._format_instruction(instruction, columns)
         output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
@@ -372,7 +383,7 @@ def map(self, instruction: str, output_column: str, model):
                 in the instructions like:
                 "Get the ingredients of {food}."
 
-            result_column_name:
+            output_column:
                 The column name of the mapping result.
 
             model:
@@ -391,6 +402,11 @@ def map(self, instruction: str, output_column: str, model):
         for column in columns:
             if column not in self._df.columns:
                 raise ValueError(f"Column {column} not found.")
+            if self._df[column].dtype != dtypes.STRING_DTYPE:
+                raise TypeError(
+                    "Semantics map column must be a string type, not "
+                    f"{self._df[column].dtype}"  # report the offending dtype
+                )
 
         user_instruction = self._format_instruction(instruction, columns)
         output_instruction = (
@@ -512,8 +528,11 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
             else:
                 raise ValueError(f"Column {col} not found")
 
-        if not left_columns or not right_columns:
-            raise ValueError()
+        if not left_columns:
+            raise ValueError("No left column references.")
+
+        if not right_columns:
+            raise ValueError("No right column references.")
 
         joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))
 
@@ -570,13 +589,16 @@ def search(
         """
 
         if search_column not in self._df.columns:
-            raise ValueError(f"Column {search_column} not found")
+            raise ValueError(f"Column `{search_column}` not found")
 
         import bigframes.ml.llm as llm
 
         if not isinstance(model, llm.TextEmbeddingGenerator):
             raise TypeError(f"Expect a text embedding model, but got: {type(model)}")
 
+        if top_k < 1:
+            raise ValueError("top_k must be an integer greater than or equal to 1.")
+
         embedded_df = model.predict(self._df[search_column])
         embedded_table = embedded_df.reset_index().to_gbq()
 
@@ -855,6 +877,9 @@ def sim_join(
                 f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
             )
 
+        if top_k < 1:
+            raise ValueError("top_k must be an integer greater than or equal to 1.")
+
         base_table_embedding_column = guid.generate_guid()
         base_table = self._attach_embedding(
             other, right_on, base_table_embedding_column, model
@@ -926,4 +951,4 @@ def _validate_model(model):
         from bigframes.ml.llm import GeminiTextGenerator
 
         if not isinstance(model, GeminiTextGenerator):
-            raise ValueError("Model is not GeminiText Generator")
+            raise TypeError("Model is not GeminiText Generator")
diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py
@@ -82,21 +82,33 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column):
             marks=pytest.mark.xfail(raises=ValueError),
         ),
         pytest.param(
-            "{city} is in the {non_existing_column}",
+            "{Movies} is good",
             id="non_existing_column",
             marks=pytest.mark.xfail(raises=ValueError),
         ),
         pytest.param(
-            "{city} is in the {country}",
+            "{Movies} is better than {Movies}",
             id="two_columns",
             marks=pytest.mark.xfail(raises=NotImplementedError),
         ),
+        pytest.param(
+            "{Year}",
+            id="invalid_type",
+            marks=pytest.mark.xfail(raises=TypeError),
+        ),
     ],
 )
 def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model):
     bigframes.options.experiments.semantic_operators = True
     df = dataframe.DataFrame(
-        {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}
+        data={
+            "Movies": [
+                "Titanic",
+                "The Wolf of Wall Street",
+                "Killers of the Flower Moon",
+            ],
+            "Year": [1997, 2013, 2023],
+        },
     )
     df.semantics.agg(instruction, gemini_flash_model)
 
@@ -229,15 +241,26 @@ def test_filter_single_column_reference(session, gemini_flash_model):
 @pytest.mark.parametrize(
     "instruction",
     [
-        "No column reference",
-        "{city} is in the {non_existing_column}",
+        pytest.param(
+            "No column reference",
+            id="zero_column",
+            marks=pytest.mark.xfail(raises=ValueError),
+        ),
+        pytest.param(
+            "{city} is in the {non_existing_column}",
+            id="non_existing_column",
+            marks=pytest.mark.xfail(raises=ValueError),
+        ),
+        pytest.param(
+            "{id}",
+            id="invalid_type",
+            marks=pytest.mark.xfail(raises=TypeError),
+        ),
     ],
 )
 def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model):
     bigframes.options.experiments.semantic_operators = True
-    df = dataframe.DataFrame(
-        {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}
-    )
+    df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]})
 
     with pytest.raises(ValueError):
         df.semantics.filter(instruction, gemini_flash_model)
@@ -249,7 +272,7 @@ def test_filter_invalid_model_raise_error():
         {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}
     )
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         df.semantics.filter("{city} is the capital of {country}", None)
 
 
@@ -290,14 +313,28 @@ def test_map(session, gemini_flash_model):
 @pytest.mark.parametrize(
     "instruction",
     [
-        "No column reference",
-        "What is the food made from {ingredient_1} and {non_existing_column}?}",
+        pytest.param(
+            "No column reference",
+            id="zero_column",
+            marks=pytest.mark.xfail(raises=ValueError),
+        ),
+        pytest.param(
+            "What is the food made from {ingredient_1} and {non_existing_column}?}",
+            id="non_existing_column",
+            marks=pytest.mark.xfail(raises=ValueError),
+        ),
+        pytest.param(
+            "{id}",
+            id="invalid_type",
+            marks=pytest.mark.xfail(raises=TypeError),
+        ),
     ],
 )
 def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model):
     bigframes.options.experiments.semantic_operators = True
     df = dataframe.DataFrame(
         data={
+            "id": [1, 2],
             "ingredient_1": ["Burger Bun", "Soy Bean"],
             "ingredient_2": ["Beef Patty", "Bittern"],
         }
@@ -316,7 +353,7 @@ def test_map_invalid_model_raise_error():
         },
     )
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         df.semantics.map(
             "What is the food made from {ingredient_1} and {ingredient_2}? One word only.",
             "food",
@@ -462,7 +499,7 @@ def test_join_invalid_model_raise_error():
     cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]})
     countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]})
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         cities.semantics.join(countries, "{city} is in {country}", None)
 
 
@@ -528,6 +565,19 @@ def test_search_invalid_model_raises_error(session):
         df.semantics.search("creatures", "monkey", top_k=2, model=None)
 
 
+def test_search_invalid_top_k_raises_error(session, text_embedding_generator):
+    bigframes.options.experiments.semantic_operators = True
+    df = dataframe.DataFrame(
+        data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]},
+        session=session,
+    )
+    # search() rejects top_k < 1, so top_k=0 must raise ValueError.
+    with pytest.raises(ValueError):
+        df.semantics.search(
+            "creatures", "monkey", top_k=0, model=text_embedding_generator
+        )
+
+
 @pytest.mark.parametrize(
     "score_column",
     [
@@ -614,6 +664,27 @@ def test_sim_join_invalid_model_raises_error(session):
         )
 
 
+def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator):
+    bigframes.options.experiments.semantic_operators = True
+    df1 = dataframe.DataFrame(
+        data={"creatures": ["salmon", "cat"]},
+        session=session,
+    )
+    df2 = dataframe.DataFrame(
+        data={"creatures": ["dog", "tuna"]},
+        session=session,
+    )
+    # sim_join() rejects top_k < 1, so top_k=0 must raise ValueError.
+    with pytest.raises(ValueError):
+        df1.semantics.sim_join(
+            df2,
+            left_on="creatures",
+            right_on="creatures",
+            top_k=0,
+            model=text_embedding_generator,
+        )
+
+
 def test_sim_join_data_too_large_raises_error(session, text_embedding_generator):
     bigframes.options.experiments.semantic_operators = True
     df1 = dataframe.DataFrame(