chore: Add semantics.filter, guarded by an experiment flag. (#1040)

sycai · gcf-owl-bot[bot] · web-flow · commit 1bfa598ad533 · 2024-10-02T19:38:55.000-05:00
* feat: Add semantics.filter, guarded by an experiment flag. * fix test index * remove redundant line * Move semantic operators into a separate Semantics class * move test file location * check column references and update tests * check column references and update tests * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Clean up further * fix model name --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -74,6 +74,7 @@
 import bigframes.operations.aggregations
 import bigframes.operations.aggregations as agg_ops
 import bigframes.operations.plotting as plotting
+import bigframes.operations.semantics
 import bigframes.operations.structs
 import bigframes.series
 import bigframes.series as bf_series
@@ -3875,3 +3876,7 @@ def _throw_if_null_index(self, opname: str):
             raise bigframes.exceptions.NullIndexError(
                 f"DataFrame cannot perform {opname} as it has no index. Set an index using set_index."
             )
+
+    @property
+    def semantics(self):
+        """Return a ``Semantics`` accessor wrapping this DataFrame.
+
+        A new accessor is constructed on every access.
+        """
+        accessor_cls = bigframes.operations.semantics.Semantics
+        return accessor_cls(self)
diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py
@@ -0,0 +1,90 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import re
+import typing
+
+import bigframes
+
+
+class Semantics:
+    """Experimental accessor exposing LLM-backed semantic operators on a DataFrame.
+
+    Instances can only be constructed while
+    ``bigframes.options.experiments.semantic_operators`` is enabled.
+    """
+
+    def __init__(self, df) -> None:
+        """
+        Args:
+            df:
+                The DataFrame the semantic operators act on.
+
+        Raises:
+            NotImplementedError: when the semantic operator experiment is off.
+        """
+        if not bigframes.options.experiments.semantic_operators:
+            raise NotImplementedError(
+                "Semantic operators are experimental. Enable them with "
+                "`bigframes.options.experiments.semantic_operators = True`."
+            )
+
+        self._df = df
+
+    def filter(self, instruction: str, model):
+        """
+        Filters the DataFrame with the semantics of the user instruction.
+
+        For every row a prompt is built from the instruction plus the values of
+        the referenced columns. The row is kept when the model's answer contains
+        "true" (compared case-insensitively).
+
+        Args:
+            instruction:
+                An instruction on how to filter the data. This value must contain
+                column references by name, which should be wrapped in a pair of braces.
+                For example, if you have a column "food", you can refer to this column
+                in the instructions like:
+                "The {food} is healthy."
+
+            model:
+                A LLM model provided by Bigframes ML package.
+
+        Returns:
+            DataFrame filtered by the instruction.
+
+        Raises:
+            ValueError: when the instruction refers to a non-existing column, or when no
+                columns are referred to.
+        """
+
+        # Validate column references. The lookarounds skip doubled braces
+        # ("{{" / "}}"), which str.format treats as literal braces.
+        referenced = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", instruction)
+
+        if not referenced:
+            raise ValueError("No column references.")
+
+        # Drop repeated references (keeping first-seen order) so a column that
+        # is mentioned twice only contributes one context line.
+        columns = list(dict.fromkeys(referenced))
+
+        for column in columns:
+            if column not in self._df.columns:
+                raise ValueError(f"Column {column} not found.")
+
+        # Replace column references with names.
+        instruction = instruction.format(**{col: col for col in columns})
+
+        # Combine context from multiple columns. The context is accumulated in a
+        # standalone Series rather than in scratch "context"/"prompt" columns, so
+        # a referenced column that happens to be called "context" is not
+        # overwritten before it is read.
+        context = None
+        for col in columns:
+            piece = f"{col} is `" + self._df[col] + "`\n"
+            context = piece if context is None else context + piece
+
+        # Name the Series explicitly so the model input keeps the "prompt" label
+        # regardless of how names propagate through the additions above.
+        prompt = (
+            "Decide the following claim by only True and False: "
+            + instruction
+            + "\nContext:"
+            + context
+        ).rename("prompt")
+
+        # Imported here rather than at module level; only needed for the cast.
+        import bigframes.dataframe
+
+        results = typing.cast(bigframes.dataframe.DataFrame, model.predict(prompt))
+
+        return self._df[
+            results["ml_generate_text_llm_result"].str.lower().str.contains("true")
+        ]
diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb
@@ -0,0 +1,238 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bigframes\n",
+    "import bigframes.pandas as bpd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Enable the semantic operator experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:33: UserWarning: Semantic operators are still under experiments, and are subject to change in the future.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "bigframes.options.experiments.semantic_operators = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Prepare the LLM model. Here we are going to use Gemini 1.5 Flash."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:559: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
+      "  return global_session.get_global_session()\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 05cef003-6ac9-4cfc-b21c-3d6aed5d5b78 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:05cef003-6ac9-4cfc-b21c-3d6aed5d5b78&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import bigframes.ml.llm as llm\n",
+    "model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Semantic Filtering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job f9439f7e-13cd-4990-847b-d318f223af02 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f9439f7e-13cd-4990-847b-d318f223af02&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 51d2b023-6834-47f6-b17c-6b50d759ad88 is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:51d2b023-6834-47f6-b17c-6b50d759ad88&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 7979b08b-687e-41fc-8251-dfa0c0b41bed is DONE. 90 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:7979b08b-687e-41fc-8251-dfa0c0b41bed&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 60853851-bd33-4745-959e-bfddd970e4c4 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:60853851-bd33-4745-959e-bfddd970e4c4&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>country</th>\n",
+       "      <th>city</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Germany</td>\n",
+       "      <td>Berlin</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1 rows × 2 columns</p>\n",
+       "</div>[1 rows x 2 columns in total]"
+      ],
+      "text/plain": [
+       "   country    city\n",
+       "1  Germany  Berlin\n",
+       "\n",
+       "[1 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})\n",
+    "df.semantics.filter(\"{city} is the capital of {country}\", model)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/system/small/operations/conftest.py b/tests/system/small/operations/conftest.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import bigframes.ml.llm as llm
+
+
+@pytest.fixture(scope="session")
+def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator:
+    """Provide a session-scoped ``GeminiTextGenerator`` for "gemini-1.5-flash-001".
+
+    The model is constructed with the ``session`` and ``bq_connection`` fixtures.
+    """
+    model_kwargs = dict(
+        session=session,
+        connection_name=bq_connection,
+        model_name="gemini-1.5-flash-001",
+    )
+    return llm.GeminiTextGenerator(**model_kwargs)
diff --git a/tests/system/small/operations/test_semantics.py b/tests/system/small/operations/test_semantics.py