googleapis
diff --git a/‎bigframes/operations/semantics.py
Lines changed: 111 additions & 23 deletions b/‎bigframes/operations/semantics.py
Lines changed: 111 additions & 23 deletions
diff --git a/‎notebooks/experimental/semantic_operators.ipynb
Lines changed: 149 additions & 19 deletions b/‎notebooks/experimental/semantic_operators.ipynb
Lines changed: 149 additions & 19 deletions
@@ -30,6 +30,24 @@ def filter(self, instruction: str, model):
         """
         Filters the DataFrame with the semantics of the user instruction.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import bigframes
+            >>> bigframes.options.experiments.semantic_operators = True
+
+            >>> import bigframes.ml.llm as llm
+            >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
+
+            >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]})
+            >>> df.semantics.filter("{city} is the capital of {country}", model)
+               country    city
+            1  Germany  Berlin
+            <BLANKLINE>
+            [1 rows x 2 columns]
+
         Args:
             instruction:
                 An instruction on how to filter the data. This value must contain
@@ -39,7 +57,7 @@ def filter(self, instruction: str, model):
                 "The {food} is healthy."
 
             model:
-                A LLM model provided by Bigframes ML package.
+                A GeminiTextGenerator provided by Bigframes ML package.
 
         Returns:
             DataFrame filtered by the instruction.
@@ -49,9 +67,89 @@ def filter(self, instruction: str, model):
             ValueError: when the instruction refers to a non-existing column, or when no
                 columns are referred to.
         """
+        _validate_model(model)
+
+        output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
+
+        from bigframes.dataframe import DataFrame
+
+        results = typing.cast(
+            DataFrame, model.predict(self._make_prompt(instruction, output_instruction))
+        )
+
+        return self._df[
+            results["ml_generate_text_llm_result"].str.lower().str.contains("true")
+        ]
+
+    def map(self, instruction: str, output_column: str, model):
+        """
+        Maps the DataFrame with the semantics of the user instruction.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import bigframes
+            >>> bigframes.options.experiments.semantic_operators = True
+
+            >>> import bigframes.ml.llm as llm
+            >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
+
+            >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
+            >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model)
+              ingredient_1 ingredient_2      food
+            0   Burger Bun   Beef Patty  Burger
+            <BLANKLINE>
+            1     Soy Bean      Bittern    Tofu
+            <BLANKLINE>
+            <BLANKLINE>
+            [2 rows x 3 columns]
+
+        Args:
+            instruction:
+                An instruction on how to map the data. This value must contain
+                column references by name, which should be wrapped in a pair of braces.
+                For example, if you have a column "food", you can refer to this column
+                in the instructions like:
+                "Get the ingredients of {food}."
 
+            output_column:
+                The column name of the mapping result.
+
+            model:
+                A GeminiTextGenerator provided by Bigframes ML package.
+
+        Returns:
+            DataFrame with attached mapping results.
+
+        Raises:
+            NotImplementedError: when the semantic operator experiment is off.
+            ValueError: when the instruction refers to a non-existing column, when no
+                columns are referred to, or when the model is not a GeminiTextGenerator.
+        """
+        _validate_model(model)
+
+        output_instruction = (
+            "Based on the provided context, answer the following instruction:"
+        )
+
+        from bigframes.series import Series
+
+        results = typing.cast(
+            Series,
+            model.predict(self._make_prompt(instruction, output_instruction))[
+                "ml_generate_text_llm_result"
+            ],
+        )
+
+        from bigframes.core.reshape import concat
+
+        return concat([self._df, results.rename(output_column)], axis=1)
+
+    def _make_prompt(self, user_instruction: str, output_instruction: str):
         # Validate column references
-        columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", instruction)
+        columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", user_instruction)
 
         if not columns:
             raise ValueError("No column references.")
@@ -61,30 +159,20 @@ def filter(self, instruction: str, model):
                 raise ValueError(f"Column {column} not found.")
 
         # Replace column references with names.
-        instruction = instruction.format(**{col: col for col in columns})
+        user_instruction = user_instruction.format(**{col: col for col in columns})
 
-        prompt_df = self._df.copy()
+        prompt_df = self._df[columns].copy()
+        prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: "
 
         # Combine context from multiple columns.
-        for idx, col in enumerate(columns):
-            if idx == 0:
-                prompt_df["context"] = f"{col} is `" + prompt_df[col] + "`\n"
-            else:
-                prompt_df["context"] += f"{col} is `" + prompt_df[col] + "`\n"
-
-        prompt_df["prompt"] = (
-            "Decide the folowing claim by only True and False: "
-            + instruction
-            + "\nContext:"
-            + prompt_df["context"]
-        )
+        for col in columns:
+            prompt_df["prompt"] += f"{col} is `" + prompt_df[col] + "`\n"
 
-        import bigframes.dataframe
+        return prompt_df["prompt"]
 
-        results = typing.cast(
-            bigframes.dataframe.DataFrame, model.predict(prompt_df["prompt"])
-        )
 
-        return self._df[
-            results["ml_generate_text_llm_result"].str.lower().str.contains("true")
-        ]
+def _validate_model(model):
+    from bigframes.ml.llm import GeminiTextGenerator  # function-scope import; presumably avoids a circular import (confirm)
+
+    if not isinstance(model, GeminiTextGenerator):
+        raise ValueError("Model is not GeminiText Generator")
@@ -65,7 +65,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 05cef003-6ac9-4cfc-b21c-3d6aed5d5b78 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:05cef003-6ac9-4cfc-b21c-3d6aed5d5b78&page=queryresults\">Open Job</a>"
+       "Query job 56de4aea-6e28-42fc-9760-b65c7a9c0ae7 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:56de4aea-6e28-42fc-9760-b65c7a9c0ae7&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -89,13 +89,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
-       "Query job f9439f7e-13cd-4990-847b-d318f223af02 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f9439f7e-13cd-4990-847b-d318f223af02&page=queryresults\">Open Job</a>"
+       "Query job bf5dd330-8e3e-45d2-b443-a61e595debba is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:bf5dd330-8e3e-45d2-b443-a61e595debba&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -115,19 +115,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 51d2b023-6834-47f6-b17c-6b50d759ad88 is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:51d2b023-6834-47f6-b17c-6b50d759ad88&page=queryresults\">Open Job</a>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 7979b08b-687e-41fc-8251-dfa0c0b41bed is DONE. 90 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:7979b08b-687e-41fc-8251-dfa0c0b41bed&page=queryresults\">Open Job</a>"
+       "Query job 8ede807b-ae35-4d44-aaac-0788aab8398c is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:8ede807b-ae35-4d44-aaac-0788aab8398c&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -139,7 +127,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2&page=queryresults\">Open Job</a>"
+       "Query job 1c7d1215-0661-4d4a-95eb-79dfbea65413 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1c7d1215-0661-4d4a-95eb-79dfbea65413&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -151,7 +139,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 60853851-bd33-4745-959e-bfddd970e4c4 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:60853851-bd33-4745-959e-bfddd970e4c4&page=queryresults\">Open Job</a>"
+       "Query job e562f224-9cd6-4b55-8bf0-145a3bd64540 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:e562f224-9cd6-4b55-8bf0-145a3bd64540&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -203,7 +191,7 @@
        "[1 rows x 2 columns]"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -212,6 +200,148 @@
     "df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})\n",
     "df.semantics.filter(\"{city} is the capital of {country}\", model)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Semantic Mapping"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = bpd.DataFrame(\n",
+    "        data={\"ingredient_1\": [\"Burger Bun\", \"Soy Bean\"], \"ingredient_2\": [\"Beef Patty\", \"Bittern\"]}\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job f62b4175-cb34-4e04-9a3f-4bfe1965f72f is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f62b4175-cb34-4e04-9a3f-4bfe1965f72f&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job b86fbb98-a566-4887-a938-f80fe3888b27 is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:b86fbb98-a566-4887-a938-f80fe3888b27&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job d4f09988-48d9-48df-a138-a7256b9a5766 is DONE. 34 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:d4f09988-48d9-48df-a138-a7256b9a5766&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 293d186f-359c-40d1-87f2-e8d525fd72ba is DONE. 93 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:293d186f-359c-40d1-87f2-e8d525fd72ba&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ingredient_1</th>\n",
+       "      <th>ingredient_2</th>\n",
+       "      <th>food</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Burger Bun</td>\n",
+       "      <td>Beef Patty</td>\n",
+       "      <td>Burger</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Soy Bean</td>\n",
+       "      <td>Bittern</td>\n",
+       "      <td>Tofu</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2 rows × 3 columns</p>\n",
+       "</div>[2 rows x 3 columns in total]"
+      ],
+      "text/plain": [
+       "  ingredient_1 ingredient_2      food\n",
+       "0   Burger Bun   Beef Patty  Burger \n",
+       "\n",
+       "1     Soy Bean      Bittern    Tofu \n",
+       "\n",
+       "\n",
+       "[2 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.semantics.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=model)"
+   ]
   }
  ],
  "metadata": {