chore: implement semantics cluster_by (#1067)

chelsea-lin · web-flow · commit f6282b0f359c · 2024-10-08T19:33:06.000-05:00
* chore: implement semantics cluster_by

* address comments and fix tests
diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py
@@ -194,6 +194,88 @@ def agg(
 
         return df[column]
 
+    def cluster_by(
+        self,
+        column: str,
+        output_column: str,
+        model,
+        n_clusters: int = 5,
+    ):
+        """
+        Clusters data based on the semantic similarity of text within a specified column.
+
+        This method leverages a language model to generate text embeddings for each value in
+        the given column. These embeddings capture the semantic meaning of the text.
+        The data is then grouped into `n` clusters using the k-means clustering algorithm,
+        which groups data points based on the similarity of their embeddings.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> bpd.options.experiments.semantic_operators = True
+
+            >>> import bigframes.ml.llm as llm
+            >>> model = llm.TextEmbeddingGenerator()
+
+            >>> df = bpd.DataFrame({
+            ...     "Product": ["Smartphone", "Laptop", "T-shirt", "Jeans"],
+            ... })
+            >>> df.semantics.cluster_by("Product", "Cluster ID", model, n_clusters=2)
+                    Product  Cluster ID
+            0    Smartphone           2
+            1        Laptop           2
+            2       T-shirt           1
+            3         Jeans           1
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Args:
+            column (str):
+                An column name to perform the similarity clustering.
+
+            output_column (str):
+                An output column to store the clustering ID.
+
+            model (bigframes.ml.llm.TextEmbeddingGenerator):
+                A TextEmbeddingGenerator provided by Bigframes ML package.
+
+            n_clusters (int, default 5):
+                Default 5. Number of clusters to be detected.
+
+        Returns:
+            bigframes.dataframe.DataFrame: A new DataFrame with the clustering output column.
+
+        Raises:
+            NotImplementedError: when the semantic operator experiment is off.
+            ValueError: when the column refers to a non-existing column.
+        """
+
+        import bigframes.dataframe
+        import bigframes.ml.cluster as cluster
+        import bigframes.ml.llm as llm
+
+        if not isinstance(model, llm.TextEmbeddingGenerator):
+            raise TypeError(f"Expect a text embedding model, but got: {type(model)}")
+
+        if column not in self._df.columns:
+            raise ValueError(f"Column {column} not found.")
+
+        if n_clusters <= 1:
+            raise ValueError(
+                f"Invalid value for `n_clusters`: {n_clusters}."
+                "It must be greater than 1."
+            )
+
+        df: bigframes.dataframe.DataFrame = self._df.copy()
+        embeddings_df = model.predict(df[column])
+
+        cluster_model = cluster.KMeans(n_clusters=n_clusters)
+        cluster_model.fit(embeddings_df[["ml_generate_embedding_result"]])
+        clustered_result = cluster_model.predict(embeddings_df)
+        df[output_column] = clustered_result["CENTROID_ID"]
+        return df
+
     def filter(self, instruction: str, model):
         """
         Filters the DataFrame with the semantics of the user instruction.
diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb
@@ -33,7 +33,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/usr/local/google/home/chelsealin/src/bigframes3/bigframes/_config/experiment_options.py:33: UserWarning: Semantic operators are still under experiments, and are subject to change in the future.\n",
+      "/usr/local/google/home/chelsealin/src/bigframes/bigframes/_config/experiment_options.py:33: UserWarning: Semantic operators are still under experiments, and are subject to change in the future.\n",
       "  warnings.warn(\n"
      ]
     }
@@ -51,21 +51,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/chelsealin/src/bigframes3/bigframes/pandas/__init__.py:559: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
-      "  return global_session.get_global_session()\n"
-     ]
+     "data": {
+      "text/html": [
+       "Query job 13e4b10e-70cf-4b93-8c59-5f6f5fb10aeb is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:13e4b10e-70cf-4b93-8c59-5f6f5fb10aeb&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
      "data": {
       "text/html": [
-       "Query job aef2dd7b-bdad-4dda-91be-867e8dac2613 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:aef2dd7b-bdad-4dda-91be-867e8dac2613&page=queryresults\">Open Job</a>"
+       "Query job 559dd42c-573d-4b00-8fe9-b7061afdd672 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:559dd42c-573d-4b00-8fe9-b7061afdd672&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -77,7 +81,8 @@
    ],
    "source": [
     "import bigframes.ml.llm as llm\n",
-    "gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)"
+    "gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)\n",
+    "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")"
    ]
   },
   {
@@ -657,28 +662,6 @@
     "## Semantic Search"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job 48aafee2-4948-4677-ab02-a94a71b9f6e2 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:48aafee2-4948-4677-ab02-a94a71b9f6e2&page=queryresults\">Open Job</a>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 12,
@@ -1156,6 +1139,188 @@
     "agg_df = df.semantics.agg(\"Find the shared first name of actors in {Movies}. One word answer.\", model=gemini_model)\n",
     "agg_df"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Semantic Cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 92ce82b9-c521-42af-a2b7-6114b27a9ce4 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:92ce82b9-c521-42af-a2b7-6114b27a9ce4&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/chelsealin/src/bigframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 8c4c7391-2889-4cf1-bbfa-5cbf6b144db5 is DONE. 10 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:8c4c7391-2889-4cf1-bbfa-5cbf6b144db5&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 19ae7cc6-3d61-4c69-9148-1956fafb577a is DONE. 30.8 kB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:19ae7cc6-3d61-4c69-9148-1956fafb577a&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 7c2b62df-3bed-4469-9ffc-131843efe25e is DONE. 30.7 kB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:7c2b62df-3bed-4469-9ffc-131843efe25e&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 74155e34-d8ca-4fba-8b93-33b1b325a5f1 is DONE. 138.9 kB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:74155e34-d8ca-4fba-8b93-33b1b325a5f1&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job d9151043-a9c3-4388-8268-ef41162012b7 is DONE. 80 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:d9151043-a9c3-4388-8268-ef41162012b7&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job d2c4ad9a-c637-490e-a2cf-37d7f5a34024 is DONE. 170 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:d2c4ad9a-c637-490e-a2cf-37d7f5a34024&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Product</th>\n",
+       "      <th>Cluster ID</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Smartphone</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Laptop</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Coffee Maker</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>T-shirt</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Jeans</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 2 columns</p>\n",
+       "</div>[5 rows x 2 columns in total]"
+      ],
+      "text/plain": [
+       "        Product  Cluster ID\n",
+       "0    Smartphone           3\n",
+       "1        Laptop           3\n",
+       "2  Coffee Maker           1\n",
+       "3       T-shirt           2\n",
+       "4         Jeans           2\n",
+       "\n",
+       "[5 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = bpd.DataFrame({'Product': ['Smartphone', 'Laptop', 'Coffee Maker', 'T-shirt', 'Jeans']})\n",
+    "\n",
+    "df.semantics.cluster_by(column='Product', output_column='Cluster ID', model=text_embedding_model, n_clusters=3)"
+   ]
   }
  ],
  "metadata": {
@@ -1174,7 +1339,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.12.1"
   }
  },
  "nbformat": 4,
diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py