|
15 | 15 |
|
16 | 16 | import re
|
17 | 17 | import typing
|
18 |
| -from typing import List |
| 18 | +from typing import List, Optional |
19 | 19 |
|
20 | 20 | import bigframes
|
21 | 21 |
|
@@ -279,6 +279,100 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
|
279 | 279 |
|
280 | 280 | return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
|
281 | 281 |
|
| 282 | + def search( |
| 283 | + self, |
| 284 | + search_column: str, |
| 285 | + query: str, |
| 286 | + top_k: int, |
| 287 | + model, |
| 288 | + score_column: Optional[str] = None, |
| 289 | + ): |
| 290 | + """ |
| 291 | + Performs semantic search on the DataFrame. |
| 292 | +
|
| 293 | + **Examples:** |
| 294 | +
|
| 295 | + >>> import bigframes.pandas as bpd |
| 296 | + >>> bpd.options.display.progress_bar = None |
| 297 | +
|
| 298 | + >>> import bigframes |
| 299 | + >>> bigframes.options.experiments.semantic_operators = True |
| 300 | +
|
| 301 | + >>> import bigframes.ml.llm as llm |
| 302 | + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004") |
| 303 | +
|
| 304 | + >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]}) |
| 305 | + >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance') |
| 306 | + creatures distance |
| 307 | + 3 chimpanzee 0.781101 |
| 308 | + <BLANKLINE> |
| 309 | + [1 rows x 2 columns] |
| 310 | +
|
| 311 | + Args: |
| 312 | + search_column (str): |
| 313 | + The name of the column to search from. |
| 314 | + query (str): |
| 315 | + The search query. |
| 316 | + top_k (int): |
| 317 | + The number of nearest neighbors to return. |
| 318 | + model (TextEmbeddingGenerator): |
| 319 | + A TextEmbeddingGenerator provided by the Bigframes ML package. |
| 320 | + score_column (Optional[str], default None): |
| 321 | + The name of the additional column containing the similarity scores. If None, |
| 322 | + this column won't be attached to the result. |
| 323 | +
|
| 324 | + Returns: |
| 325 | + DataFrame: the DataFrame with the search result. |
| 326 | +
|
| 327 | + Raises: |
| 328 | + ValueError: when the search_column is not found in the data frame. |
| 329 | + TypeError: when the provided model is not TextEmbeddingGenerator. |
| 330 | + """ |
| 331 | + |
| 332 | + if search_column not in self._df.columns: |
| 333 | + raise ValueError(f"Column {search_column} not found") |
| 334 | + |
| 335 | + import bigframes.ml.llm as llm |
| 336 | + |
| 337 | + if not isinstance(model, llm.TextEmbeddingGenerator): |
| 338 | + raise TypeError(f"Expect a text embedding model, but got: {type(model)}") |
| 339 | + |
| 340 | + embedded_df = model.predict(self._df[search_column]) |
| 341 | + embedded_table = embedded_df.reset_index().to_gbq() |
| 342 | + |
| 343 | + import bigframes.pandas as bpd |
| 344 | + |
| 345 | + embedding_result_column = "ml_generate_embedding_result" |
| 346 | + query_df = model.predict(bpd.DataFrame({"query_id": [query]})).rename( |
| 347 | + columns={"content": "query_id", embedding_result_column: "embedding"} |
| 348 | + ) |
| 349 | + |
| 350 | + import bigframes.bigquery as bbq |
| 351 | + |
| 352 | + search_result = ( |
| 353 | + bbq.vector_search( |
| 354 | + base_table=embedded_table, |
| 355 | + column_to_search=embedding_result_column, |
| 356 | + query=query_df, |
| 357 | + top_k=top_k, |
| 358 | + ) |
| 359 | + .rename(columns={"content": search_column}) |
| 360 | + .set_index("index") |
| 361 | + ) |
| 362 | + |
| 363 | + search_result.index.name = self._df.index.name |
| 364 | + |
| 365 | + if score_column is not None: |
| 366 | + search_result = search_result.rename(columns={"distance": score_column})[ |
| 367 | + [search_column, score_column] |
| 368 | + ] |
| 369 | + else: |
| 370 | + search_result = search_result[[search_column]] |
| 371 | + |
| 372 | + import bigframes.dataframe |
| 373 | + |
| 374 | + return typing.cast(bigframes.dataframe.DataFrame, search_result) |
| 375 | + |
282 | 376 |
|
283 | 377 | def _validate_model(model):
|
284 | 378 | from bigframes.ml.llm import GeminiTextGenerator
|
|
0 commit comments