Skip to content

Commit 1162b3b

Browse files
authored
chore: implement semantic search (#1058)
1 parent a70d683 commit 1162b3b

File tree

4 files changed

+481
-26
lines changed

4 files changed

+481
-26
lines changed

bigframes/operations/semantics.py

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import re
1717
import typing
18-
from typing import List
18+
from typing import List, Optional
1919

2020
import bigframes
2121

@@ -279,6 +279,100 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
279279

280280
return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
281281

282+
def search(
    self,
    search_column: str,
    query: str,
    top_k: int,
    model,
    score_column: Optional[str] = None,
):
    """
    Performs semantic search on the DataFrame.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> import bigframes
        >>> bigframes.options.experiments.semantic_operators = True

        >>> import bigframes.ml.llm as llm
        >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")

        >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]})
        >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance')
            creatures  distance
        3  chimpanzee  0.781101
        <BLANKLINE>
        [1 rows x 2 columns]

    Args:
        search_column (str):
            The name of the column to search from.
        query (str):
            The search query.
        top_k (int):
            The number of nearest neighbors to return. Must be at least 1.
        model (TextEmbeddingGenerator):
            A TextEmbeddingGenerator provided by Bigframes ML package.
        score_column (Optional[str], default None):
            The name of the additional column containing the similarity scores. If None,
            this column won't be attached to the result.

    Returns:
        DataFrame: the DataFrame with the search result.

    Raises:
        ValueError: when top_k is less than 1, or when the search_column is not
            found in the data frame.
        TypeError: when the provided model is not TextEmbeddingGenerator.
    """

    # Reject a non-positive neighbor count up front, before any embedding
    # job or table materialization is started.
    if top_k < 1:
        raise ValueError("top_k must be an integer greater than or equal to 1.")

    if search_column not in self._df.columns:
        raise ValueError(f"Column {search_column} not found")

    # Imported locally, matching the rest of this method's lazy imports.
    import bigframes.ml.llm as llm

    if not isinstance(model, llm.TextEmbeddingGenerator):
        raise TypeError(f"Expect a text embedding model, but got: {type(model)}")

    # Embed the searched column, then materialize the embeddings with the
    # original index turned into a regular column so it survives the search.
    embedded_df = model.predict(self._df[search_column])
    embedded_table = embedded_df.reset_index().to_gbq()

    import bigframes.pandas as bpd

    # Embed the query text as a one-row frame; the columns are renamed so they
    # do not clash with the "content" / embedding columns of the base table.
    embedding_result_column = "ml_generate_embedding_result"
    query_df = model.predict(bpd.DataFrame({"query_id": [query]})).rename(
        columns={"content": "query_id", embedding_result_column: "embedding"}
    )

    import bigframes.bigquery as bbq

    # NOTE(review): set_index("index") assumes reset_index() above produced a
    # column literally named "index", i.e. the source frame has a single,
    # unnamed index and no existing "index" column -- confirm for named or
    # multi-level indexes.
    search_result = (
        bbq.vector_search(
            base_table=embedded_table,
            column_to_search=embedding_result_column,
            query=query_df,
            top_k=top_k,
        )
        .rename(columns={"content": search_column})
        .set_index("index")
    )

    # Restore the caller's index name on the result.
    search_result.index.name = self._df.index.name

    if score_column is not None:
        # Expose the "distance" column under the caller's chosen name.
        search_result = search_result.rename(columns={"distance": score_column})[
            [search_column, score_column]
        ]
    else:
        search_result = search_result[[search_column]]

    import bigframes.dataframe

    return typing.cast(bigframes.dataframe.DataFrame, search_result)
375+
282376

283377
def _validate_model(model):
284378
from bigframes.ml.llm import GeminiTextGenerator

0 commit comments

Comments
 (0)