Skip to content

Commit c06d8db

Browse files
authored
chore!: remove ai.top_k(). (#1842)
* chore: remove ai.top_k(). * remove redundant import
1 parent 15e1277 commit c06d8db

File tree

4 files changed

+0
-409
lines changed

4 files changed

+0
-409
lines changed

bigframes/operations/ai.py

Lines changed: 0 additions & 203 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919
from typing import Dict, Iterable, List, Optional, Sequence, Union
2020
import warnings
2121

22-
import numpy as np
23-
2422
from bigframes import dtypes, exceptions, options
2523
from bigframes.core import guid, log_adapter
2624

@@ -586,207 +584,6 @@ def search(
586584

587585
return typing.cast(bigframes.dataframe.DataFrame, search_result)
588586

589-
def top_k(
    self,
    instruction: str,
    model,
    k: int = 10,
    ground_with_google_search: bool = False,
):
    """
    Ranks each tuple and returns the k best according to the instruction.

    This method uses a quickselect-style search: each round picks a random
    pivot row and asks an LLM (Large Language Model) to compare it with every
    other pending row. Rows are then marked as selected or excluded based on
    those comparisons, and rounds repeat until k rows are selected.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None
        >>> bpd.options.experiments.ai_operators = True
        >>> bpd.options.compute.ai_ops_confirmation_threshold = 25

        >>> import bigframes.ml.llm as llm
        >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")

        >>> df = bpd.DataFrame(
        ... {
        ...     "Animals": ["Dog", "Bird", "Cat", "Horse"],
        ...     "Sounds": ["Woof", "Chirp", "Meow", "Neigh"],
        ... })
        >>> df.ai.top_k("{Animals} are more popular as pets", model=model, k=2)
          Animals Sounds
        0     Dog   Woof
        2     Cat   Meow
        <BLANKLINE>
        [2 rows x 2 columns]

    Args:
        instruction (str):
            An instruction on how to rank the data. This value must contain
            a column reference by name enclosed in braces.
            For example, to reference a column named "Animals", use "{Animals}" in the
            instruction, like: "{Animals} are more popular as pets"

        model (bigframes.ml.llm.GeminiTextGenerator):
            A GeminiTextGenerator provided by the BigFrames ML package.

        k (int, default 10):
            The number of rows to return. Must be at least 1. When k is greater
            than or equal to the number of rows, every row is returned.

        ground_with_google_search (bool, default False):
            Enables Grounding with Google Search for the GeminiTextGenerator model.
            When set to True, the model incorporates relevant information from Google
            Search results into its responses, enhancing their accuracy and factualness.
            Note: Using this feature may impact billing costs. Refer to the pricing
            page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
            The default is `False`.

    Returns:
        bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. All
        columns of the original DataFrame are kept, and rows appear in their
        original index order rather than in rank order.

    Raises:
        NotImplementedError: when the AI operator experiment is off, or when the
            instruction refers to more than one column.
        ValueError: when the instruction refers to a non-existing column, when no
            columns are referred to, when the referenced column is named
            "index", or when k is less than 1.
    """
    if not options.experiments.ai_operators:
        raise NotImplementedError()

    import bigframes.dataframe
    import bigframes.series

    self._validate_model(model)
    columns = self._parse_columns(instruction)
    for column in columns:
        if column not in self._df.columns:
            raise ValueError(f"Column {column} not found.")
    if len(columns) > 1:
        raise NotImplementedError("AI top K are limited to a single column.")

    column = columns[0]

    # Validate cheap arguments up front so the caller is never warned about
    # billing or asked to confirm an operation that is going to be rejected.
    # `index` is reserved for the `reset_index` below.
    if column == "index":
        raise ValueError(
            "Column name 'index' is reserved. Please choose a different name."
        )

    if k < 1:
        raise ValueError("k must be an integer greater than or equal to 1.")

    if ground_with_google_search:
        msg = exceptions.format_message(
            "Enables Grounding with Google Search may impact billing cost. See pricing "
            "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
        )
        warnings.warn(msg, category=UserWarning)

    # Compute the row count once; the upper bound on pairwise comparisons is
    # n * (n - 1) / 2, kept in integer arithmetic.
    row_count = len(self._df)
    work_estimate = row_count * (row_count - 1) // 2
    self._confirm_operation(work_estimate)

    user_instruction = self._format_instruction(instruction, columns)

    # Nothing to rank: every row is in the top k. Return the full original
    # frame so the result has the same columns and dtypes as the k < n path.
    if k >= row_count:
        return self._df.copy()

    df: bigframes.dataframe.DataFrame = self._df[columns].copy()
    if df[column].dtype != dtypes.STRING_DTYPE:
        # The column is concatenated into the prompt text, so it must be a string.
        df[column] = df[column].astype(dtypes.STRING_DTYPE)

    # Keep the original index as "old_index" and add a fresh "index" column.
    # The partition step addresses rows through the "index" column; this is a
    # workaround for BigFrames methods that are not implemented yet.
    df = df.reset_index().rename(columns={"index": "old_index"}).reset_index()

    # Initialize a status column to track the selection status of each item.
    #  - None: Unknown/not yet processed
    #  - 1.0: Selected as part of the top-k items
    #  - -1.0: Excluded from the top-k items
    status_column = guid.generate_guid("status")
    df[status_column] = bigframes.series.Series(
        None, dtype=dtypes.FLOAT_DTYPE, session=df._session
    )

    num_selected = 0
    while num_selected < k:
        df, num_new_selected = self._topk_partition(
            df,
            column,
            status_column,
            user_instruction,
            model,
            k - num_selected,
            ground_with_google_search,
        )
        num_selected += num_new_selected

    # Filter the untouched original frame by the rows marked as selected.
    result_df: bigframes.dataframe.DataFrame = self._df.copy()
    return result_df[df.set_index("old_index")[status_column] > 0.0]
726-
727-
@staticmethod
def _topk_partition(
    df,
    column: str,
    status_column: str,
    user_instruction: str,
    model,
    k: int,
    ground_with_google_search: bool,
):
    """Run one pivot-based partition round over the still-pending rows.

    A random pending row is chosen as the pivot, and the model is asked to
    compare it against every other pending row. Depending on how many rows the
    model prefers over the pivot, rows are marked in ``status_column`` as
    selected (1.0) or excluded (-1.0).

    Args:
        df: Working frame holding ``column``, an "index" column, and
            ``status_column`` (null for rows not yet decided).
        column: Name of the column whose values are compared.
        status_column: Name of the column that records selection status.
        user_instruction: The formatted question placed in the prompt.
        model: Object whose ``predict`` method is called with the prompts.
        k: Number of rows that still need to be selected.
        ground_with_google_search: Forwarded to ``model.predict``.

    Returns:
        A ``(df, num_new_selected)`` tuple: the updated frame and the number
        of rows newly marked as selected in this round.
    """
    import bigframes.dataframe

    output_instruction = (
        "Given a question and two documents, choose the document that best answers "
        "the question. Respond with 'Document 1' or 'Document 2'. You must choose "
        "one, even if neither is ideal. "
    )

    # Choose the pivot at random among rows whose status is still unknown.
    pending = df[df[status_column].isna()]
    pivot_position = np.random.randint(0, pending.shape[0])
    pivot_id = pending.iloc[pivot_position]["index"]
    pivot_row = pending[pending["index"] == pivot_id]

    # Document 1 is always the pivot; Document 2 is each other pending row.
    candidates = pending[pending["index"] != pivot_id][column]
    prompt_prefix = (
        f"{output_instruction}\n\nQuestion: {user_instruction}\n"
        + f"\nDocument 1: {column} "
        + pivot_row.iloc[0][column]
        + f"\nDocument 2: {column} "
    )
    prompts = prompt_prefix + candidates  # type:ignore

    predictions = typing.cast(
        bigframes.dataframe.DataFrame,
        model.predict(
            prompts,
            temperature=0.0,
            ground_with_google_search=ground_with_google_search,
        ),
    )

    # An answer containing "2" is read as "Document 2", i.e. the candidate
    # row was preferred over the pivot.
    candidate_preferred = predictions["ml_generate_text_llm_result"].str.contains("2")
    more_relevant: bigframes.dataframe.DataFrame = df[candidate_preferred]
    less_relevant: bigframes.dataframe.DataFrame = df[~candidate_preferred]
    preferred_count = more_relevant.shape[0]

    if k < preferred_count:
        # More rows beat the pivot than are needed: the pivot and everything
        # below it are excluded, and nothing is selected yet.
        less_relevant[status_column] = -1.0
        pivot_row[status_column] = -1.0
        df = df.combine_first(less_relevant).combine_first(pivot_row)
        return df, 0

    # k >= preferred_count: every row that beat the pivot is selected.
    more_relevant[status_column] = 1.0
    df = df.combine_first(more_relevant)

    if k < preferred_count + 1:
        return df, preferred_count

    # There is room for at least one more row, so the pivot is selected too.
    pivot_row[status_column] = 1.0
    df = df.combine_first(pivot_row)
    return df, preferred_count + 1
789-
790587
def sim_join(
791588
self,
792589
other,

notebooks/experimental/ai_operators.ipynb

Lines changed: 0 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,129 +1064,6 @@
10641064
"animals.ai.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)"
10651065
]
10661066
},
1067-
{
1068-
"cell_type": "markdown",
1069-
"metadata": {
1070-
"id": "kU7BsyTyiouX"
1071-
},
1072-
"source": [
1073-
"## AI Top K"
1074-
]
1075-
},
1076-
{
1077-
"cell_type": "markdown",
1078-
"metadata": {
1079-
"id": "s9QePXEoiouX"
1080-
},
1081-
"source": [
1082-
"AI Top K selects the top K values based on your instruction. Here is an example:"
1083-
]
1084-
},
1085-
{
1086-
"cell_type": "code",
1087-
"execution_count": 18,
1088-
"metadata": {
1089-
"id": "bMQqtyZ2iouX"
1090-
},
1091-
"outputs": [],
1092-
"source": [
1093-
"df = bpd.DataFrame({\"Animals\": [\"Corgi\", \"Orange Cat\", \"Parrot\", \"Tarantula\"]})"
1094-
]
1095-
},
1096-
{
1097-
"cell_type": "markdown",
1098-
"metadata": {
1099-
"id": "KiljGBSCiouX"
1100-
},
1101-
"source": [
1102-
"You want to find the top two most popular pets:"
1103-
]
1104-
},
1105-
{
1106-
"cell_type": "code",
1107-
"execution_count": 19,
1108-
"metadata": {
1109-
"colab": {
1110-
"base_uri": "https://localhost:8080/",
1111-
"height": 159
1112-
},
1113-
"id": "OZv5WUGIiouX",
1114-
"outputId": "ae1cee27-cc31-455e-c4ac-c0a9a5cf4ca5"
1115-
},
1116-
"outputs": [
1117-
{
1118-
"name": "stderr",
1119-
"output_type": "stream",
1120-
"text": [
1121-
"/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n",
1122-
"`db_dtypes` is a preview feature and subject to change.\n",
1123-
" warnings.warn(msg, bfe.PreviewWarning)\n"
1124-
]
1125-
},
1126-
{
1127-
"data": {
1128-
"text/html": [
1129-
"<div>\n",
1130-
"<style scoped>\n",
1131-
" .dataframe tbody tr th:only-of-type {\n",
1132-
" vertical-align: middle;\n",
1133-
" }\n",
1134-
"\n",
1135-
" .dataframe tbody tr th {\n",
1136-
" vertical-align: top;\n",
1137-
" }\n",
1138-
"\n",
1139-
" .dataframe thead th {\n",
1140-
" text-align: right;\n",
1141-
" }\n",
1142-
"</style>\n",
1143-
"<table border=\"1\" class=\"dataframe\">\n",
1144-
" <thead>\n",
1145-
" <tr style=\"text-align: right;\">\n",
1146-
" <th></th>\n",
1147-
" <th>Animals</th>\n",
1148-
" </tr>\n",
1149-
" </thead>\n",
1150-
" <tbody>\n",
1151-
" <tr>\n",
1152-
" <th>0</th>\n",
1153-
" <td>Corgi</td>\n",
1154-
" </tr>\n",
1155-
" <tr>\n",
1156-
" <th>1</th>\n",
1157-
" <td>Orange Cat</td>\n",
1158-
" </tr>\n",
1159-
" </tbody>\n",
1160-
"</table>\n",
1161-
"<p>2 rows × 1 columns</p>\n",
1162-
"</div>[2 rows x 1 columns in total]"
1163-
],
1164-
"text/plain": [
1165-
" Animals\n",
1166-
"0 Corgi\n",
1167-
"1 Orange Cat\n",
1168-
"\n",
1169-
"[2 rows x 1 columns]"
1170-
]
1171-
},
1172-
"execution_count": 19,
1173-
"metadata": {},
1174-
"output_type": "execute_result"
1175-
}
1176-
],
1177-
"source": [
1178-
"df.ai.top_k(\"{Animals} are more popular as pets\", model=gemini_model, k=2)"
1179-
]
1180-
},
1181-
{
1182-
"cell_type": "markdown",
1183-
"metadata": {
1184-
"id": "dC8fyu3aiouX"
1185-
},
1186-
"source": [
1187-
"Under the hood, the AI top K operator performs pairwise comparisons with an LLM. The top K results are returned in the order of their indices instead of their ranks."
1188-
]
1189-
},
11901067
{
11911068
"cell_type": "markdown",
11921069
"metadata": {

0 commit comments

Comments
 (0)