Skip to content

Commit 3988eda

Browse files
chore: Implement semantic map (#1045)
* chore: implement semantic map * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * validate model type * update API and tests * migrate tests to large system and update doc examples * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add more test and polish parameters --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent c86e002 commit 3988eda

File tree

6 files changed

+435
-106
lines changed

6 files changed

+435
-106
lines changed

bigframes/operations/semantics.py

Lines changed: 111 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,24 @@ def filter(self, instruction: str, model):
3030
"""
3131
Filters the DataFrame with the semantics of the user instruction.
3232
33+
**Examples:**
34+
35+
>>> import bigframes.pandas as bpd
36+
>>> bpd.options.display.progress_bar = None
37+
38+
>>> import bigframes
39+
>>> bigframes.options.experiments.semantic_operators = True
40+
41+
>>> import bigframes.ml.llm as llm
42+
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
43+
44+
>>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]})
45+
>>> df.semantics.filter("{city} is the capital of {country}", model)
46+
country city
47+
1 Germany Berlin
48+
<BLANKLINE>
49+
[1 rows x 2 columns]
50+
3351
Args:
3452
instruction:
3553
An instruction on how to filter the data. This value must contain
@@ -39,7 +57,7 @@ def filter(self, instruction: str, model):
3957
"The {food} is healthy."
4058
4159
model:
42-
A LLM model provided by Bigframes ML package.
60+
A GeminiTextGenerator provided by Bigframes ML package.
4361
4462
Returns:
4563
DataFrame filtered by the instruction.
@@ -49,9 +67,89 @@ def filter(self, instruction: str, model):
4967
ValueError: when the instruction refers to a non-existing column, or when no
5068
columns are referred to.
5169
"""
70+
_validate_model(model)
71+
72+
output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
73+
74+
from bigframes.dataframe import DataFrame
75+
76+
results = typing.cast(
77+
DataFrame, model.predict(self._make_prompt(instruction, output_instruction))
78+
)
79+
80+
return self._df[
81+
results["ml_generate_text_llm_result"].str.lower().str.contains("true")
82+
]
83+
84+
def map(self, instruction: str, output_column: str, model):
85+
"""
86+
Maps the DataFrame with the semantics of the user instruction.
87+
88+
**Examples:**
89+
90+
>>> import bigframes.pandas as bpd
91+
>>> bpd.options.display.progress_bar = None
92+
93+
>>> import bigframes
94+
>>> bigframes.options.experiments.semantic_operators = True
95+
96+
>>> import bigframes.ml.llm as llm
97+
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
98+
99+
>>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
100+
>>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model)
101+
ingredient_1 ingredient_2 food
102+
0 Burger Bun Beef Patty Burger
103+
<BLANKLINE>
104+
1 Soy Bean Bittern Tofu
105+
<BLANKLINE>
106+
<BLANKLINE>
107+
[2 rows x 3 columns]
108+
109+
Args:
110+
instruction:
111+
An instruction on how to map the data. This value must contain
112+
column references by name, which should be wrapped in a pair of braces.
113+
For example, if you have a column "food", you can refer to this column
114+
in the instructions like:
115+
"Get the ingredients of {food}."
52116
117+
output_column:
118+
The column name of the mapping result.
119+
120+
model:
121+
A GeminiTextGenerator provided by Bigframes ML package.
122+
123+
Returns:
124+
DataFrame with attached mapping results.
125+
126+
Raises:
127+
NotImplementedError: when the semantic operator experiment is off.
128+
ValueError: when the instruction refers to a non-existing column, or when no
129+
columns are referred to.
130+
"""
131+
_validate_model(model)
132+
133+
output_instruction = (
134+
"Based on the provided context, answer the following instruction:"
135+
)
136+
137+
from bigframes.series import Series
138+
139+
results = typing.cast(
140+
Series,
141+
model.predict(self._make_prompt(instruction, output_instruction))[
142+
"ml_generate_text_llm_result"
143+
],
144+
)
145+
146+
from bigframes.core.reshape import concat
147+
148+
return concat([self._df, results.rename(output_column)], axis=1)
149+
150+
def _make_prompt(self, user_instruction: str, output_instruction: str):
53151
# Validate column references
54-
columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", instruction)
152+
columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", user_instruction)
55153

56154
if not columns:
57155
raise ValueError("No column references.")
@@ -61,30 +159,20 @@ def filter(self, instruction: str, model):
61159
raise ValueError(f"Column {column} not found.")
62160

63161
# Replace column references with names.
64-
instruction = instruction.format(**{col: col for col in columns})
162+
user_instruction = user_instruction.format(**{col: col for col in columns})
65163

66-
prompt_df = self._df.copy()
164+
prompt_df = self._df[columns].copy()
165+
prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: "
67166

68167
# Combine context from multiple columns.
69-
for idx, col in enumerate(columns):
70-
if idx == 0:
71-
prompt_df["context"] = f"{col} is `" + prompt_df[col] + "`\n"
72-
else:
73-
prompt_df["context"] += f"{col} is `" + prompt_df[col] + "`\n"
74-
75-
prompt_df["prompt"] = (
76-
"Decide the folowing claim by only True and False: "
77-
+ instruction
78-
+ "\nContext:"
79-
+ prompt_df["context"]
80-
)
168+
for col in columns:
169+
prompt_df["prompt"] += f"{col} is `" + prompt_df[col] + "`\n"
81170

82-
import bigframes.dataframe
171+
return prompt_df["prompt"]
83172

84-
results = typing.cast(
85-
bigframes.dataframe.DataFrame, model.predict(prompt_df["prompt"])
86-
)
87173

88-
return self._df[
89-
results["ml_generate_text_llm_result"].str.lower().str.contains("true")
90-
]
174+
def _validate_model(model):
175+
from bigframes.ml.llm import GeminiTextGenerator
176+
177+
if not isinstance(model, GeminiTextGenerator):
178+
raise ValueError("Model is not GeminiText Generator")

notebooks/experimental/semantic_operators.ipynb

Lines changed: 149 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
{
6666
"data": {
6767
"text/html": [
68-
"Query job 05cef003-6ac9-4cfc-b21c-3d6aed5d5b78 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:05cef003-6ac9-4cfc-b21c-3d6aed5d5b78&page=queryresults\">Open Job</a>"
68+
"Query job 56de4aea-6e28-42fc-9760-b65c7a9c0ae7 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:56de4aea-6e28-42fc-9760-b65c7a9c0ae7&page=queryresults\">Open Job</a>"
6969
],
7070
"text/plain": [
7171
"<IPython.core.display.HTML object>"
@@ -89,13 +89,13 @@
8989
},
9090
{
9191
"cell_type": "code",
92-
"execution_count": 5,
92+
"execution_count": 4,
9393
"metadata": {},
9494
"outputs": [
9595
{
9696
"data": {
9797
"text/html": [
98-
"Query job f9439f7e-13cd-4990-847b-d318f223af02 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f9439f7e-13cd-4990-847b-d318f223af02&page=queryresults\">Open Job</a>"
98+
"Query job bf5dd330-8e3e-45d2-b443-a61e595debba is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:bf5dd330-8e3e-45d2-b443-a61e595debba&page=queryresults\">Open Job</a>"
9999
],
100100
"text/plain": [
101101
"<IPython.core.display.HTML object>"
@@ -115,19 +115,7 @@
115115
{
116116
"data": {
117117
"text/html": [
118-
"Query job 51d2b023-6834-47f6-b17c-6b50d759ad88 is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:51d2b023-6834-47f6-b17c-6b50d759ad88&page=queryresults\">Open Job</a>"
119-
],
120-
"text/plain": [
121-
"<IPython.core.display.HTML object>"
122-
]
123-
},
124-
"metadata": {},
125-
"output_type": "display_data"
126-
},
127-
{
128-
"data": {
129-
"text/html": [
130-
"Query job 7979b08b-687e-41fc-8251-dfa0c0b41bed is DONE. 90 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:7979b08b-687e-41fc-8251-dfa0c0b41bed&page=queryresults\">Open Job</a>"
118+
"Query job 8ede807b-ae35-4d44-aaac-0788aab8398c is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:8ede807b-ae35-4d44-aaac-0788aab8398c&page=queryresults\">Open Job</a>"
131119
],
132120
"text/plain": [
133121
"<IPython.core.display.HTML object>"
@@ -139,7 +127,7 @@
139127
{
140128
"data": {
141129
"text/html": [
142-
"Query job 6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2&page=queryresults\">Open Job</a>"
130+
"Query job 1c7d1215-0661-4d4a-95eb-79dfbea65413 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1c7d1215-0661-4d4a-95eb-79dfbea65413&page=queryresults\">Open Job</a>"
143131
],
144132
"text/plain": [
145133
"<IPython.core.display.HTML object>"
@@ -151,7 +139,7 @@
151139
{
152140
"data": {
153141
"text/html": [
154-
"Query job 60853851-bd33-4745-959e-bfddd970e4c4 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:60853851-bd33-4745-959e-bfddd970e4c4&page=queryresults\">Open Job</a>"
142+
"Query job e562f224-9cd6-4b55-8bf0-145a3bd64540 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:e562f224-9cd6-4b55-8bf0-145a3bd64540&page=queryresults\">Open Job</a>"
155143
],
156144
"text/plain": [
157145
"<IPython.core.display.HTML object>"
@@ -203,7 +191,7 @@
203191
"[1 rows x 2 columns]"
204192
]
205193
},
206-
"execution_count": 5,
194+
"execution_count": 4,
207195
"metadata": {},
208196
"output_type": "execute_result"
209197
}
@@ -212,6 +200,148 @@
212200
"df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})\n",
213201
"df.semantics.filter(\"{city} is the capital of {country}\", model)"
214202
]
203+
},
204+
{
205+
"cell_type": "markdown",
206+
"metadata": {},
207+
"source": [
208+
"## Semantic Mapping"
209+
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": 5,
214+
"metadata": {},
215+
"outputs": [],
216+
"source": [
217+
"df = bpd.DataFrame(\n",
218+
" data={\"ingredient_1\": [\"Burger Bun\", \"Soy Bean\"], \"ingredient_2\": [\"Beef Patty\", \"Bittern\"]}\n",
219+
" )"
220+
]
221+
},
222+
{
223+
"cell_type": "code",
224+
"execution_count": 6,
225+
"metadata": {},
226+
"outputs": [
227+
{
228+
"data": {
229+
"text/html": [
230+
"Query job f62b4175-cb34-4e04-9a3f-4bfe1965f72f is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f62b4175-cb34-4e04-9a3f-4bfe1965f72f&page=queryresults\">Open Job</a>"
231+
],
232+
"text/plain": [
233+
"<IPython.core.display.HTML object>"
234+
]
235+
},
236+
"metadata": {},
237+
"output_type": "display_data"
238+
},
239+
{
240+
"name": "stderr",
241+
"output_type": "stream",
242+
"text": [
243+
"/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n",
244+
" warnings.warn(\n"
245+
]
246+
},
247+
{
248+
"data": {
249+
"text/html": [
250+
"Query job b86fbb98-a566-4887-a938-f80fe3888b27 is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:b86fbb98-a566-4887-a938-f80fe3888b27&page=queryresults\">Open Job</a>"
251+
],
252+
"text/plain": [
253+
"<IPython.core.display.HTML object>"
254+
]
255+
},
256+
"metadata": {},
257+
"output_type": "display_data"
258+
},
259+
{
260+
"data": {
261+
"text/html": [
262+
"Query job d4f09988-48d9-48df-a138-a7256b9a5766 is DONE. 34 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:d4f09988-48d9-48df-a138-a7256b9a5766&page=queryresults\">Open Job</a>"
263+
],
264+
"text/plain": [
265+
"<IPython.core.display.HTML object>"
266+
]
267+
},
268+
"metadata": {},
269+
"output_type": "display_data"
270+
},
271+
{
272+
"data": {
273+
"text/html": [
274+
"Query job 293d186f-359c-40d1-87f2-e8d525fd72ba is DONE. 93 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:293d186f-359c-40d1-87f2-e8d525fd72ba&page=queryresults\">Open Job</a>"
275+
],
276+
"text/plain": [
277+
"<IPython.core.display.HTML object>"
278+
]
279+
},
280+
"metadata": {},
281+
"output_type": "display_data"
282+
},
283+
{
284+
"data": {
285+
"text/html": [
286+
"<div>\n",
287+
"<style scoped>\n",
288+
" .dataframe tbody tr th:only-of-type {\n",
289+
" vertical-align: middle;\n",
290+
" }\n",
291+
"\n",
292+
" .dataframe tbody tr th {\n",
293+
" vertical-align: top;\n",
294+
" }\n",
295+
"\n",
296+
" .dataframe thead th {\n",
297+
" text-align: right;\n",
298+
" }\n",
299+
"</style>\n",
300+
"<table border=\"1\" class=\"dataframe\">\n",
301+
" <thead>\n",
302+
" <tr style=\"text-align: right;\">\n",
303+
" <th></th>\n",
304+
" <th>ingredient_1</th>\n",
305+
" <th>ingredient_2</th>\n",
306+
" <th>food</th>\n",
307+
" </tr>\n",
308+
" </thead>\n",
309+
" <tbody>\n",
310+
" <tr>\n",
311+
" <th>0</th>\n",
312+
" <td>Burger Bun</td>\n",
313+
" <td>Beef Patty</td>\n",
314+
" <td>Burger</td>\n",
315+
" </tr>\n",
316+
" <tr>\n",
317+
" <th>1</th>\n",
318+
" <td>Soy Bean</td>\n",
319+
" <td>Bittern</td>\n",
320+
" <td>Tofu</td>\n",
321+
" </tr>\n",
322+
" </tbody>\n",
323+
"</table>\n",
324+
"<p>2 rows × 3 columns</p>\n",
325+
"</div>[2 rows x 3 columns in total]"
326+
],
327+
"text/plain": [
328+
" ingredient_1 ingredient_2 food\n",
329+
"0 Burger Bun Beef Patty Burger \n",
330+
"\n",
331+
"1 Soy Bean Bittern Tofu \n",
332+
"\n",
333+
"\n",
334+
"[2 rows x 3 columns]"
335+
]
336+
},
337+
"execution_count": 6,
338+
"metadata": {},
339+
"output_type": "execute_result"
340+
}
341+
],
342+
"source": [
343+
"df.semantics.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=model)"
344+
]
215345
}
216346
],
217347
"metadata": {

0 commit comments

Comments
 (0)