Skip to content

Commit 1bfa598

Browse files
chore: Add semantics.filter, guarded by an experiment flag. (#1040)
* feat: Add `semantics.filter`, guarded by an experiment flag. * fix test index * remove redundant line * Move semantic operators into a separate Semantics class * move test file location * check column references and update tests * check column references and update tests * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Clean up further * fix model name --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 991a462 commit 1bfa598

File tree

5 files changed

+423
-0
lines changed

5 files changed

+423
-0
lines changed

bigframes/dataframe.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
import bigframes.operations.aggregations
7575
import bigframes.operations.aggregations as agg_ops
7676
import bigframes.operations.plotting as plotting
77+
import bigframes.operations.semantics
7778
import bigframes.operations.structs
7879
import bigframes.series
7980
import bigframes.series as bf_series
@@ -3875,3 +3876,7 @@ def _throw_if_null_index(self, opname: str):
38753876
raise bigframes.exceptions.NullIndexError(
38763877
f"DataFrame cannot perform {opname} as it has no index. Set an index using set_index."
38773878
)
3879+
3880+
@property
def semantics(self):
    """Accessor for the experimental semantic operators on this DataFrame.

    A new ``Semantics`` object wrapping ``self`` is built on every access.
    Its constructor raises ``NotImplementedError`` when the semantic
    operator experiment option is disabled.
    """
    accessor = bigframes.operations.semantics.Semantics(self)
    return accessor

bigframes/operations/semantics.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import re
17+
import typing
18+
19+
import bigframes
20+
21+
22+
class Semantics:
    """Experimental LLM-backed ("semantic") operators for a DataFrame.

    The accessor wraps a single DataFrame and can only be constructed while
    ``bigframes.options.experiments.semantic_operators`` is truthy.
    """

    def __init__(self, df) -> None:
        """Bind the accessor to ``df``.

        Args:
            df:
                The DataFrame the semantic operators act on.

        Raises:
            NotImplementedError: when the semantic operator experiment is off.
        """
        if not bigframes.options.experiments.semantic_operators:
            raise NotImplementedError(
                "Semantic operators are experimental. Enable them with "
                "`bigframes.options.experiments.semantic_operators = True`."
            )

        self._df = df

    def filter(self, instruction: str, model):
        """
        Filters the DataFrame with the semantics of the user instruction.

        For every row a prompt is built from the instruction plus the values of
        the referenced columns, and sent to ``model``. Rows are kept when the
        lower-cased model answer contains "true".

        Args:
            instruction:
                An instruction on how to filter the data. This value must contain
                column references by name, which should be wrapped in a pair of braces.
                For example, if you have a column "food", you can refer to this column
                in the instructions like:
                "The {food} is healthy."
                Doubled braces ("{{" / "}}") are not treated as references.
                Referenced columns are concatenated with string literals, so they
                are expected to hold string data.

            model:
                A LLM model provided by Bigframes ML package. Its ``predict``
                result must expose an "ml_generate_text_llm_result" column.

        Returns:
            DataFrame filtered by the instruction.

        Raises:
            ValueError: when the instruction refers to a non-existing column, or when no
                columns are referred to.
        """

        # Validate column references. The lookarounds skip doubled braces, which
        # ``str.format`` below treats as escaped literal braces.
        columns = re.findall(r"(?<!{)\{(?!{)(.*?)\}(?!\})", instruction)

        if not columns:
            raise ValueError("No column references.")

        for column in columns:
            if column not in self._df.columns:
                raise ValueError(f"Column {column} not found.")

        # Replace column references with names.
        instruction = instruction.format(**{col: col for col in columns})

        # Combine context from the referenced columns. Values are always read
        # from the untouched source frame and accumulated in a standalone
        # series, so a user column that happens to be called "context" or
        # "prompt" is never overwritten before it is read. ``dict.fromkeys``
        # drops repeated references while keeping first-seen order, so each
        # column contributes exactly one context line.
        context = None
        for col in dict.fromkeys(columns):
            piece = f"{col} is `" + self._df[col] + "`\n"
            context = piece if context is None else context + piece

        prompt = (
            "Decide the following claim by only True and False: "
            + instruction
            + "\nContext:"
            + context
        ).rename("prompt")

        # Imported lazily: bigframes.dataframe itself imports this module.
        import bigframes.dataframe

        results = typing.cast(bigframes.dataframe.DataFrame, model.predict(prompt))

        return self._df[
            results["ml_generate_text_llm_result"].str.lower().str.contains("true")
        ]
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"## Preparation"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import bigframes\n",
17+
"import bigframes.pandas as bpd"
18+
]
19+
},
20+
{
21+
"cell_type": "markdown",
22+
"metadata": {},
23+
"source": [
24+
"Enable the semantic operator experiment"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 2,
30+
"metadata": {},
31+
"outputs": [
32+
{
33+
"name": "stderr",
34+
"output_type": "stream",
35+
"text": [
36+
"/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:33: UserWarning: Semantic operators are still under experiments, and are subject to change in the future.\n",
37+
" warnings.warn(\n"
38+
]
39+
}
40+
],
41+
"source": [
42+
"bigframes.options.experiments.semantic_operators = True"
43+
]
44+
},
45+
{
46+
"cell_type": "markdown",
47+
"metadata": {},
48+
"source": [
49+
"Prepare the LLM model. Here we are going to use Gemini 1.5 Flash."
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 3,
55+
"metadata": {},
56+
"outputs": [
57+
{
58+
"name": "stderr",
59+
"output_type": "stream",
60+
"text": [
61+
"/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:559: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
62+
" return global_session.get_global_session()\n"
63+
]
64+
},
65+
{
66+
"data": {
67+
"text/html": [
68+
"Query job 05cef003-6ac9-4cfc-b21c-3d6aed5d5b78 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:05cef003-6ac9-4cfc-b21c-3d6aed5d5b78&page=queryresults\">Open Job</a>"
69+
],
70+
"text/plain": [
71+
"<IPython.core.display.HTML object>"
72+
]
73+
},
74+
"metadata": {},
75+
"output_type": "display_data"
76+
}
77+
],
78+
"source": [
79+
"import bigframes.ml.llm as llm\n",
80+
"model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)"
81+
]
82+
},
83+
{
84+
"cell_type": "markdown",
85+
"metadata": {},
86+
"source": [
87+
"## Semantic Filtering"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 5,
93+
"metadata": {},
94+
"outputs": [
95+
{
96+
"data": {
97+
"text/html": [
98+
"Query job f9439f7e-13cd-4990-847b-d318f223af02 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f9439f7e-13cd-4990-847b-d318f223af02&page=queryresults\">Open Job</a>"
99+
],
100+
"text/plain": [
101+
"<IPython.core.display.HTML object>"
102+
]
103+
},
104+
"metadata": {},
105+
"output_type": "display_data"
106+
},
107+
{
108+
"name": "stderr",
109+
"output_type": "stream",
110+
"text": [
111+
"/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n",
112+
" warnings.warn(\n"
113+
]
114+
},
115+
{
116+
"data": {
117+
"text/html": [
118+
"Query job 51d2b023-6834-47f6-b17c-6b50d759ad88 is DONE. 4 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:51d2b023-6834-47f6-b17c-6b50d759ad88&page=queryresults\">Open Job</a>"
119+
],
120+
"text/plain": [
121+
"<IPython.core.display.HTML object>"
122+
]
123+
},
124+
"metadata": {},
125+
"output_type": "display_data"
126+
},
127+
{
128+
"data": {
129+
"text/html": [
130+
"Query job 7979b08b-687e-41fc-8251-dfa0c0b41bed is DONE. 90 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:7979b08b-687e-41fc-8251-dfa0c0b41bed&page=queryresults\">Open Job</a>"
131+
],
132+
"text/plain": [
133+
"<IPython.core.display.HTML object>"
134+
]
135+
},
136+
"metadata": {},
137+
"output_type": "display_data"
138+
},
139+
{
140+
"data": {
141+
"text/html": [
142+
"Query job 6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:6e39dcb2-fe2f-40ad-899b-f1a29bf47bd2&page=queryresults\">Open Job</a>"
143+
],
144+
"text/plain": [
145+
"<IPython.core.display.HTML object>"
146+
]
147+
},
148+
"metadata": {},
149+
"output_type": "display_data"
150+
},
151+
{
152+
"data": {
153+
"text/html": [
154+
"Query job 60853851-bd33-4745-959e-bfddd970e4c4 is DONE. 33 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:60853851-bd33-4745-959e-bfddd970e4c4&page=queryresults\">Open Job</a>"
155+
],
156+
"text/plain": [
157+
"<IPython.core.display.HTML object>"
158+
]
159+
},
160+
"metadata": {},
161+
"output_type": "display_data"
162+
},
163+
{
164+
"data": {
165+
"text/html": [
166+
"<div>\n",
167+
"<style scoped>\n",
168+
" .dataframe tbody tr th:only-of-type {\n",
169+
" vertical-align: middle;\n",
170+
" }\n",
171+
"\n",
172+
" .dataframe tbody tr th {\n",
173+
" vertical-align: top;\n",
174+
" }\n",
175+
"\n",
176+
" .dataframe thead th {\n",
177+
" text-align: right;\n",
178+
" }\n",
179+
"</style>\n",
180+
"<table border=\"1\" class=\"dataframe\">\n",
181+
" <thead>\n",
182+
" <tr style=\"text-align: right;\">\n",
183+
" <th></th>\n",
184+
" <th>country</th>\n",
185+
" <th>city</th>\n",
186+
" </tr>\n",
187+
" </thead>\n",
188+
" <tbody>\n",
189+
" <tr>\n",
190+
" <th>1</th>\n",
191+
" <td>Germany</td>\n",
192+
" <td>Berlin</td>\n",
193+
" </tr>\n",
194+
" </tbody>\n",
195+
"</table>\n",
196+
"<p>1 rows × 2 columns</p>\n",
197+
"</div>[1 rows x 2 columns in total]"
198+
],
199+
"text/plain": [
200+
" country city\n",
201+
"1 Germany Berlin\n",
202+
"\n",
203+
"[1 rows x 2 columns]"
204+
]
205+
},
206+
"execution_count": 5,
207+
"metadata": {},
208+
"output_type": "execute_result"
209+
}
210+
],
211+
"source": [
212+
"df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})\n",
213+
"df.semantics.filter(\"{city} is the capital of {country}\", model)"
214+
]
215+
}
216+
],
217+
"metadata": {
218+
"kernelspec": {
219+
"display_name": "venv",
220+
"language": "python",
221+
"name": "python3"
222+
},
223+
"language_info": {
224+
"codemirror_mode": {
225+
"name": "ipython",
226+
"version": 3
227+
},
228+
"file_extension": ".py",
229+
"mimetype": "text/x-python",
230+
"name": "python",
231+
"nbconvert_exporter": "python",
232+
"pygments_lexer": "ipython3",
233+
"version": "3.11.9"
234+
}
235+
},
236+
"nbformat": 4,
237+
"nbformat_minor": 2
238+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
import bigframes.ml.llm as llm
18+
19+
20+
@pytest.fixture(scope="session")
def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator:
    """Session-scoped Gemini 1.5 Flash text generator for the semantics tests.

    The model is built from the ``session`` and ``bq_connection`` fixtures.
    """
    model = llm.GeminiTextGenerator(
        model_name="gemini-1.5-flash-001",
        connection_name=bq_connection,
        session=session,
    )
    return model

0 commit comments

Comments
 (0)