@@ -30,6 +30,24 @@ def filter(self, instruction: str, model):
30
30
"""
31
31
Filters the DataFrame with the semantics of the user instruction.
32
32
33
+ **Examples:**
34
+
35
+ >>> import bigframes.pandas as bpd
36
+ >>> bpd.options.display.progress_bar = None
37
+
38
+ >>> import bigframes
39
+ >>> bigframes.options.experiments.semantic_operators = True
40
+
41
+ >>> import bigframes.ml.llm as llm
42
+ >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
43
+
44
+ >>> df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]})
45
+ >>> df.semantics.filter("{city} is the capital of {country}", model)
46
+ country city
47
+ 1 Germany Berlin
48
+ <BLANKLINE>
49
+ [1 rows x 2 columns]
50
+
33
51
Args:
34
52
instruction:
35
53
An instruction on how to filter the data. This value must contain
@@ -39,7 +57,7 @@ def filter(self, instruction: str, model):
39
57
"The {food} is healthy."
40
58
41
59
model:
42
- A LLM model provided by Bigframes ML package.
60
+ A GeminiTextGenerator provided by Bigframes ML package.
43
61
44
62
Returns:
45
63
DataFrame filtered by the instruction.
@@ -49,9 +67,89 @@ def filter(self, instruction: str, model):
49
67
ValueError: when the instruction refers to a non-existing column, or when no
50
68
columns are referred to.
51
69
"""
70
+ _validate_model (model )
71
+
72
+ output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
73
+
74
+ from bigframes .dataframe import DataFrame
75
+
76
+ results = typing .cast (
77
+ DataFrame , model .predict (self ._make_prompt (instruction , output_instruction ))
78
+ )
79
+
80
+ return self ._df [
81
+ results ["ml_generate_text_llm_result" ].str .lower ().str .contains ("true" )
82
+ ]
83
+
84
+ def map (self , instruction : str , output_column : str , model ):
85
+ """
86
+ Maps the DataFrame with the semantics of the user instruction.
87
+
88
+ **Examples:**
89
+
90
+ >>> import bigframes.pandas as bpd
91
+ >>> bpd.options.display.progress_bar = None
92
+
93
+ >>> import bigframes
94
+ >>> bigframes.options.experiments.semantic_operators = True
95
+
96
+ >>> import bigframes.ml.llm as llm
97
+ >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
98
+
99
+ >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
100
+ >>> df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", result_column_name="food", model=model)
101
+ ingredient_1 ingredient_2 food
102
+ 0 Burger Bun Beef Patty Burger
103
+ <BLANKLINE>
104
+ 1 Soy Bean Bittern Tofu
105
+ <BLANKLINE>
106
+ <BLANKLINE>
107
+ [2 rows x 3 columns]
108
+
109
+ Args:
110
+ instruction:
111
+ An instruction on how to map the data. This value must contain
112
+ column references by name, which should be wrapped in a pair of braces.
113
+ For example, if you have a column "food", you can refer to this column
114
+ in the instructions like:
115
+ "Get the ingredients of {food}."
52
116
117
+ result_column_name:
118
+ The column name of the mapping result.
119
+
120
+ model:
121
+ A GeminiTextGenerator provided by Bigframes ML package.
122
+
123
+ Returns:
124
+ DataFrame with attached mapping results.
125
+
126
+ Raises:
127
+ NotImplementedError: when the semantic operator experiment is off.
128
+ ValueError: when the instruction refers to a non-existing column, or when no
129
+ columns are referred to.
130
+ """
131
+ _validate_model (model )
132
+
133
+ output_instruction = (
134
+ "Based on the provided contenxt, answer the following instruction:"
135
+ )
136
+
137
+ from bigframes .series import Series
138
+
139
+ results = typing .cast (
140
+ Series ,
141
+ model .predict (self ._make_prompt (instruction , output_instruction ))[
142
+ "ml_generate_text_llm_result"
143
+ ],
144
+ )
145
+
146
+ from bigframes .core .reshape import concat
147
+
148
+ return concat ([self ._df , results .rename (output_column )], axis = 1 )
149
+
150
+ def _make_prompt (self , user_instruction : str , output_instruction : str ):
53
151
# Validate column references
54
- columns = re .findall (r"(?<!{)\{(?!{)(.*?)\}(?!\})" , instruction )
152
+ columns = re .findall (r"(?<!{)\{(?!{)(.*?)\}(?!\})" , user_instruction )
55
153
56
154
if not columns :
57
155
raise ValueError ("No column references." )
@@ -61,30 +159,20 @@ def filter(self, instruction: str, model):
61
159
raise ValueError (f"Column { column } not found." )
62
160
63
161
# Replace column references with names.
64
- instruction = instruction .format (** {col : col for col in columns })
162
+ user_instruction = user_instruction .format (** {col : col for col in columns })
65
163
66
- prompt_df = self ._df .copy ()
164
+ prompt_df = self ._df [columns ].copy ()
165
+ prompt_df ["prompt" ] = f"{ output_instruction } \n { user_instruction } \n Context: "
67
166
68
167
# Combine context from multiple columns.
69
- for idx , col in enumerate (columns ):
70
- if idx == 0 :
71
- prompt_df ["context" ] = f"{ col } is `" + prompt_df [col ] + "`\n "
72
- else :
73
- prompt_df ["context" ] += f"{ col } is `" + prompt_df [col ] + "`\n "
74
-
75
- prompt_df ["prompt" ] = (
76
- "Decide the folowing claim by only True and False: "
77
- + instruction
78
- + "\n Context:"
79
- + prompt_df ["context" ]
80
- )
168
+ for col in columns :
169
+ prompt_df ["prompt" ] += f"{ col } is `" + prompt_df [col ] + "`\n "
81
170
82
- import bigframes . dataframe
171
+ return prompt_df [ "prompt" ]
83
172
84
- results = typing .cast (
85
- bigframes .dataframe .DataFrame , model .predict (prompt_df ["prompt" ])
86
- )
87
173
88
- return self ._df [
89
- results ["ml_generate_text_llm_result" ].str .lower ().str .contains ("true" )
90
- ]
174
def _validate_model(model):
    """Reject models other than GeminiTextGenerator.

    The semantic operators build text prompts and parse the
    ``ml_generate_text_llm_result`` column of ``model.predict``, so only
    the Gemini text-generation model is supported.

    Args:
        model:
            The model instance supplied by the caller.

    Raises:
        ValueError: when ``model`` is not a GeminiTextGenerator.
    """
    # Local import to avoid a circular dependency at module load time.
    from bigframes.ml.llm import GeminiTextGenerator

    if not isinstance(model, GeminiTextGenerator):
        # Fixed message: the class is "GeminiTextGenerator", not "GeminiText Generator".
        raise ValueError("Model is not GeminiTextGenerator")
0 commit comments