1010# TODO: re-eng all userfns to just be SQL constants, without pasting, perhaps pass in db handle
1111
1212# convert column to int64
13+ import data_algebra .user_fn
14+
15+
def as_int64(col):
    """Build a user-function term that casts column ``col`` to int64.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side calls
        ``astype('int64')`` and whose SQL side renders ``CAST(... AS INT64)``.
    """
    assert isinstance(col, str)

    def _to_int64(series):
        # series is a pandas Series
        return series.astype('int64')

    def _render_sql(subs, db_model):
        # subs[0] is interpolated as the SQL expression for the argument
        return f'CAST({subs[0]} AS INT64)'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_to_int64,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'as_int64({col})',
        name='as_int64',
    )
2225
2326
def trimstr(col_name, *, start=0, stop):
    """Build a user-function term that keeps characters ``[start, stop)``.

    The pandas side uses ``Series.str.slice(start=start, stop=stop)``.
    The SQL side uses ``SUBSTR(value, position, length)``, whose second
    argument is a 1-based position and whose third argument is a length,
    so the bounds are converted to keep both back ends in agreement.

    :param col_name: column name (must be a ``str``).
    :param start: zero-based index of the first character kept (``int``).
    :param stop: zero-based index one past the last character kept (``int``).
    :return: a ``data_algebra.user_fn.FnTerm``.

    NOTE(review): the SQL translation assumes non-negative ``start`` and
    ``stop``; negative (from-the-end) indices are only meaningful on the
    pandas side.
    """
    assert isinstance(start, int)
    assert isinstance(stop, int)
    assert isinstance(col_name, str)
    sql_position = start + 1  # SUBSTR positions are 1-based
    # SUBSTR wants a length, not an end index; an empty slice has length 0.
    sql_length = max(stop - start, 0)
    return data_algebra.user_fn.FnTerm(
        pandas_fn=lambda x: x.str.slice(start=start, stop=stop),  # x is a pandas Series
        sql_fn=lambda subs, db_model: f'SUBSTR({subs[0]}, {sql_position}, {sql_length})',
        args=[col_name],
        display_form=f'trimstr({col_name}, start={start}, stop={stop})',
        name='trimstr',
    )
3439
3540
3641# replace missing with zeros
def coalesce_0(col):
    """Build a user-function term that replaces missing values with zero.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side calls
        ``fillna(0)`` and whose SQL side renders ``COALESCE(..., 0)``.
    """
    assert isinstance(col, str)

    def _fill_zero(series):
        # series is a pandas Series
        return series.fillna(0)

    def _render_sql(subs, db_model):
        return f'COALESCE({subs[0]}, 0)'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_fill_zero,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'coalesce_0({col})',
        name='coalesce_0',
    )
4551
4652
4753# compute difference in dates in days
@@ -59,210 +65,230 @@ def f(*args):
5965 res = res .combine_first (args [i ])
6066 return res
6167
62- return data_algebra .data_ops .user_fn (
63- f ,
68+ return data_algebra .user_fn .FnTerm (
69+ pandas_fn = f ,
70+ sql_fn = lambda subs , db_model : f'COALESCE({ ", " .join (subs )} )' , # TODO: check SQL
6471 args = cols ,
72+ display_form = f'coalesce({ cols } )' ,
6573 name = 'coalesce' ,
66- sql_name = 'COALESCE' ) # TODO: implement SQL
74+ )
6775
6876
6977# convert datetime to date
def datetime_to_date(col):
    """Build a user-function term that converts a datetime column to dates.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side takes a
        copy of ``.dt.date`` and whose SQL side renders ``DATE(...)``.
    """
    assert isinstance(col, str)

    def _extract_date(series):
        # series is a pandas Series
        return series.dt.date.copy()

    def _render_sql(subs, db_model):
        return f'DATE({subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_extract_date,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'datetime_to_date({col})',
        name='datetime_to_date',
    )
7787
7888
7989# convert str to datetime
8090# https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions
def parse_datetime(col, *, format="%Y-%m-%d %H:%M:%S"):
    """Build a user-function term that parses a string column into datetimes.

    :param col: column name (must be a ``str``).
    :param format: strftime-style format string (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side calls
        ``to_datetime`` and whose SQL side renders ``PARSE_DATETIME``.
    """
    assert isinstance(col, str)
    assert isinstance(format, str)

    def _parse(series):
        # series is a pandas Series
        # https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
        pd = data_algebra.default_data_model.pd
        return pd.to_datetime(series, format=format)

    def _render_sql(subs, db_model):
        quoted_format = db_model.quote_string(format)
        return f'PARSE_DATETIME({quoted_format}, {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_parse,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'parse_datetime({col}, format="{format}")',
        name='parse_datetime',
    )
90102
91103
92104# convert str to date
def parse_date(col, *, format="%Y-%m-%d"):
    """Build a user-function term that parses a string column into dates.

    :param col: column name (must be a ``str``).
    :param format: strftime-style format string (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side parses with
        ``to_datetime`` and keeps a copy of ``.dt.date``, and whose SQL side
        renders ``PARSE_DATE``.
    """
    assert isinstance(col, str)
    assert isinstance(format, str)

    def _parse(series):
        # series is a pandas Series
        # https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
        pd = data_algebra.default_data_model.pd
        parsed = pd.to_datetime(series, format=format)
        return parsed.dt.date.copy()

    def _render_sql(subs, db_model):
        quoted_format = db_model.quote_string(format)
        return f'PARSE_DATE({quoted_format}, {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_parse,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'parse_date({col}, format="{format}")',
        name='parse_date',
    )
102116
103117
104118# convert datetime to str
105119# https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions
def format_datetime(col, *, format="%Y-%m-%d %H:%M:%S"):
    """Build a user-function term that formats a datetime column as strings.

    :param col: column name (must be a ``str``).
    :param format: strftime-style format string (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side calls
        ``.dt.strftime`` and whose SQL side renders ``FORMAT_DATETIME``.
    """
    assert isinstance(col, str)
    assert isinstance(format, str)

    def _format(series):
        # series is a pandas Series
        return series.dt.strftime(date_format=format)

    def _render_sql(subs, db_model):
        quoted_format = db_model.quote_string(format)
        return f'FORMAT_DATETIME({quoted_format}, {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_format,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'format_datetime({col}, format="{format}")',
        name='format_datetime',
    )
115131
116132
117133# convert date to str
def format_date(col, *, format="%Y-%m-%d"):
    """Build a user-function term that formats a date column as strings.

    :param col: column name (must be a ``str``).
    :param format: strftime-style format string (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side converts
        with ``to_datetime`` then calls ``.dt.strftime``, and whose SQL side
        renders ``FORMAT_DATE``.
    """
    assert isinstance(col, str)
    assert isinstance(format, str)

    def _format(series):
        # series is a pandas Series
        pd = data_algebra.default_data_model.pd
        as_datetime = pd.to_datetime(series)
        return as_datetime.dt.strftime(date_format=format)

    def _render_sql(subs, db_model):
        quoted_format = db_model.quote_string(format)
        return f'FORMAT_DATE({quoted_format}, {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_format,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'format_date({col}, format="{format}")',
        name='format_date',
    )
127145
128146
129147# convert date to dayofweek Sunday=1 through Saturday=7
130148# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def dayofweek(col):
    """Build a user-function term giving day of week, Sunday=1 .. Saturday=7.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose SQL side renders
        ``EXTRACT(DAYOFWEEK FROM ...)``.
    """
    assert isinstance(col, str)

    def _sunday_first_weekday(series):
        # series is a pandas Series
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.dayofweek.html#pandas.Series.dt.dayofweek
        # https://stackoverflow.com/a/30222759
        pd = data_algebra.default_data_model.pd
        monday_zero = pd.to_datetime(series).dt.dayofweek.astype('int64')
        # pandas numbers Monday=0 .. Sunday=6; rotate so Sunday becomes 1
        return 1 + ((monday_zero + 1) % 7)

    def _render_sql(subs, db_model):
        return f'EXTRACT(DAYOFWEEK FROM {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_sunday_first_weekday,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'dayofweek({col})',
        name='dayofweek',
    )
142161
143162
144163# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def dayofyear(col):
    """Build a user-function term giving the day of the year as int64.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose SQL side renders
        ``EXTRACT(DAYOFYEAR FROM ...)``.
    """
    assert isinstance(col, str)

    def _day_of_year(series):
        # series is a pandas Series
        pd = data_algebra.default_data_model.pd
        as_datetime = pd.to_datetime(series)
        return as_datetime.dt.dayofyear.astype('int64')

    def _render_sql(subs, db_model):
        return f'EXTRACT(DAYOFYEAR FROM {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_day_of_year,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'dayofyear({col})',
        name='dayofyear',
    )
154174
155175
156176# convert date to week of year
157177# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def weekofyear(col):
    """Build a user-function term giving the ISO-8601 week number as int64.

    The pandas side uses ``.dt.isocalendar().week``, which is the ISO week.
    The SQL side therefore extracts ``ISOWEEK`` rather than ``WEEK``; in
    BigQuery, ``WEEK`` is a Sunday-based week number that can be 0, which
    would disagree with the pandas result.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm``.
    """
    assert isinstance(col, str)
    return data_algebra.user_fn.FnTerm(
        # x is a pandas Series
        pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.isocalendar().week.astype('int64'),
        # ISOWEEK matches pandas' isocalendar() week numbering
        sql_fn=lambda subs, db_model: f'EXTRACT(ISOWEEK FROM {subs[0]})',
        args=[col],
        display_form=f'weekofyear({col})',
        name='weekofyear',
    )
167188
168189
169190# convert date to day of month 1 through 31
170191# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def dayofmonth(col):
    """Build a user-function term giving the day of the month as int64.

    The SQL side extracts ``DAY``: the BigQuery ``EXTRACT`` date parts
    include ``DAY`` but not ``DAYOFMONTH``.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm``.
    """
    assert isinstance(col, str)
    return data_algebra.user_fn.FnTerm(
        # x is a pandas Series
        pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.day.astype('int64'),
        sql_fn=lambda subs, db_model: f'EXTRACT(DAY FROM {subs[0]})',
        args=[col],
        display_form=f'dayofmonth({col})',
        name='dayofmonth',
    )
180202
181203
182204# convert date to month
183205# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def month(col):
    """Build a user-function term giving the month number as int64.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose SQL side renders
        ``EXTRACT(MONTH FROM ...)``.
    """
    assert isinstance(col, str)

    def _month_number(series):
        # series is a pandas Series
        pd = data_algebra.default_data_model.pd
        as_datetime = pd.to_datetime(series)
        return as_datetime.dt.month.astype('int64')

    def _render_sql(subs, db_model):
        return f'EXTRACT(MONTH FROM {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_month_number,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'month({col})',
        name='month',
    )
193216
194217
195218# convert date to quarter
196219# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def quarter(col):
    """Build a user-function term giving the calendar quarter as int64.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose SQL side renders
        ``EXTRACT(QUARTER FROM ...)``.
    """
    assert isinstance(col, str)

    def _quarter_number(series):
        # series is a pandas Series
        pd = data_algebra.default_data_model.pd
        as_datetime = pd.to_datetime(series)
        return as_datetime.dt.quarter.astype('int64')

    def _render_sql(subs, db_model):
        return f'EXTRACT(QUARTER FROM {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_quarter_number,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'quarter({col})',
        name='quarter',
    )
206230
207231
208232# convert date to year
209233# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
def year(col):
    """Build a user-function term giving the calendar year as int64.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose SQL side renders
        ``EXTRACT(YEAR FROM ...)``.
    """
    assert isinstance(col, str)

    def _year_number(series):
        # series is a pandas Series
        pd = data_algebra.default_data_model.pd
        as_datetime = pd.to_datetime(series)
        return as_datetime.dt.year.astype('int64')

    def _render_sql(subs, db_model):
        return f'EXTRACT(YEAR FROM {subs[0]})'

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_year_number,
        sql_fn=_render_sql,
        args=[col],
        display_form=f'year({col})',
        name='year',
    )
219243
220244
221245# compute difference in timestamps in seconds
222246# https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions#timestamp_diff
def timestamp_diff(col1, col2):
    """Build a user-function term giving ``col1 - col2`` in seconds.

    :param col1: name of the minuend column (must be a ``str``).
    :param col2: name of the subtrahend column (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side returns a
        list of ``Timedelta.total_seconds()`` values and whose SQL side
        renders ``TIMESTAMP_DIFF(..., ..., SECOND)``.
    """
    assert isinstance(col1, str)
    assert isinstance(col2, str)

    def _seconds_between(c1, c2):
        # https://stackoverflow.com/a/41340398
        # looks like Timedelta is scalar
        # TODO: find vectorized form
        pd = data_algebra.default_data_model.pd
        # Pair values positionally: indexing a Series with 0..n-1 is a label
        # lookup and breaks when the index is not the default RangeIndex.
        return [pd.Timedelta(a - b).total_seconds() for a, b in zip(c1, c2)]

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_seconds_between,
        sql_fn=lambda subs, db_model: f'TIMESTAMP_DIFF({subs[0]}, {subs[1]}, SECOND)',
        args=[col1, col2],
        display_form=f'timestamp_diff({col1}, {col2})',
        name='timestamp_diff',
    )
236261
237262
238263# compute difference in dates in days
def date_diff(col1, col2):
    """Build a user-function term giving ``col1 - col2`` in whole days.

    :param col1: name of the minuend column (must be a ``str``).
    :param col2: name of the subtrahend column (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side returns a
        list of ``Timedelta.days`` values and whose SQL side renders
        ``TIMESTAMP_DIFF(..., ..., DAY)``.

    NOTE(review): the SQL side uses TIMESTAMP_DIFF; if the columns are DATE
    typed in the database, DATE_DIFF may be required instead -- confirm
    against the target schema.
    """
    assert isinstance(col1, str)
    assert isinstance(col2, str)

    def _days_between(c1, c2):
        # https://stackoverflow.com/a/41340398
        # looks like Timedelta is scalar
        # TODO: find vectorized form
        pd = data_algebra.default_data_model.pd
        # Pair values positionally: indexing a Series with 0..n-1 is a label
        # lookup and breaks when the index is not the default RangeIndex.
        return [pd.Timedelta(a - b).days for a, b in zip(c1, c2)]

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_days_between,
        sql_fn=lambda subs, db_model: f'TIMESTAMP_DIFF({subs[0]}, {subs[1]}, DAY)',
        args=[col1, col2],
        display_form=f'date_diff({col1}, {col2})',
        name='date_diff',
    )
252278
253279
254280# find the nearest Sunday at or before this date
def base_Sunday(col):
    """Build a user-function term giving the nearest Sunday at or before a date.

    :param col: column name (must be a ``str``).
    :return: a ``data_algebra.user_fn.FnTerm`` whose pandas side returns a
        list of dates and whose SQL side renders a ``DATE_SUB`` by
        ``EXTRACT(DAYOFWEEK ...) - 1`` days.
    """
    assert isinstance(col, str)

    def _previous_sunday(x):
        # x is a pandas Series of datetime.date
        # TODO: vectorize
        # weekday() is Monday=0 .. Sunday=6, so (weekday + 1) % 7 is the
        # number of days elapsed since the most recent Sunday.
        # Iterate values directly: indexing a Series with 0..n-1 is a label
        # lookup and breaks when the index is not the default RangeIndex.
        return [d - datetime.timedelta(days=(d.weekday() + 1) % 7) for d in x]

    return data_algebra.user_fn.FnTerm(
        pandas_fn=_previous_sunday,
        sql_fn=lambda subs, db_model: f'DATE_SUB({subs[0]}, INTERVAL (EXTRACT(DAYOFWEEK FROM {subs[0]}) - 1) DAY)',
        args=[col],
        display_form=f'base_Sunday({col})',
        name='base_Sunday',
    )
266292
267293
268294# TODO: documentation page
0 commit comments