Skip to content

Commit 42bc468

Browse files
committed
rebuild and retest
1 parent 9fa7a0b commit 42bc468

File tree

10 files changed

+280
-242
lines changed

10 files changed

+280
-242
lines changed

build/lib/data_algebra/bigquery_user_fns.py

Lines changed: 115 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,17 @@
1010
# TODO: re-engineer all user functions to be plain SQL constants (no string pasting); perhaps pass in a db handle
1111

1212
# cast column to int64
13+
import data_algebra.user_fn
14+
15+
1316
def as_int64(col):
1417
assert isinstance(col, str)
15-
return data_algebra.data_ops.user_fn(
16-
lambda x: x.astype('int64'), # x is a pandas Series
18+
return data_algebra.user_fn.FnTerm(
19+
pandas_fn = lambda x: x.astype('int64'), # x is a pandas Series
20+
sql_fn = lambda subs, db_model: f'CAST({subs[0]} AS INT64)',
1721
args=[col],
22+
display_form = f'as_int64({col})',
1823
name='as_int64',
19-
sql_name='CAST',
20-
sql_suffix=' AS INT64'
2124
)
2225

2326

@@ -26,22 +29,25 @@ def trimstr(col_name, *, start=0, stop):
2629
assert isinstance(start, int)
2730
assert isinstance(stop, int)
2831
assert isinstance(col_name, str)
29-
return data_algebra.data_ops.user_fn(
30-
lambda x: x.str.slice(start=start, stop=stop), # x is a pandas Series
32+
return data_algebra.user_fn.FnTerm(
33+
pandas_fn = lambda x: x.str.slice(start=start, stop=stop), # x is a pandas Series
34+
sql_fn = lambda subs, db_model: f'SUBSTR({subs[0]}, {start+1}, {stop})',
3135
args=[col_name],
32-
name=f'trimstr_{start+1}_{stop}',
33-
sql_name='SUBSTR', sql_suffix=f', {start+1}, {stop}')
36+
display_form = f'trimstr({col_name}, start={start}, stop={stop})',
37+
name='trimstr',
38+
)
3439

3540

3641
# replace missing with zeros
3742
def coalesce_0(col):
3843
assert isinstance(col, str)
39-
return data_algebra.data_ops.user_fn(
40-
lambda x: x.fillna(0),
41-
args=col,
44+
return data_algebra.user_fn.FnTerm(
45+
pandas_fn = lambda x: x.fillna(0), # x is a pandas Series
46+
sql_fn = lambda subs, db_model: f'COALESCE({subs[0]}, 0)',
47+
args=[col],
48+
display_form = f'coalesce_0({col})',
4249
name='coalesce_0',
43-
sql_name='COALESCE',
44-
sql_suffix=', 0')
50+
)
4551

4652

4753
# compute difference in dates in days
@@ -59,210 +65,230 @@ def f(*args):
5965
res = res.combine_first(args[i])
6066
return res
6167

62-
return data_algebra.data_ops.user_fn(
63-
f,
68+
return data_algebra.user_fn.FnTerm(
69+
pandas_fn = f,
70+
sql_fn = lambda subs, db_model: f'COALESCE({", ".join(subs)})', # TODO: check SQL
6471
args=cols,
72+
display_form = f'coalesce({cols})',
6573
name='coalesce',
66-
sql_name='COALESCE') # TODO: implement SQL
74+
)
6775

6876

6977
# convert datetime to date
7078
def datetime_to_date(col):
7179
assert isinstance(col, str)
72-
return data_algebra.data_ops.user_fn(
73-
lambda x: x.dt.date.copy(), # x is a pandas Series
74-
args=col,
80+
return data_algebra.user_fn.FnTerm(
81+
pandas_fn = lambda x: x.dt.date.copy(), # x is a pandas Series
82+
sql_fn = lambda subs, db_model: f'DATE({subs[0]})',
83+
args=[col],
84+
display_form = f'datetime_to_date({col})',
7585
name='datetime_to_date',
76-
sql_name='DATE')
86+
)
7787

7888

7989
# convert str to datetime
8090
# https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions
8191
def parse_datetime(col, *, format="%Y-%m-%d %H:%M:%S"):
8292
assert isinstance(col, str)
83-
return data_algebra.data_ops.user_fn(
93+
assert isinstance(format, str)
94+
return data_algebra.user_fn.FnTerm(
8495
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
85-
lambda x: data_algebra.default_data_model.pd.to_datetime(x, format=format), # x is a pandas Series
86-
args=col,
96+
pandas_fn = lambda x: data_algebra.default_data_model.pd.to_datetime(x, format=format), # x is a pandas Series
97+
sql_fn = lambda subs, db_model: f'PARSE_DATETIME({db_model.quote_string(format)}, {subs[0]})',
98+
args=[col],
99+
display_form = f'parse_datetime({col}, format="{format}")',
87100
name='parse_datetime',
88-
sql_name='PARSE_DATETIME', # https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
89-
sql_prefix=f'"{format}", ')
101+
)
90102

91103

92104
# convert str to date
93105
def parse_date(col, *, format="%Y-%m-%d"):
94106
assert isinstance(col, str)
95-
return data_algebra.data_ops.user_fn(
107+
assert isinstance(format, str)
108+
return data_algebra.user_fn.FnTerm(
96109
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
97-
lambda x: data_algebra.default_data_model.pd.to_datetime(x, format=format).dt.date.copy(), # x is a pandas Series
98-
args=col,
110+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x, format=format).dt.date.copy(), # x is a pandas Series
111+
sql_fn=lambda subs, db_model: f'PARSE_DATE({db_model.quote_string(format)}, {subs[0]})',
112+
args=[col],
113+
display_form=f'parse_date({col}, format="{format}")',
99114
name='parse_date',
100-
sql_name='PARSE_DATE', # https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
101-
sql_prefix=f'"{format}", ')
115+
)
102116

103117

104118
# convert datetime to str
105119
# https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions
106120
def format_datetime(col, *, format="%Y-%m-%d %H:%M:%S"):
107121
assert isinstance(col, str)
108-
return data_algebra.data_ops.user_fn(
122+
assert isinstance(format, str)
123+
return data_algebra.user_fn.FnTerm(
109124
# x is a pandas Series
110-
lambda x: x.dt.strftime(date_format=format),
111-
args=col,
125+
pandas_fn=lambda x: x.dt.strftime(date_format=format),
126+
sql_fn=lambda subs, db_model: f'FORMAT_DATETIME({db_model.quote_string(format)}, {subs[0]})',
127+
args=[col],
128+
display_form=f'format_datetime({col}, format="{format}")',
112129
name='format_datetime',
113-
sql_name='FORMAT_DATETIME',
114-
sql_prefix=f'"{format}", ')
130+
)
115131

116132

117133
# convert date to str
118134
def format_date(col, *, format="%Y-%m-%d"):
119135
assert isinstance(col, str)
120-
return data_algebra.data_ops.user_fn(
136+
assert isinstance(format, str)
137+
return data_algebra.user_fn.FnTerm(
121138
# x is a pandas Series
122-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.strftime(date_format=format),
123-
args=col,
139+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.strftime(date_format=format),
140+
sql_fn=lambda subs, db_model: f'FORMAT_DATE({db_model.quote_string(format)}, {subs[0]})',
141+
args=[col],
142+
display_form=f'format_date({col}, format="{format}")',
124143
name='format_date',
125-
sql_name='FORMAT_DATE',
126-
sql_prefix=f'"{format}", ')
144+
)
127145

128146

129147
# convert date to dayofweek Sunday=1 through Saturday=7
130148
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
131149
def dayofweek(col):
132150
assert isinstance(col, str)
133-
return data_algebra.data_ops.user_fn(
151+
return data_algebra.user_fn.FnTerm(
134152
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.dayofweek.html#pandas.Series.dt.dayofweek
135153
# https://stackoverflow.com/a/30222759
136154
# x is a pandas Series
137-
lambda x: 1 + ((data_algebra.default_data_model.pd.to_datetime(x).dt.dayofweek.astype('int64') + 1) % 7),
138-
args=col,
155+
pandas_fn=lambda x: 1 + ((data_algebra.default_data_model.pd.to_datetime(x).dt.dayofweek.astype('int64') + 1) % 7),
156+
sql_fn=lambda subs, db_model: f'EXTRACT(DAYOFWEEK FROM {subs[0]})',
157+
args=[col],
158+
display_form=f'dayofweek({col})',
139159
name='dayofweek',
140-
sql_name='EXTRACT',
141-
sql_prefix='DAYOFWEEK FROM ')
160+
)
142161

143162

144163
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
145164
def dayofyear(col):
146165
assert isinstance(col, str)
147-
return data_algebra.data_ops.user_fn(
166+
return data_algebra.user_fn.FnTerm(
148167
# x is a pandas Series
149-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.dayofyear.astype('int64'),
150-
args=col,
168+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.dayofyear.astype('int64'),
169+
sql_fn=lambda subs, db_model: f'EXTRACT(DAYOFYEAR FROM {subs[0]})',
170+
args=[col],
171+
display_form=f'dayofyear({col})',
151172
name='dayofyear',
152-
sql_name='EXTRACT',
153-
sql_prefix='DAYOFYEAR FROM ')
173+
)
154174

155175

156176
# convert date to week of year
157177
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
158178
def weekofyear(col):
159179
assert isinstance(col, str)
160-
return data_algebra.data_ops.user_fn(
180+
return data_algebra.user_fn.FnTerm(
161181
# x is a pandas Series
162-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.isocalendar().week.astype('int64'),
163-
args=col,
182+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.isocalendar().week.astype('int64'),
183+
sql_fn=lambda subs, db_model: f'EXTRACT(WEEK FROM {subs[0]})',
184+
args=[col],
185+
display_form=f'weekofyear({col})',
164186
name='weekofyear',
165-
sql_name='EXTRACT',
166-
sql_prefix='WEEK FROM ')
187+
)
167188

168189

169190
# convert date to day of month, 1 through 31
170191
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
171192
def dayofmonth(col):
172193
assert isinstance(col, str)
173-
return data_algebra.data_ops.user_fn(
194+
return data_algebra.user_fn.FnTerm(
174195
# x is a pandas Series
175-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.day.astype('int64'),
176-
args=col,
196+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.day.astype('int64'),
197+
sql_fn=lambda subs, db_model: f'EXTRACT(DAYOFMONTH FROM {subs[0]})',
198+
args=[col],
199+
display_form=f'dayofmonth({col})',
177200
name='dayofmonth',
178-
sql_name='EXTRACT',
179-
sql_prefix='DAYOFMONTH FROM ')
201+
)
180202

181203

182204
# convert date to month
183205
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
184206
def month(col):
185207
assert isinstance(col, str)
186-
return data_algebra.data_ops.user_fn(
208+
return data_algebra.user_fn.FnTerm(
187209
# x is a pandas Series
188-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.month.astype('int64'),
189-
args=col,
210+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.month.astype('int64'),
211+
sql_fn=lambda subs, db_model: f'EXTRACT(MONTH FROM {subs[0]})',
212+
args=[col],
213+
display_form=f'month({col})',
190214
name='month',
191-
sql_name='EXTRACT',
192-
sql_prefix='MONTH FROM ')
215+
)
193216

194217

195218
# convert date to quarter
196219
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
197220
def quarter(col):
198221
assert isinstance(col, str)
199-
return data_algebra.data_ops.user_fn(
222+
return data_algebra.user_fn.FnTerm(
200223
# x is a pandas Series
201-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.quarter.astype('int64'),
202-
args=col,
224+
pandas_fn=lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.quarter.astype('int64'),
225+
sql_fn=lambda subs, db_model: f'EXTRACT(QUARTER FROM {subs[0]})',
226+
args=[col],
227+
display_form=f'quarter({col})',
203228
name='quarter',
204-
sql_name='EXTRACT',
205-
sql_prefix='QUARTER FROM ')
229+
)
206230

207231

208232
# convert date to year
209233
# https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions
210234
def year(col):
211235
assert isinstance(col, str)
212-
return data_algebra.data_ops.user_fn(
213-
# x is a pandas Series
214-
lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.year.astype('int64'),
215-
args=col,
236+
return data_algebra.user_fn.FnTerm(
237+
pandas_fn = lambda x: data_algebra.default_data_model.pd.to_datetime(x).dt.year.astype('int64'),
238+
sql_fn = lambda subs, db_model: f'EXTRACT(YEAR FROM {subs[0]})',
239+
args=[col],
240+
display_form = f'year({col})',
216241
name='year',
217-
sql_name='EXTRACT',
218-
sql_prefix='YEAR FROM ')
242+
)
219243

220244

221245
# compute difference in timestamps in seconds
222246
# https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions#timestamp_diff
223247
def timestamp_diff(col1, col2):
224248
assert isinstance(col1, str)
225249
assert isinstance(col2, str)
226-
return data_algebra.data_ops.user_fn(
250+
return data_algebra.user_fn.FnTerm(
227251
# https://stackoverflow.com/a/41340398
228252
# looks like Timedelta is scalar
229253
# TODO: find vectorized form
230-
lambda c1, c2: [
254+
pandas_fn = lambda c1, c2: [
231255
data_algebra.default_data_model.pd.Timedelta(c1[i] - c2[i]).total_seconds() for i in range(len(c1))],
256+
sql_fn = lambda subs, db_model: f'TIMESTAMP_DIFF({subs[0]}, {subs[1]}, SECOND)',
232257
args=[col1, col2],
258+
display_form = f'timestamp_diff({col1}, {col2})',
233259
name='timestamp_diff',
234-
sql_name='TIMESTAMP_DIFF',
235-
sql_suffix=', SECOND')
260+
)
236261

237262

238263
# compute difference in dates in days
239264
def date_diff(col1, col2):
240265
assert isinstance(col1, str)
241266
assert isinstance(col2, str)
242-
return data_algebra.data_ops.user_fn(
267+
return data_algebra.user_fn.FnTerm(
243268
# https://stackoverflow.com/a/41340398
244269
# looks like Timedelta is scalar
245270
# TODO: find vectorized form
246-
lambda c1, c2: [
271+
pandas_fn=lambda c1, c2: [
247272
data_algebra.default_data_model.pd.Timedelta(c1[i] - c2[i]).days for i in range(len(c1))],
273+
sql_fn = lambda subs, db_model: f'TIMESTAMP_DIFF({subs[0]}, {subs[1]}, DAY)',
248274
args=[col1, col2],
275+
display_form = f'date_diff({col1}, {col2})',
249276
name='date_diff',
250-
sql_name='TIMESTAMP_DIFF',
251-
sql_suffix=', DAY')
277+
)
252278

253279

254280
# find the nearest Sunday at or before this date
255281
def base_Sunday(col):
256282
assert isinstance(col, str)
257-
return data_algebra.data_ops.user_fn(
283+
return data_algebra.user_fn.FnTerm(
258284
# x is a pandas Series of datetime.date
259285
# TODO: vectorize
260-
lambda x: [x[i] - datetime.timedelta(days= (x[i].weekday() + 1) % 7) for i in range(len(x))],
286+
pandas_fn=lambda x: [x[i] - datetime.timedelta(days=(x[i].weekday() + 1) % 7) for i in range(len(x))],
287+
sql_fn = lambda subs, db_model: f'DATE_SUB({subs[0]}, INTERVAL (EXTRACT(DAYOFWEEK FROM {subs[0]}) - 1) DAY)',
261288
args=[col],
289+
display_form = f'base_Sunday({col})',
262290
name='base_Sunday',
263-
sql_name='DATE_SUB',
264-
sql_prefix='',
265-
sql_suffix=f', INTERVAL (EXTRACT(DAYOFWEEK FROM `{col}`)-1) DAY')
291+
)
266292

267293

268294
# TODO: documentation page

0 commit comments

Comments
 (0)