Skip to content

Commit 09a4b71

Browse files
committed
work on partioned/windowed functions
1 parent b1ce26c commit 09a4b71

File tree

10 files changed

+213
-46
lines changed

10 files changed

+213
-46
lines changed

build/lib/data_algebra/data_ops.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,22 @@
1515
import data_algebra.env
1616

1717

18-
have_black = False
18+
_have_black = False
1919
try:
2020
# noinspection PyUnresolvedReferences
2121
import black
2222

23-
have_black = True
23+
_have_black = True
2424
except ImportError:
2525
pass
2626

2727

28-
have_sqlparse = False
28+
_have_sqlparse = False
2929
try:
3030
# noinspection PyUnresolvedReferences
3131
import sqlparse
3232

33-
have_sqlparse = True
33+
_have_sqlparse = True
3434
except ImportError:
3535
pass
3636

@@ -243,7 +243,7 @@ def to_python(self, *, indent=0, strict=True, pretty=False, black_mode=None):
243243
indent=indent, strict=strict, print_sources=True
244244
)
245245
if pretty:
246-
if have_black:
246+
if _have_black:
247247
try:
248248
if black_mode is None:
249249
black_mode = black.FileMode()
@@ -276,7 +276,7 @@ def to_sql(self, db_model, *, pretty=False, encoding=None, sqlparse_options=None
276276
sql_str = self.to_sql_implementation(
277277
db_model=db_model, using=None, temp_id_source=temp_id_source
278278
)
279-
if pretty and have_sqlparse:
279+
if pretty and _have_sqlparse:
280280
try:
281281
sql_str = sqlparse.format(
282282
sql_str, encoding=encoding, **sqlparse_options
@@ -482,6 +482,8 @@ def select_columns(self, columns):
482482
return self.sources[0].select_columns(columns)
483483
if isinstance(self, SelectColumnsNode):
484484
return self.sources[0].select_columns(columns)
485+
if isinstance(self, DropColumnsNode):
486+
return self.sources[0].select_columns(columns)
485487
return SelectColumnsNode(source=self, columns=columns)
486488

487489
def rename_columns(self, column_remapping):
@@ -627,7 +629,7 @@ def to_sql(self, db_model, *, pretty=False, encoding=None, sqlparse_options=None
627629
sql_str = self.to_sql_implementation(
628630
db_model=db_model, using=None, temp_id_source=temp_id_source, force_sql=True
629631
)
630-
if pretty and have_sqlparse:
632+
if pretty and _have_sqlparse:
631633
sql_str = sqlparse.format(sql_str, encoding=encoding, **sqlparse_options)
632634
return sql_str
633635

@@ -873,6 +875,10 @@ def __init__(
873875
)
874876
if len(ops) < 1:
875877
raise ValueError("no ops")
878+
for (k, opk) in ops.items(): # look for aggregation functions
879+
if isinstance(opk, data_algebra.expr_rep.Expression):
880+
if opk.op in data_algebra.expr_rep.fn_names_that_imply_windowed_situation:
881+
partitioned = True
876882
self.ops = ops
877883
if partition_by is None:
878884
partition_by = []
@@ -928,6 +934,22 @@ def __init__(
928934
)
929935
if len(bad_overwrite) > 0:
930936
raise ValueError("tried to change: " + str(bad_overwrite))
937+
# check op arguments are very simple: all arguments are column names
938+
if partitioned:
939+
for (k, opk) in ops.items():
940+
if not isinstance(opk, data_algebra.expr_rep.Expression):
941+
raise ValueError(
942+
"non-aggregated expression in windowed/partitoned extend: " +
943+
"'" + k + "': '" + opk.to_pandas() + "'"
944+
)
945+
if len(opk.args) > 1:
946+
raise ValueError("in windowed/partitioned situations only simple operators are allowed, " +
947+
"'" + k + "': '" + opk.to_pandas() + "' term is too complex an expression")
948+
if len(opk.args) > 0:
949+
value_name = opk.args[0].to_pandas()
950+
if value_name not in source.column_set:
951+
raise ValueError("in windowed/partitioned situations only simple operators are allowed, " +
952+
"'" + k + "': '" + opk.to_pandas() + "' term is too complex an expression")
931953
ViewRepresentation.__init__(self, column_names=column_names, sources=[source])
932954

933955
def columns_used_from_sources(self, using=None):

build/lib/data_algebra/expr_rep.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,47 @@
1212
# http://tomerfiliba.com/blog/Infix-Operators/
1313

1414

15+
# list of window/aggregation functons that must be windowed/aggegated
16+
# (note some other functions work in more than one mode)
17+
fn_names_that_imply_windowed_situation = {
18+
'all',
19+
'any',
20+
'bfill',
21+
'count',
22+
'cumcount',
23+
'cummax',
24+
'cummin',
25+
'cumprod',
26+
'cumsum',
27+
'ffill',
28+
'first',
29+
'head',
30+
'is_monotonic_decreasing',
31+
'is_monotonic_increasing',
32+
'last',
33+
'max',
34+
'mean',
35+
'median',
36+
'min',
37+
'ngroup',
38+
'nlargest',
39+
'nsmallest',
40+
'nth',
41+
'nunique',
42+
'ohlc',
43+
'pct_change',
44+
'rank',
45+
'sem',
46+
'shift',
47+
'size',
48+
'std',
49+
'tail',
50+
'unique',
51+
'value_counts',
52+
'var',
53+
}
54+
55+
1556
class Term:
1657
"""Inherit from this class to capture expressions.
1758
Abstract class, should be extended for use."""

build/lib/data_algebra/pandas_model.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def extend_step(self, op, *, data_map, eval_env):
8888
if c not in col_list:
8989
col_list = col_list + [c]
9090
value_name = None
91+
# assumes all args are column names, enforce this earlier
9192
if len(opk.args) > 0:
9293
value_name = opk.args[0].to_pandas()
9394
if value_name not in set(col_list):

coverage.txt

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,44 @@
22
platform darwin -- Python 3.6.9, pytest-5.2.2, py-1.8.0, pluggy-0.13.0
33
rootdir: /Users/johnmount/Documents/work/data_algebra
44
plugins: cov-2.8.1
5-
collected 58 items
5+
collected 60 items
66

77
tests/test_R_yaml.py . [ 1%]
88
tests/test_apply.py . [ 3%]
99
tests/test_arrow1.py . [ 5%]
1010
tests/test_calc_warnings_errors.py . [ 6%]
11-
tests/test_cc.py ...... [ 17%]
11+
tests/test_cc.py ...... [ 16%]
1212
tests/test_cdata1.py . [ 18%]
1313
tests/test_cdata_example.py .... [ 25%]
14-
tests/test_cols_used.py . [ 27%]
15-
tests/test_concat_rows.py . [ 29%]
16-
tests/test_degenerate_project.py . [ 31%]
17-
tests/test_drop_columns.py . [ 32%]
18-
tests/test_exampe1.py . [ 34%]
19-
tests/test_example_data_ops.py . [ 36%]
20-
tests/test_exp.py . [ 37%]
21-
tests/test_export_neg.py . [ 39%]
22-
tests/test_expr_parse.py . [ 41%]
23-
tests/test_extend.py ... [ 46%]
24-
tests/test_flow_text.py . [ 48%]
25-
tests/test_free_expr.py . [ 50%]
26-
tests/test_ghost_col_issue.py . [ 51%]
27-
tests/test_if_else.py . [ 53%]
28-
tests/test_join_check.py . [ 55%]
29-
tests/test_join_effects.py .. [ 58%]
30-
tests/test_math.py . [ 60%]
31-
tests/test_natural_join.py . [ 62%]
32-
tests/test_neg.py . [ 63%]
33-
tests/test_null_bad.py . [ 65%]
34-
tests/test_parse.py . [ 67%]
35-
tests/test_project.py ..... [ 75%]
36-
tests/test_scatter_example.py . [ 77%]
37-
tests/test_scoring_example.py . [ 79%]
14+
tests/test_cols_used.py . [ 26%]
15+
tests/test_concat_rows.py . [ 28%]
16+
tests/test_degenerate_project.py . [ 30%]
17+
tests/test_drop_columns.py . [ 31%]
18+
tests/test_exampe1.py ... [ 36%]
19+
tests/test_example_data_ops.py . [ 38%]
20+
tests/test_exp.py . [ 40%]
21+
tests/test_export_neg.py . [ 41%]
22+
tests/test_expr_parse.py . [ 43%]
23+
tests/test_extend.py ... [ 48%]
24+
tests/test_flow_text.py . [ 50%]
25+
tests/test_free_expr.py . [ 51%]
26+
tests/test_ghost_col_issue.py . [ 53%]
27+
tests/test_if_else.py . [ 55%]
28+
tests/test_join_check.py . [ 56%]
29+
tests/test_join_effects.py .. [ 60%]
30+
tests/test_math.py . [ 61%]
31+
tests/test_natural_join.py . [ 63%]
32+
tests/test_neg.py . [ 65%]
33+
tests/test_null_bad.py . [ 66%]
34+
tests/test_parse.py . [ 68%]
35+
tests/test_project.py ..... [ 76%]
36+
tests/test_scatter_example.py . [ 78%]
37+
tests/test_scoring_example.py . [ 80%]
3838
tests/test_select_stacking.py . [ 81%]
39-
tests/test_simple.py ..... [ 89%]
39+
tests/test_simple.py ..... [ 90%]
4040
tests/test_spark_sql.py . [ 91%]
4141
tests/test_sqlite.py . [ 93%]
42-
tests/test_strat_example.py . [ 94%]
42+
tests/test_strat_example.py . [ 95%]
4343
tests/test_table_is_key_by_columns.py . [ 96%]
4444
tests/test_window2.py . [ 98%]
4545
tests/test_window_fns.py . [100%]
@@ -56,19 +56,19 @@ data_algebra/cdata.py 228 77 66%
5656
data_algebra/cdata_impl.py 13 1 92%
5757
data_algebra/connected_components.py 49 1 98%
5858
data_algebra/data_model.py 44 16 64%
59-
data_algebra/data_ops.py 1226 275 78%
59+
data_algebra/data_ops.py 1242 278 78%
6060
data_algebra/db_model.py 389 76 80%
6161
data_algebra/diagram.py 56 43 23%
6262
data_algebra/env.py 52 7 87%
6363
data_algebra/expr.py 20 4 80%
64-
data_algebra/expr_rep.py 617 206 67%
64+
data_algebra/expr_rep.py 618 206 67%
6565
data_algebra/flow_text.py 17 0 100%
6666
data_algebra/pandas_model.py 178 22 88%
6767
data_algebra/test_util.py 6 0 100%
6868
data_algebra/util.py 93 13 86%
6969
data_algebra/yaml.py 123 15 88%
7070
----------------------------------------------------------
71-
TOTAL 3387 818 76%
71+
TOTAL 3404 821 76%
7272

7373

74-
============================= 58 passed in 11.61s ==============================
74+
============================== 60 passed in 6.27s ==============================

data_algebra/data_ops.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,22 @@
1515
import data_algebra.env
1616

1717

18-
have_black = False
18+
_have_black = False
1919
try:
2020
# noinspection PyUnresolvedReferences
2121
import black
2222

23-
have_black = True
23+
_have_black = True
2424
except ImportError:
2525
pass
2626

2727

28-
have_sqlparse = False
28+
_have_sqlparse = False
2929
try:
3030
# noinspection PyUnresolvedReferences
3131
import sqlparse
3232

33-
have_sqlparse = True
33+
_have_sqlparse = True
3434
except ImportError:
3535
pass
3636

@@ -243,7 +243,7 @@ def to_python(self, *, indent=0, strict=True, pretty=False, black_mode=None):
243243
indent=indent, strict=strict, print_sources=True
244244
)
245245
if pretty:
246-
if have_black:
246+
if _have_black:
247247
try:
248248
if black_mode is None:
249249
black_mode = black.FileMode()
@@ -276,7 +276,7 @@ def to_sql(self, db_model, *, pretty=False, encoding=None, sqlparse_options=None
276276
sql_str = self.to_sql_implementation(
277277
db_model=db_model, using=None, temp_id_source=temp_id_source
278278
)
279-
if pretty and have_sqlparse:
279+
if pretty and _have_sqlparse:
280280
try:
281281
sql_str = sqlparse.format(
282282
sql_str, encoding=encoding, **sqlparse_options
@@ -482,6 +482,8 @@ def select_columns(self, columns):
482482
return self.sources[0].select_columns(columns)
483483
if isinstance(self, SelectColumnsNode):
484484
return self.sources[0].select_columns(columns)
485+
if isinstance(self, DropColumnsNode):
486+
return self.sources[0].select_columns(columns)
485487
return SelectColumnsNode(source=self, columns=columns)
486488

487489
def rename_columns(self, column_remapping):
@@ -627,7 +629,7 @@ def to_sql(self, db_model, *, pretty=False, encoding=None, sqlparse_options=None
627629
sql_str = self.to_sql_implementation(
628630
db_model=db_model, using=None, temp_id_source=temp_id_source, force_sql=True
629631
)
630-
if pretty and have_sqlparse:
632+
if pretty and _have_sqlparse:
631633
sql_str = sqlparse.format(sql_str, encoding=encoding, **sqlparse_options)
632634
return sql_str
633635

@@ -873,6 +875,10 @@ def __init__(
873875
)
874876
if len(ops) < 1:
875877
raise ValueError("no ops")
878+
for (k, opk) in ops.items(): # look for aggregation functions
879+
if isinstance(opk, data_algebra.expr_rep.Expression):
880+
if opk.op in data_algebra.expr_rep.fn_names_that_imply_windowed_situation:
881+
partitioned = True
876882
self.ops = ops
877883
if partition_by is None:
878884
partition_by = []
@@ -928,6 +934,22 @@ def __init__(
928934
)
929935
if len(bad_overwrite) > 0:
930936
raise ValueError("tried to change: " + str(bad_overwrite))
937+
# check op arguments are very simple: all arguments are column names
938+
if partitioned:
939+
for (k, opk) in ops.items():
940+
if not isinstance(opk, data_algebra.expr_rep.Expression):
941+
raise ValueError(
942+
"non-aggregated expression in windowed/partitoned extend: " +
943+
"'" + k + "': '" + opk.to_pandas() + "'"
944+
)
945+
if len(opk.args) > 1:
946+
raise ValueError("in windowed/partitioned situations only simple operators are allowed, " +
947+
"'" + k + "': '" + opk.to_pandas() + "' term is too complex an expression")
948+
if len(opk.args) > 0:
949+
value_name = opk.args[0].to_pandas()
950+
if value_name not in source.column_set:
951+
raise ValueError("in windowed/partitioned situations only simple operators are allowed, " +
952+
"'" + k + "': '" + opk.to_pandas() + "' term is too complex an expression")
931953
ViewRepresentation.__init__(self, column_names=column_names, sources=[source])
932954

933955
def columns_used_from_sources(self, using=None):

data_algebra/expr_rep.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,47 @@
1212
# http://tomerfiliba.com/blog/Infix-Operators/
1313

1414

15+
# list of window/aggregation functons that must be windowed/aggegated
16+
# (note some other functions work in more than one mode)
17+
fn_names_that_imply_windowed_situation = {
18+
'all',
19+
'any',
20+
'bfill',
21+
'count',
22+
'cumcount',
23+
'cummax',
24+
'cummin',
25+
'cumprod',
26+
'cumsum',
27+
'ffill',
28+
'first',
29+
'head',
30+
'is_monotonic_decreasing',
31+
'is_monotonic_increasing',
32+
'last',
33+
'max',
34+
'mean',
35+
'median',
36+
'min',
37+
'ngroup',
38+
'nlargest',
39+
'nsmallest',
40+
'nth',
41+
'nunique',
42+
'ohlc',
43+
'pct_change',
44+
'rank',
45+
'sem',
46+
'shift',
47+
'size',
48+
'std',
49+
'tail',
50+
'unique',
51+
'value_counts',
52+
'var',
53+
}
54+
55+
1556
class Term:
1657
"""Inherit from this class to capture expressions.
1758
Abstract class, should be extended for use."""

0 commit comments

Comments
 (0)