Skip to content

Commit 0480a9d

Browse files
committed
fix repeated table issue in diagrammer
get zero-column project working; re-render examples
1 parent ed9ba7b commit 0480a9d

File tree

16 files changed

+1279
-94
lines changed

16 files changed

+1279
-94
lines changed

Examples/WindowFunctions/WindowFunctions.ipynb

Lines changed: 295 additions & 25 deletions
Large diffs are not rendered by default.

Examples/WindowFunctions/WindowFunctions.md

Lines changed: 660 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 71 additions & 0 deletions
Loading

build/lib/data_algebra/data_ops.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def extend(
6565
):
6666
raise NotImplementedError("base class called")
6767

68-
def project(self, ops, *, group_by=None, parse_env=None):
68+
def project(self, ops=None, *, group_by=None, parse_env=None):
6969
raise NotImplementedError("base class called")
7070

7171
def natural_join(self, b, *, by=None, jointype="INNER"):
@@ -444,7 +444,7 @@ def extend(
444444
parse_env=parse_env,
445445
)
446446

447-
def project(self, ops, *, group_by=None, parse_env=None):
447+
def project(self, ops=None, *, group_by=None, parse_env=None):
448448
return ProjectNode(source=self, ops=ops, group_by=group_by, parse_env=parse_env)
449449

450450
def natural_join(self, b, *, by=None, jointype="INNER"):
@@ -739,12 +739,12 @@ def eval_implementation(self, *, data_map, eval_env, data_model):
739739

740740

741741
class ProjectNode(ViewRepresentation):
742-
def __init__(self, source, ops, *, group_by=None, parse_env=None):
742+
def __init__(self, source, ops=None, *, group_by=None, parse_env=None):
743+
if ops is None:
744+
ops = {}
743745
ops = data_algebra.expr_rep.parse_assignments_in_context(
744746
ops, source, parse_env=parse_env
745747
)
746-
if len(ops) < 1:
747-
raise ValueError("no ops")
748748
self.ops = ops
749749
if group_by is None:
750750
group_by = []
@@ -1203,7 +1203,7 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
12031203
+ " " * (indent + 6)
12041204
)
12051205
else:
1206-
s = s + " _1 "
1206+
s = s + " _1, "
12071207
s = s + (
12081208
"by=" + self.by.__repr__() + ", jointype=" + self.jointype.__repr__() + ")"
12091209
)
@@ -1414,9 +1414,11 @@ class Project(data_algebra.pipe.PipeStep):
14141414

14151415
ops: Dict[str, data_algebra.expr_rep.Expression]
14161416

1417-
def __init__(self, ops, *, group_by=None):
1417+
def __init__(self, ops=None, *, group_by=None):
14181418
if isinstance(group_by, str):
14191419
group_by = [group_by]
1420+
if ops is None:
1421+
ops = {}
14201422
data_algebra.pipe.PipeStep.__init__(self, name="Project")
14211423
self._ops = ops
14221424
self.group_by = group_by
@@ -1619,7 +1621,7 @@ def apply(self, other, **kwargs):
16191621
def __repr__(self):
16201622
return (
16211623
"NaturalJoin("
1622-
+ ", b="
1624+
+ "b="
16231625
+ self._b.__repr__()
16241626
+ ", by="
16251627
+ self._by.__repr__()
@@ -1733,9 +1735,11 @@ def extend(
17331735
self.ops.append(op)
17341736
return self
17351737

1736-
def project(self, ops, *, group_by=None, parse_env=None):
1738+
def project(self, ops=None, *, group_by=None, parse_env=None):
17371739
if parse_env is not None:
17381740
raise ValueError("Expected parse_env to be None")
1741+
if ops is None:
1742+
ops = {}
17391743
op = Project(ops=ops, group_by=group_by)
17401744
self.ops.append(op)
17411745
return self

build/lib/data_algebra/db_model.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,10 @@ def project_to_sql(self, project_node, *, using=None, temp_id_source=None):
281281
if using is None:
282282
using = project_node.column_set
283283
subops = {k: op for (k, op) in project_node.ops.items() if k in using}
284-
if len(subops) < 1:
285-
raise ValueError("must produce at least one column")
286284
subusing = project_node.columns_used_from_sources(using=using)[0]
287-
if len(subusing) < 1:
285+
if (len(project_node.group_by) + len(subusing)) < 1:
288286
raise ValueError("must use at least one column")
287+
grouping = [g for g in project_node.group_by]
289288
derived = [
290289
self.expr_to_sql(oi) + " AS " + self.quote_identifier(ci)
291290
for (ci, oi) in subops.items()
@@ -297,7 +296,7 @@ def project_to_sql(self, project_node, *, using=None, temp_id_source=None):
297296
temp_id_source[0] = temp_id_source[0] + 1
298297
sql_str = (
299298
"SELECT "
300-
+ ", ".join(derived)
299+
+ ", ".join(grouping + derived)
301300
+ " FROM ( "
302301
+ subsql
303302
+ " ) "

build/lib/data_algebra/diagram.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,25 @@
2323
def _get_op_str(op):
2424
op_str = op.to_python_implementation(print_sources=False)
2525
if have_black:
26-
black_mode = black.FileMode(line_length=60)
27-
op_str = black.format_str(op_str, mode=black_mode)
26+
try:
27+
black_mode = black.FileMode(line_length=60)
28+
op_str = black.format_str(op_str, mode=black_mode)
29+
except Exception:
30+
pass
2831
return op_str
2932

3033

3134
def _to_digraph_r_nodes(ops, dot, table_keys, nextid, edges):
3235
if isinstance(ops, data_algebra.data_ops.TableDescription):
33-
if ops.key in table_keys:
36+
try:
3437
return table_keys[ops.key]
35-
table_keys.add(ops.key)
36-
node_id = nextid[0]
37-
nextid[0] = node_id + 1
38-
dot.attr("node", shape="folder", color="blue")
39-
dot.node(str(node_id), _get_op_str(ops))
40-
return node_id
38+
except KeyError:
39+
node_id = nextid[0]
40+
table_keys[ops.key] = node_id
41+
nextid[0] = node_id + 1
42+
dot.attr("node", shape="folder", color="blue")
43+
dot.node(str(node_id), _get_op_str(ops))
44+
return node_id
4145
source_ids = [
4246
_to_digraph_r_nodes(
4347
ops=op, dot=dot, table_keys=table_keys, nextid=nextid, edges=edges
@@ -63,7 +67,7 @@ def to_digraph(ops):
6367
raise RuntimeError("graphviz not installed")
6468
dot = graphviz.Digraph()
6569
edges = []
66-
_to_digraph_r_nodes(ops=ops, dot=dot, table_keys=set(), nextid=[0], edges=edges)
70+
_to_digraph_r_nodes(ops=ops, dot=dot, table_keys={}, nextid=[0], edges=edges)
6771
for (sub_id, node_id, label) in edges:
6872
if label is None:
6973
dot.edge(sub_id, node_id)

build/lib/data_algebra/expr_rep.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,8 @@ def parse_assignments_in_context(ops, view, *, parse_env=None):
846846
:param parse_env map of names to values to add to parsing environment
847847
:return:
848848
"""
849+
if ops is None:
850+
ops = {}
849851
if not isinstance(ops, dict):
850852
raise TypeError("ops should be a dictionary")
851853
column_defs = view.column_map.__dict__

build/lib/data_algebra/pandas_model.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ def extend_step(self, op, *, data_map, eval_env):
8888
# Groupby preserves the order of rows within each group.
8989
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html
9090
else:
91-
opframe = subframe
91+
subframe['_data_algebra_temp_g'] = 1
92+
opframe = subframe.groupby(['_data_algebra_temp_g'])
9293
# TODO: document exactly which of these are available
9394
if len(opk.args) == 0:
9495
if opk.op == "row_number":
@@ -143,7 +144,12 @@ def project_step(self, op, *, data_map, eval_env):
143144
)
144145
if len(op.group_by) > 0:
145146
res = res.groupby(op.group_by)
146-
cols = {k: res[str(opk.args[0])].agg(opk.op) for (k, opk) in op.ops.items()}
147+
remove_temp_col = False
148+
if len(op.ops) > 0:
149+
cols = {k: res[str(opk.args[0])].agg(opk.op) for (k, opk) in op.ops.items()}
150+
else:
151+
cols = {'_data_table_temp_col': res[op.sources[0].column_names[0]].agg('sum') }
152+
remove_temp_col = True
147153

148154
# agg can return scalars, which then can't be made into a pandas.DataFrame
149155
def promote_scalar(v):
@@ -158,6 +164,8 @@ def promote_scalar(v):
158164
res = self.columns_to_frame(cols).reset_index(
159165
drop=len(op.group_by) < 1
160166
) # grouping variables in the index
167+
if remove_temp_col:
168+
res = res.drop('_data_table_temp_col', 1)
161169
return res
162170

163171
def select_rows_step(self, op, *, data_map, eval_env):

coverage.txt

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,34 @@
22
platform darwin -- Python 3.6.9, pytest-5.0.1, py-1.8.0, pluggy-0.12.0
33
rootdir: /Users/johnmount/Documents/work/data_algebra
44
plugins: cov-2.7.1
5-
collected 34 items
5+
collected 36 items
66

77
tests/test_R_yaml.py . [ 2%]
88
tests/test_apply.py . [ 5%]
99
tests/test_cdata1.py . [ 8%]
10-
tests/test_cdata_example.py .... [ 20%]
11-
tests/test_cols_used.py . [ 23%]
12-
tests/test_dask.py .. [ 29%]
13-
tests/test_datatable.py . [ 32%]
14-
tests/test_drop_columns.py . [ 35%]
15-
tests/test_example_data_ops.py . [ 38%]
16-
tests/test_exp.py . [ 41%]
17-
tests/test_export_neg.py . [ 44%]
18-
tests/test_free_expr.py . [ 47%]
19-
tests/test_if_else.py . [ 50%]
20-
tests/test_math.py . [ 52%]
21-
tests/test_natural_join.py . [ 55%]
22-
tests/test_neg.py . [ 58%]
23-
tests/test_null_bad.py . [ 61%]
24-
tests/test_parse.py . [ 64%]
25-
tests/test_poject.py . [ 67%]
26-
tests/test_project.py . [ 70%]
27-
tests/test_scatter_example.py . [ 73%]
28-
tests/test_scoring_example.py . [ 76%]
29-
tests/test_select_stacking.py . [ 79%]
30-
tests/test_simple.py ..... [ 94%]
31-
tests/test_sqlite.py . [ 97%]
10+
tests/test_cdata_example.py .... [ 19%]
11+
tests/test_cols_used.py . [ 22%]
12+
tests/test_dask.py .. [ 27%]
13+
tests/test_datatable.py . [ 30%]
14+
tests/test_drop_columns.py . [ 33%]
15+
tests/test_example_data_ops.py . [ 36%]
16+
tests/test_exp.py . [ 38%]
17+
tests/test_export_neg.py . [ 41%]
18+
tests/test_free_expr.py . [ 44%]
19+
tests/test_if_else.py . [ 47%]
20+
tests/test_math.py . [ 50%]
21+
tests/test_natural_join.py . [ 52%]
22+
tests/test_neg.py . [ 55%]
23+
tests/test_null_bad.py . [ 58%]
24+
tests/test_parse.py . [ 61%]
25+
tests/test_poject.py .. [ 66%]
26+
tests/test_project.py . [ 69%]
27+
tests/test_scatter_example.py . [ 72%]
28+
tests/test_scoring_example.py . [ 75%]
29+
tests/test_select_stacking.py . [ 77%]
30+
tests/test_simple.py ..... [ 91%]
31+
tests/test_sqlite.py . [ 94%]
32+
tests/test_window2.py . [ 97%]
3233
tests/test_window_fns.py . [100%]
3334

3435
---------- coverage: platform darwin, python 3.6.9-final-0 -----------
@@ -42,21 +43,21 @@ data_algebra/cdata.py 105 21 80%
4243
data_algebra/cdata_impl.py 152 60 61%
4344
data_algebra/dask_model.py 121 23 81%
4445
data_algebra/data_model.py 41 15 63%
45-
data_algebra/data_ops.py 1088 263 76%
46+
data_algebra/data_ops.py 1092 265 76%
4647
data_algebra/data_types.py 39 19 51%
4748
data_algebra/datatable_model.py 131 81 38%
48-
data_algebra/db_model.py 364 83 77%
49-
data_algebra/diagram.py 52 52 0%
49+
data_algebra/db_model.py 363 82 77%
50+
data_algebra/diagram.py 56 56 0%
5051
data_algebra/env.py 48 7 85%
5152
data_algebra/expr.py 20 4 80%
52-
data_algebra/expr_rep.py 554 199 64%
53-
data_algebra/pandas_model.py 145 25 83%
53+
data_algebra/expr_rep.py 556 200 64%
54+
data_algebra/pandas_model.py 152 24 84%
5455
data_algebra/pending_eval.py 34 34 0%
5556
data_algebra/pipe.py 65 19 71%
5657
data_algebra/util.py 84 7 92%
5758
data_algebra/yaml.py 119 15 87%
5859
-----------------------------------------------------
59-
TOTAL 3331 967 71%
60+
TOTAL 3347 972 71%
6061

6162

62-
========================== 34 passed in 7.43 seconds ===========================
63+
========================== 36 passed in 8.02 seconds ===========================

data_algebra/data_ops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,7 +1203,7 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
12031203
+ " " * (indent + 6)
12041204
)
12051205
else:
1206-
s = s + " _1 "
1206+
s = s + " _1, "
12071207
s = s + (
12081208
"by=" + self.by.__repr__() + ", jointype=" + self.jointype.__repr__() + ")"
12091209
)
@@ -1621,7 +1621,7 @@ def apply(self, other, **kwargs):
16211621
def __repr__(self):
16221622
return (
16231623
"NaturalJoin("
1624-
+ ", b="
1624+
+ "b="
16251625
+ self._b.__repr__()
16261626
+ ", by="
16271627
+ self._by.__repr__()

0 commit comments

Comments
 (0)