work on zero-length operators on project

JohnMount · JohnMount · commit 4e77c04f31ea · 2019-10-04T11:06:26.000-07:00
diff --git a/build/lib/data_algebra/pandas_model.py b/build/lib/data_algebra/pandas_model.py
@@ -128,28 +128,31 @@ def project_step(self, op, *, data_map, eval_env):
         # build an agg list: https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
         # https://stackoverflow.com/questions/44635626/rename-result-columns-from-pandas-aggregation-futurewarning-using-a-dict-with
         for (k, opk) in op.ops.items():
-            if len(opk.args) != 1:
+            if len(opk.args) > 1:
                 raise ValueError(
                     "non-trivial aggregation expression: " + str(k) + ": " + str(opk)
                 )
-            if not isinstance(opk.args[0], data_algebra.expr_rep.ColumnReference):
-                raise ValueError(
-                    "windows expression argument must be a column: "
-                    + str(k)
-                    + ": "
-                    + str(opk)
-                )
+            if len(opk.args) > 0:
+                if not isinstance(opk.args[0], data_algebra.expr_rep.ColumnReference):
+                    raise ValueError(
+                        "windows expression argument must be a column: "
+                        + str(k)
+                        + ": "
+                        + str(opk)
+                    )
         res = op.sources[0].eval_implementation(
             data_map=data_map, eval_env=eval_env, data_model=self
         )
+        res['_data_table_temp_col'] = 1
         if len(op.group_by) > 0:
             res = res.groupby(op.group_by)
-        remove_temp_col = False
         if len(op.ops) > 0:
-            cols = {k: res[str(opk.args[0])].agg(opk.op) for (k, opk) in op.ops.items()}
+            cols = {k: (res[str(opk.args[0])].agg(opk.op) if
+                        len(opk.args)>0 else
+                        res['_data_table_temp_col'].agg(opk.op))
+                    for (k, opk) in op.ops.items()}
         else:
-            cols = {'_data_table_temp_col': res[op.sources[0].column_names[0]].agg('sum') }
-            remove_temp_col = True
+            cols = {'_data_table_temp_col': res['_data_table_temp_col'].agg('sum') }
 
         # agg can return scalars, which then can't be made into a pandas.DataFrame
         def promote_scalar(v):
@@ -164,7 +167,7 @@ def promote_scalar(v):
         res = self.columns_to_frame(cols).reset_index(
             drop=len(op.group_by) < 1
         )  # grouping variables in the index
-        if remove_temp_col:
+        if '_data_table_temp_col' in res.columns:
             res = res.drop('_data_table_temp_col', 1)
         return res
 
diff --git a/coverage.txt b/coverage.txt
@@ -2,33 +2,34 @@
 platform darwin -- Python 3.6.9, pytest-5.0.1, py-1.8.0, pluggy-0.12.0
 rootdir: /Users/johnmount/Documents/work/data_algebra
 plugins: cov-2.7.1
-collected 36 items
+collected 37 items
 
 tests/test_R_yaml.py .                                                   [  2%]
 tests/test_apply.py .                                                    [  5%]
 tests/test_cdata1.py .                                                   [  8%]
-tests/test_cdata_example.py ....                                         [ 19%]
-tests/test_cols_used.py .                                                [ 22%]
+tests/test_cdata_example.py ....                                         [ 18%]
+tests/test_cols_used.py .                                                [ 21%]
 tests/test_dask.py ..                                                    [ 27%]
-tests/test_datatable.py .                                                [ 30%]
-tests/test_drop_columns.py .                                             [ 33%]
-tests/test_example_data_ops.py .                                         [ 36%]
-tests/test_exp.py .                                                      [ 38%]
-tests/test_export_neg.py .                                               [ 41%]
-tests/test_free_expr.py .                                                [ 44%]
-tests/test_if_else.py .                                                  [ 47%]
-tests/test_math.py .                                                     [ 50%]
-tests/test_natural_join.py .                                             [ 52%]
-tests/test_neg.py .                                                      [ 55%]
-tests/test_null_bad.py .                                                 [ 58%]
-tests/test_parse.py .                                                    [ 61%]
-tests/test_poject.py ..                                                  [ 66%]
-tests/test_project.py .                                                  [ 69%]
-tests/test_scatter_example.py .                                          [ 72%]
-tests/test_scoring_example.py .                                          [ 75%]
-tests/test_select_stacking.py .                                          [ 77%]
-tests/test_simple.py .....                                               [ 91%]
-tests/test_sqlite.py .                                                   [ 94%]
+tests/test_datatable.py .                                                [ 29%]
+tests/test_drop_columns.py .                                             [ 32%]
+tests/test_example_data_ops.py .                                         [ 35%]
+tests/test_exp.py .                                                      [ 37%]
+tests/test_export_neg.py .                                               [ 40%]
+tests/test_free_expr.py .                                                [ 43%]
+tests/test_if_else.py .                                                  [ 45%]
+tests/test_math.py .                                                     [ 48%]
+tests/test_natural_join.py .                                             [ 51%]
+tests/test_neg.py .                                                      [ 54%]
+tests/test_null_bad.py .                                                 [ 56%]
+tests/test_parse.py .                                                    [ 59%]
+tests/test_poject.py ..                                                  [ 64%]
+tests/test_project.py .                                                  [ 67%]
+tests/test_scatter_example.py .                                          [ 70%]
+tests/test_scoring_example.py .                                          [ 72%]
+tests/test_select_stacking.py .                                          [ 75%]
+tests/test_simple.py .....                                               [ 89%]
+tests/test_sqlite.py .                                                   [ 91%]
+tests/test_strat_example.py .                                            [ 94%]
 tests/test_window2.py .                                                  [ 97%]
 tests/test_window_fns.py .                                               [100%]
 
@@ -61,4 +62,4 @@ data_algebra/yaml.py                119     15    87%
 TOTAL                              3396   1021    70%
 
 
-========================== 36 passed in 7.65 seconds ===========================
+========================== 37 passed in 7.58 seconds ===========================
diff --git a/data_algebra/pandas_model.py b/data_algebra/pandas_model.py
@@ -128,28 +128,31 @@ def project_step(self, op, *, data_map, eval_env):
         # build an agg list: https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
         # https://stackoverflow.com/questions/44635626/rename-result-columns-from-pandas-aggregation-futurewarning-using-a-dict-with
         for (k, opk) in op.ops.items():
-            if len(opk.args) != 1:
+            if len(opk.args) > 1:
                 raise ValueError(
                     "non-trivial aggregation expression: " + str(k) + ": " + str(opk)
                 )
-            if not isinstance(opk.args[0], data_algebra.expr_rep.ColumnReference):
-                raise ValueError(
-                    "windows expression argument must be a column: "
-                    + str(k)
-                    + ": "
-                    + str(opk)
-                )
+            if len(opk.args) > 0:
+                if not isinstance(opk.args[0], data_algebra.expr_rep.ColumnReference):
+                    raise ValueError(
+                        "windows expression argument must be a column: "
+                        + str(k)
+                        + ": "
+                        + str(opk)
+                    )
         res = op.sources[0].eval_implementation(
             data_map=data_map, eval_env=eval_env, data_model=self
         )
+        res['_data_table_temp_col'] = 1
         if len(op.group_by) > 0:
             res = res.groupby(op.group_by)
-        remove_temp_col = False
         if len(op.ops) > 0:
-            cols = {k: res[str(opk.args[0])].agg(opk.op) for (k, opk) in op.ops.items()}
+            cols = {k: (res[str(opk.args[0])].agg(opk.op) if
+                        len(opk.args)>0 else
+                        res['_data_table_temp_col'].agg(opk.op))
+                    for (k, opk) in op.ops.items()}
         else:
-            cols = {'_data_table_temp_col': res[op.sources[0].column_names[0]].agg('sum') }
-            remove_temp_col = True
+            cols = {'_data_table_temp_col': res['_data_table_temp_col'].agg('sum') }
 
         # agg can return scalars, which then can't be made into a pandas.DataFrame
         def promote_scalar(v):
@@ -164,7 +167,7 @@ def promote_scalar(v):
         res = self.columns_to_frame(cols).reset_index(
             drop=len(op.group_by) < 1
         )  # grouping variables in the index
-        if remove_temp_col:
+        if '_data_table_temp_col' in res.columns:
             res = res.drop('_data_table_temp_col', 1)
         return res
 
diff --git a/dist/data_algebra-0.2.5-py3-none-any.whl b/dist/data_algebra-0.2.5-py3-none-any.whl
diff --git a/dist/data_algebra-0.2.5.tar.gz b/dist/data_algebra-0.2.5.tar.gz
diff --git a/tests/test_strat_example.py b/tests/test_strat_example.py
@@ -0,0 +1,33 @@
+
+# https://github.com/WinVector/pyvtreat/blob/master/Examples/StratifiedCrossPlan/StratifiedCrossPlan.ipynb
+
+import pandas
+import data_algebra.util
+from data_algebra.data_ops import *
+
+
+def test_strat_example():  # grouped project mixing column aggregations with a zero-argument op
+    prepared_stratified = pandas.DataFrame({
+        'y': [1, 0, 0, 1, 0, 0],
+        'g': [0, 0, 0, 1, 1, 1],  # grouping key: two groups of three rows each
+        'x': [1, 2, 3, 4, 5, 6]  # not referenced by the projection below
+    })
+
+    ops = describe_table(prepared_stratified). \
+        project({
+        'sum': 'y.sum()',
+        'mean': 'y.mean()',
+        'size': '_size()',  # takes no column argument (the zero-argument case)
+    },
+        group_by=['g'])
+
+    res = ops.transform(prepared_stratified)
+
+    expect = pandas.DataFrame({  # each group holds y values [1, 0, 0]
+        'g': [0, 1],
+        'sum': [1, 1],
+        'mean': [0.3333333333333333, 0.3333333333333333],
+        'size': [3, 3],
+        })
+
+    assert data_algebra.util.equivalent_frames(res, expect)  # presumably a frame-equality check; confirm its tolerance rules