Add fully associative arrow representation

JohnMount · JohnMount · commit bf4d0689c90b · 2019-10-03T11:25:47.000-07:00
diff --git a/Examples/WindowFunctions/Arrow.ipynb b/Examples/WindowFunctions/Arrow.ipynb
@@ -59,7 +59,7 @@
     {
      "name": "stdout",
      "text": [
-      "[{'g', 'v', 'x'} -> ['g', 'x', 'v', 'ngroup']]\n"
+      "[{'g', 'x', 'v'} -> ['g', 'x', 'v', 'ngroup']]\n"
      ],
      "output_type": "stream"
     }
@@ -114,7 +114,7 @@
     {
      "name": "stdout",
      "text": [
-      "[{'g', 'v', 'ngroup', 'x'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v']]\n"
+      "[{'x', 'g', 'v', 'ngroup'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v']]\n"
      ],
      "output_type": "stream"
     }
@@ -174,7 +174,7 @@
     {
      "name": "stdout",
      "text": [
-      "[{'g', 'x', 'row_number', 'v', 'shift_v', 'ngroup'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]\n"
+      "[{'x', 'ngroup', 'g', 'row_number', 'v', 'shift_v'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]\n"
      ],
      "output_type": "stream"
     }
diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@ can perform data engineering in [`Pandas`](https://pandas.pydata.org) and genera
 Install `data_algebra` with either of:
 
   * `pip install data_algebra`
-  * `pip install https://github.com/WinVector/data_algebra/raw/master/dist/data_algebra-0.2.4.tar.gz`
+  * `pip install https://github.com/WinVector/data_algebra/raw/master/dist/data_algebra-0.2.5.tar.gz`
 
 # Announcement
 
diff --git a/build/lib/data_algebra/__init__.py b/build/lib/data_algebra/__init__.py
@@ -57,7 +57,7 @@
 
 
 __docformat__ = "restructuredtext"
-__version__ = "0.2.4"
+__version__ = "0.2.5"
 
 __doc__ = """
 `data_algebra`<https://github.com/WinVector/data_algebra> is a piped data wrangling system
diff --git a/build/lib/data_algebra/arrow.py b/build/lib/data_algebra/arrow.py
@@ -0,0 +1,102 @@
+
+import copy
+
+import pandas
+
+import data_algebra.data_ops
+
+
+
+class DataOpArrow:
+    """Represent a section of operators as a categorical arrow.
+
+    The arrow's domain is the set of columns the wrapped pipeline uses from
+    its single source table (incoming_columns); its codomain is the
+    pipeline's result columns (outgoing_columns).
+    """
+
+    def __init__(self, v):
+        """Wrap the operator pipeline v as an arrow.
+
+        :param v: data_algebra.data_ops.ViewRepresentation using exactly one table.
+        :raises TypeError: if v is not a ViewRepresentation.
+        :raises ValueError: if v does not use exactly one table.
+        """
+        if not isinstance(v, data_algebra.data_ops.ViewRepresentation):
+            raise TypeError(
+                "expected v to be a data_algebra.data_ops.ViewRepresentation"
+            )
+        self.v = v
+        cused = v.columns_used()
+        if len(cused) != 1:
+            raise ValueError("v must use exactly one table")
+        # exactly one entry, so take its key without building a list
+        table_key = next(iter(cused.keys()))
+        self.incoming_columns = cused[table_key]
+        self.outgoing_columns = v.column_names
+
+    def _r_copy_replace(self, ops):
+        """Return a copy of ops with each TableDescription replaced by self.v.
+
+        Nodes are shallow-copied before their sources are rewritten, so the
+        sources list of ops itself is left untouched.
+        """
+        if isinstance(ops, data_algebra.data_ops.TableDescription):
+            return self.v
+        node = copy.copy(ops)
+        node.sources = [self._r_copy_replace(s) for s in node.sources]
+        return node
+
+    def transform(self, other):
+        """Apply this arrow to data, or compose it after another arrow.
+
+        :param other: a pandas.DataFrame to run self.v on (columns this arrow
+            does not use are dropped first), or a DataOpArrow /
+            ViewRepresentation to compose with (other first, then self).
+        :return: self.v.transform(...) result for a DataFrame, otherwise a
+            new DataOpArrow for the composite pipeline.
+        :raises ValueError: if other does not supply every incoming column.
+        :raises TypeError: if other is none of the supported types.
+        """
+        incoming = set(self.incoming_columns)
+        if isinstance(other, pandas.DataFrame):
+            missing = incoming - set(other.columns)
+            if len(missing) > 0:
+                raise ValueError("missing required columns: " + str(missing))
+            if len(set(other.columns) - incoming) > 0:
+                # select with a list in the frame's own column order:
+                # newer pandas rejects a set as a column indexer
+                other = other[[c for c in other.columns if c in incoming]]
+            return self.v.transform(other)
+        if isinstance(other, data_algebra.data_ops.ViewRepresentation):
+            other = DataOpArrow(other)
+        if not isinstance(other, DataOpArrow):
+            raise TypeError("other must be a DataOpArrow")
+        missing = incoming - set(other.outgoing_columns)
+        if len(missing) > 0:
+            raise ValueError("missing required columns: " + str(missing))
+        if len(set(other.outgoing_columns) - incoming) > 0:
+            # extra columns, in a strict categorical formulation we would
+            # reject this. instead insert a select columns node to get the
+            # match, keeping other's column order so the result is stable
+            keep = [c for c in other.outgoing_columns if c in incoming]
+            other = DataOpArrow(other.v.select_columns(keep))
+        # check categorical arrow composition conditions
+        if incoming != set(other.outgoing_columns):
+            raise ValueError("arrow composition conditions not met (incoming column set doesn't match outgoing)")
+        return DataOpArrow(other._r_copy_replace(self.v))
+
+    def __rshift__(self, other):  # override self >> other
+        """Return other.transform(self), i.e. feed this arrow into other."""
+        return other.transform(self)
+
+    def __rrshift__(self, other):  # override other >> self
+        """Return self.transform(other), i.e. feed other into this arrow."""
+        return self.transform(other)
+
+    def __repr__(self):
+        return "DataOpArrow(" + self.v.__repr__() + ")"
+
+    def __str__(self):
+        """Render as "[incoming columns -> outgoing columns]"."""
+        return "[" + str(self.incoming_columns) + " -> " + str(self.outgoing_columns) + "]"
diff --git a/build/lib/data_algebra/pipe.py b/build/lib/data_algebra/pipe.py
@@ -1,3 +1,4 @@
+
 class PipeStep:
     """class to extend to make pipe transform stages
     Examples:
diff --git a/coverage.txt b/coverage.txt
@@ -39,6 +39,7 @@ data_algebra/PostgreSQL.py           21      4    81%
 data_algebra/SQLite.py               91      5    95%
 data_algebra/SparkSQL.py             21     21     0%
 data_algebra/__init__.py             36     10    72%
+data_algebra/arrow.py                49     49     0%
 data_algebra/cdata.py               105     21    80%
 data_algebra/cdata_impl.py          152     60    61%
 data_algebra/dask_model.py          121     23    81%
@@ -57,7 +58,7 @@ data_algebra/pipe.py                 65     19    71%
 data_algebra/util.py                 84      7    92%
 data_algebra/yaml.py                119     15    87%
 -----------------------------------------------------
-TOTAL                              3347    972    71%
+TOTAL                              3396   1021    70%
 
 
-========================== 36 passed in 7.56 seconds ===========================
+========================== 36 passed in 7.65 seconds ===========================
diff --git a/data_algebra.egg-info/PKG-INFO b/data_algebra.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: data-algebra
-Version: 0.2.4
+Version: 0.2.5
 Summary: data_algebra is a data manipulation language that can both generate SQL queries and work on Pandas DataFrames. 
 Home-page: https://github.com/WinVector/data_algebra
 Author: John Mount
diff --git a/data_algebra.egg-info/SOURCES.txt b/data_algebra.egg-info/SOURCES.txt
@@ -4,6 +4,7 @@ data_algebra/PostgreSQL.py
 data_algebra/SQLite.py
 data_algebra/SparkSQL.py
 data_algebra/__init__.py
+data_algebra/arrow.py
 data_algebra/cdata.py
 data_algebra/cdata_impl.py
 data_algebra/dask_model.py
diff --git a/data_algebra/__init__.py b/data_algebra/__init__.py
@@ -57,7 +57,7 @@
 
 
 __docformat__ = "restructuredtext"
-__version__ = "0.2.4"
+__version__ = "0.2.5"
 
 __doc__ = """
 `data_algebra`<https://github.com/WinVector/data_algebra> is a piped data wrangling system
diff --git a/dist/data_algebra-0.2.4.tar.gz b/dist/data_algebra-0.2.4.tar.gz
diff --git a/dist/data_algebra-0.2.5-py3-none-any.whl b/dist/data_algebra-0.2.5-py3-none-any.whl
diff --git a/dist/data_algebra-0.2.5.tar.gz b/dist/data_algebra-0.2.5.tar.gz
diff --git a/setup.py b/setup.py
@@ -21,7 +21,7 @@
 
 setuptools.setup(
     name='data_algebra',
-    version='0.2.4',
+    version='0.2.5',
     author='John Mount',
     author_email='jmount@win-vector.com',
     url='https://github.com/WinVector/data_algebra',

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@`
`59`	`59`	`{`
`60`	`60`	`"name": "stdout",`
`61`	`61`	`"text": [`
`62`		`- "[{'g', 'v', 'x'} -> ['g', 'x', 'v', 'ngroup']]\n"`
	`62`	`+ "[{'g', 'x', 'v'} -> ['g', 'x', 'v', 'ngroup']]\n"`
`63`	`63`	`],`
`64`	`64`	`"output_type": "stream"`
`65`	`65`	`}`
`@@ -114,7 +114,7 @@`
`114`	`114`	`{`
`115`	`115`	`"name": "stdout",`
`116`	`116`	`"text": [`
`117`		`- "[{'g', 'v', 'ngroup', 'x'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v']]\n"`
	`117`	`+ "[{'x', 'g', 'v', 'ngroup'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v']]\n"`
`118`	`118`	`],`
`119`	`119`	`"output_type": "stream"`
`120`	`120`	`}`
`@@ -174,7 +174,7 @@`
`174`	`174`	`{`
`175`	`175`	`"name": "stdout",`
`176`	`176`	`"text": [`
`177`		`- "[{'g', 'x', 'row_number', 'v', 'shift_v', 'ngroup'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]\n"`
	`177`	`+ "[{'x', 'ngroup', 'g', 'row_number', 'v', 'shift_v'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]\n"`
`178`	`178`	`],`
`179`	`179`	`"output_type": "stream"`
`180`	`180`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+`
`1`	`2`	`class PipeStep:`
`2`	`3`	`"""class to extend to make pipe transform stages`
`3`	`4`	`Examples:`