Skip to content

Commit bf4d068

Browse files
committed
Add fully associative arrow representation
1 parent 9edeee1 commit bf4d068

File tree

13 files changed

+81
-10
lines changed

13 files changed

+81
-10
lines changed

Examples/WindowFunctions/Arrow.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
{
6060
"name": "stdout",
6161
"text": [
62-
"[{'g', 'v', 'x'} -> ['g', 'x', 'v', 'ngroup']]\n"
62+
"[{'g', 'x', 'v'} -> ['g', 'x', 'v', 'ngroup']]\n"
6363
],
6464
"output_type": "stream"
6565
}
@@ -114,7 +114,7 @@
114114
{
115115
"name": "stdout",
116116
"text": [
117-
"[{'g', 'v', 'ngroup', 'x'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v']]\n"
117+
"[{'x', 'g', 'v', 'ngroup'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v']]\n"
118118
],
119119
"output_type": "stream"
120120
}
@@ -174,7 +174,7 @@
174174
{
175175
"name": "stdout",
176176
"text": [
177-
"[{'g', 'x', 'row_number', 'v', 'shift_v', 'ngroup'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]\n"
177+
"[{'x', 'ngroup', 'g', 'row_number', 'v', 'shift_v'} -> ['g', 'x', 'v', 'ngroup', 'row_number', 'shift_v', 'size', 'max_v', 'min_v', 'sum_v', 'mean_v', 'count_v', 'size_v']]\n"
178178
],
179179
"output_type": "stream"
180180
}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ can perform data engineering in [`Pandas`](https://pandas.pydata.org) and genera
1313
Install `data_algebra` with either of:
1414

1515
* `pip install data_algebra`
16-
* `pip install https://github.com/WinVector/data_algebra/raw/master/dist/data_algebra-0.2.4.tar.gz`
16+
* `pip install https://github.com/WinVector/data_algebra/raw/master/dist/data_algebra-0.2.5.tar.gz`
1717

1818
# Announcement
1919

build/lib/data_algebra/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757

5858

5959
__docformat__ = "restructuredtext"
60-
__version__ = "0.2.4"
60+
__version__ = "0.2.5"
6161

6262
__doc__ = """
6363
`data_algebra`<https://github.com/WinVector/data_algebra> is a piped data wrangling system

build/lib/data_algebra/arrow.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
import copy
3+
4+
import pandas
5+
6+
import data_algebra.data_ops
7+
8+
9+
10+
class DataOpArrow:
    """Represent a section of operators as a categorical arrow.

    The arrow wraps an operator pipeline ``v`` that reads from exactly one
    table.  Its domain is ``incoming_columns`` (the columns ``v`` reports
    using from that table) and its codomain is ``outgoing_columns`` (the
    columns ``v`` produces).  Arrows can be applied to a
    ``pandas.DataFrame`` or composed with another arrow / pipeline through
    ``transform`` or the ``>>`` operator.
    """

    def __init__(self, v):
        """Wrap the operator pipeline ``v``.

        :param v: a ``data_algebra.data_ops.ViewRepresentation``.
        :raises TypeError: if ``v`` is not a ``ViewRepresentation``.
        :raises ValueError: if ``v.columns_used()`` does not describe
            exactly one table.
        """
        if not isinstance(v, data_algebra.data_ops.ViewRepresentation):
            raise TypeError("expected v to be data_algebra.data_ops")
        self.v = v
        cused = v.columns_used()
        if len(cused) != 1:
            raise ValueError("v must use exactly one table")
        # exactly one entry is present; its value is the collection of
        # columns required from the single input table
        k = next(iter(cused.keys()))
        self.incoming_columns = cused[k]
        self.outgoing_columns = v.column_names

    def _r_copy_replace(self, ops):
        """Return a re-written copy of ``ops`` with every TableDescription
        replaced by ``self.v``.

        Non-table nodes are shallow-copied and their ``sources`` rebuilt
        recursively, so ``ops`` itself is not modified.
        """
        if isinstance(ops, data_algebra.data_ops.TableDescription):
            return self.v
        node = copy.copy(ops)
        node.sources = [self._r_copy_replace(s) for s in node.sources]
        return node

    def transform(self, other):
        """Replace this arrow's input table with ``other``.

        :param other: either a ``pandas.DataFrame`` (the pipeline is applied
            to it and the result of ``self.v.transform`` is returned), or a
            ``DataOpArrow`` / ``ViewRepresentation`` (a new composite
            ``DataOpArrow`` is returned).
        :raises ValueError: if ``other`` lacks a required column, or the
            composition condition (incoming set == outgoing set) fails.
        :raises TypeError: if ``other`` is none of the accepted types.
        """
        required = set(self.incoming_columns)
        if isinstance(other, pandas.DataFrame):
            missing = required - set(other.columns)
            if missing:
                raise ValueError("missing required columns: " + str(missing))
            if set(other.columns) - required:
                # Select with an ordered list, not the raw column set:
                # pandas rejects set indexers, and a list built from the
                # frame's own columns keeps the column order deterministic.
                # dict.fromkeys de-duplicates while preserving order.
                keep = list(
                    dict.fromkeys(c for c in other.columns if c in required)
                )
                other = other[keep]
            return self.v.transform(other)
        if isinstance(other, data_algebra.data_ops.ViewRepresentation):
            other = DataOpArrow(other)
        if not isinstance(other, DataOpArrow):
            raise TypeError("other must be a DataOpArrow")
        missing = required - set(other.outgoing_columns)
        if missing:
            raise ValueError("missing required columns: " + str(missing))
        if set(other.outgoing_columns) - required:
            # extra columns, in a strict categorical formulation we would
            # reject this. instead insert a select columns node to get the
            # match (columns kept in the upstream arrow's output order so
            # the inserted node is deterministic)
            keep = [c for c in other.outgoing_columns if c in required]
            other = DataOpArrow(other.v.select_columns(keep))
        # check categorical arrow composition conditions
        if required != set(other.outgoing_columns):
            raise ValueError("arrow composition conditions not met (incoming column set doesn't match outgoing)")
        return DataOpArrow(other._r_copy_replace(self.v))

    def __rshift__(self, other):  # override self >> other
        """``self >> other``: feed this arrow into ``other``."""
        return other.transform(self)

    def __rrshift__(self, other):  # override other >> self
        """``other >> self``: feed ``other`` into this arrow."""
        return self.transform(other)

    def __repr__(self):
        return "DataOpArrow(" + self.v.__repr__() + ")"

    def __str__(self):
        """Render as ``[incoming_columns -> outgoing_columns]``."""
        return "[" + str(self.incoming_columns) + " -> " + str(self.outgoing_columns) + "]"

build/lib/data_algebra/pipe.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
class PipeStep:
23
"""class to extend to make pipe transform stages
34
Examples:

coverage.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ data_algebra/PostgreSQL.py 21 4 81%
3939
data_algebra/SQLite.py 91 5 95%
4040
data_algebra/SparkSQL.py 21 21 0%
4141
data_algebra/__init__.py 36 10 72%
42+
data_algebra/arrow.py 49 49 0%
4243
data_algebra/cdata.py 105 21 80%
4344
data_algebra/cdata_impl.py 152 60 61%
4445
data_algebra/dask_model.py 121 23 81%
@@ -57,7 +58,7 @@ data_algebra/pipe.py 65 19 71%
5758
data_algebra/util.py 84 7 92%
5859
data_algebra/yaml.py 119 15 87%
5960
-----------------------------------------------------
60-
TOTAL 3347 972 71%
61+
TOTAL 3396 1021 70%
6162

6263

63-
========================== 36 passed in 7.56 seconds ===========================
64+
========================== 36 passed in 7.65 seconds ===========================

data_algebra.egg-info/PKG-INFO

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Metadata-Version: 2.1
22
Name: data-algebra
3-
Version: 0.2.4
3+
Version: 0.2.5
44
Summary: data_algebra is a data manipulation language that can both generate SQL queries and work on Pandas DataFrames.
55
Home-page: https://github.com/WinVector/data_algebra
66
Author: John Mount

data_algebra.egg-info/SOURCES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ data_algebra/PostgreSQL.py
44
data_algebra/SQLite.py
55
data_algebra/SparkSQL.py
66
data_algebra/__init__.py
7+
data_algebra/arrow.py
78
data_algebra/cdata.py
89
data_algebra/cdata_impl.py
910
data_algebra/dask_model.py

data_algebra/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757

5858

5959
__docformat__ = "restructuredtext"
60-
__version__ = "0.2.4"
60+
__version__ = "0.2.5"
6161

6262
__doc__ = """
6363
`data_algebra`<https://github.com/WinVector/data_algebra> is a piped data wrangling system

dist/data_algebra-0.2.4.tar.gz

-44.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)