Skip to content

Commit 449c3cd

Browse files
committed
column name re-use on joins
1 parent 9405c15 commit 449c3cd

File tree

8 files changed

+133
-84
lines changed

8 files changed

+133
-84
lines changed

build/lib/data_algebra/arrow.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ def __init__(
9696
"free_table_key must be a table key used in the pipeline"
9797
)
9898
self.free_table_key = free_table_key
99-
self.incoming_columns = t_used[free_table_key].column_names.copy()
99+
self.incoming_columns = list(t_used[free_table_key].column_names)
100100
self.disallowed_columns = pipeline.forbidden_columns()[free_table_key]
101-
self.outgoing_columns = pipeline.column_names.copy()
101+
self.outgoing_columns = list(pipeline.column_names)
102102
self.outgoing_columns.sort()
103103
Arrow.__init__(self)
104104

build/lib/data_algebra/data_ops.py

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55

66
from abc import ABC
7-
from typing import Iterable, Set, Dict, List, Optional, Union
7+
from typing import Iterable, Set, Dict, List, Optional, Tuple, Union
88
import numbers
99
import re
1010

@@ -61,32 +61,36 @@ class ViewRepresentation(OperatorPlatform, ABC):
6161
"""Structure to represent the columns of a query or a table.
6262
Abstract base class."""
6363

64-
column_names: List[str]
65-
sources: List[
64+
column_names: Tuple[str]
65+
sources: Tuple[
6666
"ViewRepresentation"
6767
] # https://www.python.org/dev/peps/pep-0484/#forward-references
6868

6969
def __init__(
7070
self,
7171
column_names: Iterable[str],
7272
*,
73-
sources: Optional[List["ViewRepresentation"]] = None,
73+
sources: Optional[Iterable["ViewRepresentation"]] = None,
7474
node_name: str,
7575
):
76-
if isinstance(column_names, str):
77-
column_names = [column_names]
78-
else:
79-
column_names = list(column_names) # make sure a list and a disjoint copy
80-
self.column_names = column_names
81-
assert len(self.column_names) > 0
82-
for v in self.column_names:
76+
# don't let instances masquerade as iterables
77+
assert not isinstance(column_names, str)
78+
assert not isinstance(sources, OperatorPlatform)
79+
if not isinstance(column_names, tuple):
80+
column_names = tuple(column_names)
81+
assert len(column_names) > 0
82+
for v in column_names:
8383
assert isinstance(v, str)
8484
assert len(column_names) == len(set(column_names))
85+
self.column_names = column_names
8586
if sources is None:
86-
sources = []
87+
sources = ()
88+
else:
89+
if not isinstance(sources, tuple):
90+
sources = tuple(sources)
8791
for si in sources:
8892
assert isinstance(si, ViewRepresentation)
89-
self.sources = [si for si in sources]
93+
self.sources = sources
9094
OperatorPlatform.__init__(self, node_name=node_name)
9195

9296
def column_map(self) -> collections.OrderedDict:
@@ -155,7 +159,7 @@ def columns_used_from_sources(self, using=None):
155159
raise NotImplementedError("base method called")
156160

157161
def columns_produced(self):
158-
return self.column_names.copy()
162+
return list(self.column_names)
159163

160164
def _columns_used_implementation(self, *, using, columns_currently_using_records):
161165
self_merged_rep_id = self.merged_rep_id()
@@ -383,7 +387,7 @@ def as_table_description(
383387
):
384388
return TableDescription(
385389
table_name=table_name,
386-
column_names=self.column_names.copy(),
390+
column_names=self.column_names,
387391
qualifiers=qualifiers,
388392
)
389393

@@ -937,7 +941,7 @@ def __init__(
937941
if isinstance(reverse, str):
938942
reverse = [reverse]
939943
self.reverse = reverse
940-
column_names = source.column_names.copy()
944+
column_names = list(source.column_names)
941945
consumed_cols = set()
942946
for (k, o) in parsed_ops.items():
943947
o.get_column_names(consumed_cols)
@@ -1547,7 +1551,7 @@ def _equiv_nodes(self, other):
15471551
return True
15481552

15491553
def columns_used_from_sources(self, using=None):
1550-
cols = set(self.column_names.copy())
1554+
cols = set(self.column_names)
15511555
if using is None:
15521556
return [cols]
15531557
cols = cols.intersection(using).union(self.order_columns)
@@ -1705,7 +1709,7 @@ def __init__(self, a, b, *, by, jointype, check_all_common_keys_in_by=False):
17051709
"Different definition of table object on a/b for: " + k
17061710
)
17071711
# check columns
1708-
column_names = a.column_names.copy()
1712+
column_names = list(a.column_names)
17091713
columns_seen = set(column_names)
17101714
for ci in b.column_names:
17111715
if ci not in columns_seen:
@@ -1729,6 +1733,12 @@ def __init__(self, a, b, *, by, jointype, check_all_common_keys_in_by=False):
17291733
"check_all_common_keys_in_by set, and the following common keys are are not in the by-clause: "
17301734
+ str(missing_common)
17311735
)
1736+
# try to re-use column names if possible, saves space in deeply nested join trees.
1737+
column_names = tuple(column_names)
1738+
if isinstance(a.column_names, tuple) and (set(column_names) == set(a.column_names)):
1739+
column_names = a.column_names
1740+
elif isinstance(b.column_names, tuple) and (set(column_names) == set(b.column_names)):
1741+
column_names = b.column_names
17321742
ViewRepresentation.__init__(
17331743
self,
17341744
column_names=column_names,
@@ -1825,7 +1835,7 @@ def __init__(self, a, b, *, id_column="table_name", a_name="a", b_name="b"):
18251835
raise ValueError("a and b should have same set of column names")
18261836
if id_column is not None and id_column in sources[0].column_names:
18271837
raise ValueError("id_column should not be an input table column name")
1828-
column_names = sources[0].column_names.copy()
1838+
column_names = list(sources[0].column_names)
18291839
if id_column is not None:
18301840
assert id_column not in column_names
18311841
column_names.append(id_column)

coverage.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ data_algebra/arrow.py 141 32 77%
110110
data_algebra/cdata.py 339 66 81%
111111
data_algebra/connected_components.py 22 0 100%
112112
data_algebra/data_model.py 40 17 58%
113-
data_algebra/data_ops.py 1221 208 83%
113+
data_algebra/data_ops.py 1229 208 83%
114114
data_algebra/data_ops_types.py 84 34 60%
115115
data_algebra/data_ops_utils.py 49 7 86%
116116
data_algebra/db_model.py 845 86 90%
@@ -126,7 +126,7 @@ data_algebra/python3_lark.py 1 0 100%
126126
data_algebra/test_util.py 215 42 80%
127127
data_algebra/util.py 137 29 79%
128128
----------------------------------------------------------
129-
TOTAL 5059 897 82%
129+
TOTAL 5067 897 82%
130130

131131

132-
============================= 228 passed in 22.34s =============================
132+
============================= 228 passed in 23.06s =============================

data_algebra/data_ops.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ def __init__(
7373
sources: Optional[Iterable["ViewRepresentation"]] = None,
7474
node_name: str,
7575
):
76+
# don't let instances masquerade as iterables
77+
assert not isinstance(column_names, str)
78+
assert not isinstance(sources, OperatorPlatform)
7679
if not isinstance(column_names, tuple):
7780
column_names = tuple(column_names)
7881
assert len(column_names) > 0
@@ -1730,6 +1733,12 @@ def __init__(self, a, b, *, by, jointype, check_all_common_keys_in_by=False):
17301733
"check_all_common_keys_in_by set, and the following common keys are are not in the by-clause: "
17311734
+ str(missing_common)
17321735
)
1736+
# try to re-use column names if possible, saves space in deeply nested join trees.
1737+
column_names = tuple(column_names)
1738+
if isinstance(a.column_names, tuple) and (set(column_names) == set(a.column_names)):
1739+
column_names = a.column_names
1740+
elif isinstance(b.column_names, tuple) and (set(column_names) == set(b.column_names)):
1741+
column_names = b.column_names
17331742
ViewRepresentation.__init__(
17341743
self,
17351744
column_names=column_names,
138 Bytes
Binary file not shown.

dist/data_algebra-1.1.0.tar.gz

130 Bytes
Binary file not shown.

docs/data_algebra/arrow.html

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,9 +237,9 @@ <h1 class="modulename">
237237
<span class="s2">&quot;free_table_key must be a table key used in the pipeline&quot;</span>
238238
<span class="p">)</span>
239239
<span class="bp">self</span><span class="o">.</span><span class="n">free_table_key</span> <span class="o">=</span> <span class="n">free_table_key</span>
240-
<span class="bp">self</span><span class="o">.</span><span class="n">incoming_columns</span> <span class="o">=</span> <span class="n">t_used</span><span class="p">[</span><span class="n">free_table_key</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
240+
<span class="bp">self</span><span class="o">.</span><span class="n">incoming_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">t_used</span><span class="p">[</span><span class="n">free_table_key</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span>
241241
<span class="bp">self</span><span class="o">.</span><span class="n">disallowed_columns</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">forbidden_columns</span><span class="p">()[</span><span class="n">free_table_key</span><span class="p">]</span>
242-
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">column_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
242+
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">pipeline</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span>
243243
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span><span class="o">.</span><span class="n">sort</span><span class="p">()</span>
244244
<span class="n">Arrow</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
245245

@@ -774,9 +774,9 @@ <h1 class="modulename">
774774
<span class="s2">&quot;free_table_key must be a table key used in the pipeline&quot;</span>
775775
<span class="p">)</span>
776776
<span class="bp">self</span><span class="o">.</span><span class="n">free_table_key</span> <span class="o">=</span> <span class="n">free_table_key</span>
777-
<span class="bp">self</span><span class="o">.</span><span class="n">incoming_columns</span> <span class="o">=</span> <span class="n">t_used</span><span class="p">[</span><span class="n">free_table_key</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
777+
<span class="bp">self</span><span class="o">.</span><span class="n">incoming_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">t_used</span><span class="p">[</span><span class="n">free_table_key</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span>
778778
<span class="bp">self</span><span class="o">.</span><span class="n">disallowed_columns</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">forbidden_columns</span><span class="p">()[</span><span class="n">free_table_key</span><span class="p">]</span>
779-
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">column_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
779+
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">pipeline</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span>
780780
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span><span class="o">.</span><span class="n">sort</span><span class="p">()</span>
781781
<span class="n">Arrow</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
782782

@@ -984,9 +984,9 @@ <h1 class="modulename">
984984
<span class="s2">&quot;free_table_key must be a table key used in the pipeline&quot;</span>
985985
<span class="p">)</span>
986986
<span class="bp">self</span><span class="o">.</span><span class="n">free_table_key</span> <span class="o">=</span> <span class="n">free_table_key</span>
987-
<span class="bp">self</span><span class="o">.</span><span class="n">incoming_columns</span> <span class="o">=</span> <span class="n">t_used</span><span class="p">[</span><span class="n">free_table_key</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
987+
<span class="bp">self</span><span class="o">.</span><span class="n">incoming_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">t_used</span><span class="p">[</span><span class="n">free_table_key</span><span class="p">]</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span>
988988
<span class="bp">self</span><span class="o">.</span><span class="n">disallowed_columns</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">forbidden_columns</span><span class="p">()[</span><span class="n">free_table_key</span><span class="p">]</span>
989-
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">column_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
989+
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">pipeline</span><span class="o">.</span><span class="n">column_names</span><span class="p">)</span>
990990
<span class="bp">self</span><span class="o">.</span><span class="n">outgoing_columns</span><span class="o">.</span><span class="n">sort</span><span class="p">()</span>
991991
<span class="n">Arrow</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
992992
</pre></div>

0 commit comments

Comments
 (0)