Skip to content

Commit 84f08dc

Browse files
committed
dagify table defs
1 parent 687c145 commit 84f08dc

File tree

10 files changed

+112
-79
lines changed

10 files changed

+112
-79
lines changed

build/lib/data_algebra/data_ops.py

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,26 @@ def get_column_symbols(self):
131131

132132
# characterization
133133

134-
def get_tables(self, tables=None):
134+
def get_tables(self, *, replacements=None):
135135
"""Get a dictionary of all tables used in an operator DAG,
136136
raise an exception if the values are not consistent."""
137-
if tables is None:
138-
tables = {}
139-
for s in self.sources:
140-
tables = s.get_tables(tables)
137+
tables = {}
138+
for i in range(len(self.sources)):
139+
s = self.sources[i]
140+
if isinstance(s, TableDescription):
141+
if replacements is not None and s.key in replacements:
142+
orig_table = replacements[s.key]
143+
if s.column_set != orig_table.column_set:
144+
raise ValueError("table " + s.key + " has two incompatible definitions")
145+
self.sources[i] = orig_table
146+
s = orig_table
147+
ti = s.get_tables(replacements=replacements)
148+
for (k, v) in ti.items():
149+
if k in tables.keys():
150+
if not tables[k] is v:
151+
raise ValueError("Table " + k + " has two different representation objects")
152+
else:
153+
tables[k] = v
141154
return tables
142155

143156
def columns_used_from_sources(self, using=None):
@@ -533,22 +546,12 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
533546
s = s + ")"
534547
return s
535548

536-
def get_tables(self, tables=None):
549+
def get_tables(self, *, replacements=None):
537550
"""get a dictionary of all tables used in an operator DAG,
538551
raise an exception if the values are not consistent"""
539-
if tables is None:
540-
tables = {}
541-
if self.key in tables.keys():
542-
other = tables[self.key]
543-
if self.column_set != other.column_set:
544-
raise ValueError(
545-
"Two tables with key " + self.key + " have different column sets."
546-
)
547-
if other is not self:
548-
raise ValueError("Two different table definitions for table: " + self.key)
549-
else:
550-
tables[self.key] = self
551-
return tables
552+
if replacements is not None and self.key in replacements.keys():
553+
return {self.key: replacements[self.key]}
554+
return {self.key: self}
552555

553556
def eval_implementation(self, *, data_map, eval_env, data_model):
554557
return data_model.table_step(op=self, data_map=data_map, eval_env=eval_env)
@@ -1115,7 +1118,7 @@ class NaturalJoinNode(ViewRepresentation):
11151118

11161119
def __init__(self, a, b, *, by=None, jointype="INNER"):
11171120
a_tables = a.get_tables()
1118-
b_tables = b.get_tables()
1121+
b_tables = b.get_tables(replacements=a_tables)
11191122
common_keys = set(a_tables.keys()).intersection(b_tables.keys())
11201123
for k in common_keys:
11211124
if a_tables[k] is not b_tables[k]:
@@ -1195,7 +1198,6 @@ def eval_implementation(self, *, data_map, eval_env, data_model):
11951198

11961199

11971200
class ConvertRecordsNode(ViewRepresentation):
1198-
blocks_out_table: TableDescription
11991201

12001202
def __init__(self, source, record_map, *, blocks_out_table=None):
12011203
sources = [source]
@@ -1206,24 +1208,28 @@ def __init__(self, source, record_map, *, blocks_out_table=None):
12061208
+ [c for c in record_map.blocks_out.control_table.columns],
12071209
)
12081210
if blocks_out_table is not None:
1209-
sources = sources + [blocks_out_table]
12101211
# check blocks_out_table is a direct table
12111212
if not isinstance(blocks_out_table, TableDescription):
12121213
raise TypeError("expected blocks_out_table to be a data_algebra.data_ops.TableDescription")
1213-
# check it is the exact same definition object if already present
1214+
# ensure table is the exact same definition object if already present
12141215
a_tables = source.get_tables()
12151216
if blocks_out_table.key in a_tables.keys():
12161217
a_table = a_tables[blocks_out_table.key]
1218+
if not a_table.column_set == blocks_out_table.column_set:
1219+
raise ValueError("blocks_out_table column definition does not match table already in op DAG")
12171220
if not blocks_out_table is a_table:
1218-
raise ValueError("different definiton object for: " + blocks_out_table.key)
1221+
blocks_out_table = a_table
1222+
# check blocks_out_table is a direct table
1223+
if not isinstance(blocks_out_table, TableDescription):
1224+
raise TypeError("expected blocks_out_table to be a data_algebra.data_ops.TableDescription")
12191225
# check it has at least the columns we expect
12201226
expect = [c for c in record_map.blocks_out.record_keys] + \
12211227
[c for c in record_map.blocks_out.control_table.columns]
12221228
unknown = set(expect) - set(blocks_out_table.column_names)
12231229
if len(unknown) > 0:
12241230
raise ValueError("blocks_out_table missing columns: " + str(unknown))
1231+
sources = sources + [blocks_out_table]
12251232
self.record_map = record_map
1226-
self.blocks_out_table = blocks_out_table
12271233
unknown = set(self.record_map.columns_needed) - set(source.column_names)
12281234
if len(unknown) > 0:
12291235
raise ValueError("missing required columns: " + str(unknown))
@@ -1244,8 +1250,11 @@ def collect_representation_implementation(self, *, pipeline=None, dialect="Pytho
12441250
od["op"] = "ConvertRecords"
12451251
od["record_map"] = self.record_map.to_simple_obj()
12461252
od['blocks_out_table'] = None
1247-
if self.blocks_out_table is not None:
1248-
od['blocks_out_table'] = self.blocks_out_table.collect_representation(dialect=dialect)[0]
1253+
blocks_out_table = None
1254+
if len(self.sources) > 1:
1255+
blocks_out_table = self.sources[1]
1256+
if blocks_out_table is not None:
1257+
od['blocks_out_table'] = blocks_out_table.collect_representation(dialect=dialect)[0]
12491258
pipeline.insert(0, od)
12501259
return self.sources[0].collect_representation_implementation(
12511260
pipeline=pipeline, dialect=dialect
@@ -1261,10 +1270,13 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
12611270
)
12621271
rm_str = self.record_map.__repr__()
12631272
rm_str = re.sub("\n", "\n ", rm_str)
1264-
s = s + ("convert_record(" + rm_str +
1273+
s = s + "convert_record(" + rm_str
1274+
if len(self.sources) > 1:
1275+
s = s + (
12651276
"\n, blocks_out_table=" +
1266-
self.blocks_out_table.to_python_implementation(indent=indent+3, strict=strict) +
1267-
")")
1277+
self.sources[1].to_python_implementation(indent=indent+3, strict=strict)
1278+
)
1279+
s = s + ")"
12681280
return s
12691281

12701282
def to_sql_implementation(self, db_model, *, using, temp_id_source):
@@ -1279,7 +1291,7 @@ def to_sql_implementation(self, db_model, *, using, temp_id_source):
12791291
res = db_model.row_recs_to_blocks_query(
12801292
res,
12811293
record_spec=self.record_map.blocks_out,
1282-
record_view=self.blocks_out_table,
1294+
record_view=self.sources[1],
12831295
)
12841296
return res
12851297

build/lib/data_algebra/db_model.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,8 @@ def table_def_to_sql(self, table_def, *, using=None, force_sql=False):
192192
missing = using - table_def.column_set
193193
if len(missing) > 0:
194194
raise KeyError("referred to unknown columns: " + str(missing))
195-
cols = [self.quote_identifier(ci) for ci in using]
195+
cols_using = [c for c in table_def.column_names if c in using]
196+
cols = [self.quote_identifier(ci) for ci in cols_using]
196197
sql_str = (
197198
"SELECT "
198199
+ ", ".join(cols)
@@ -252,7 +253,8 @@ def extend_to_sql(self, extend_node, *, using=None, temp_id_source=None):
252253
]
253254
origcols = [k for k in using if k not in subops.keys()]
254255
if len(origcols) > 0:
255-
derived = [self.quote_identifier(ci) for ci in set(origcols)] + derived
256+
ordered_orig = [c for c in extend_node.column_names if c in set(origcols)]
257+
derived = [self.quote_identifier(ci) for ci in ordered_orig] + derived
256258
sql_str = (
257259
"SELECT "
258260
+ ", ".join(derived)

build/lib/data_algebra/pandas_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def table_step(self, op, *, data_map, eval_env):
3939
# check all columns we expect are present
4040
columns_using = op.column_names
4141
if op.columns_currently_used is not None and len(op.columns_currently_used) > 0:
42-
columns_using = [c for c in op.columns_currently_used]
42+
columns_using = [c for c in columns_using if c in op.columns_currently_used]
4343
missing = set(columns_using) - set([c for c in df.columns])
4444
if len(missing) > 0:
4545
raise ValueError("missing required columns: " + str(missing))

coverage.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,11 @@ data_algebra/cdata.py 103 20 81%
3838
data_algebra/cdata_impl.py 152 60 61%
3939
data_algebra/dask_model.py 121 23 81%
4040
data_algebra/data_model.py 41 15 63%
41-
data_algebra/data_ops.py 800 166 79%
41+
data_algebra/data_ops.py 813 173 79%
4242
data_algebra/data_pipe.py 183 41 78%
4343
data_algebra/data_types.py 39 19 51%
4444
data_algebra/datatable_model.py 131 81 38%
45-
data_algebra/db_model.py 362 83 77%
45+
data_algebra/db_model.py 364 83 77%
4646
data_algebra/diagram.py 52 52 0%
4747
data_algebra/env.py 46 7 85%
4848
data_algebra/expr.py 20 4 80%
@@ -53,7 +53,7 @@ data_algebra/pipe.py 65 19 71%
5353
data_algebra/util.py 84 7 92%
5454
data_algebra/yaml.py 120 15 88%
5555
-----------------------------------------------------
56-
TOTAL 2928 795 73%
56+
TOTAL 2943 802 73%
5757

5858

59-
========================== 30 passed in 7.22 seconds ===========================
59+
========================== 30 passed in 7.51 seconds ===========================

data_algebra/data_ops.py

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,26 @@ def get_column_symbols(self):
131131

132132
# characterization
133133

134-
def get_tables(self, tables=None):
134+
def get_tables(self, *, replacements=None):
135135
"""Get a dictionary of all tables used in an operator DAG,
136136
raise an exception if the values are not consistent."""
137-
if tables is None:
138-
tables = {}
139-
for s in self.sources:
140-
tables = s.get_tables(tables)
137+
tables = {}
138+
for i in range(len(self.sources)):
139+
s = self.sources[i]
140+
if isinstance(s, TableDescription):
141+
if replacements is not None and s.key in replacements:
142+
orig_table = replacements[s.key]
143+
if s.column_set != orig_table.column_set:
144+
raise ValueError("table " + s.key + " has two incompatible definitions")
145+
self.sources[i] = orig_table
146+
s = orig_table
147+
ti = s.get_tables(replacements=replacements)
148+
for (k, v) in ti.items():
149+
if k in tables.keys():
150+
if not tables[k] is v:
151+
raise ValueError("Table " + k + " has two different representation objects")
152+
else:
153+
tables[k] = v
141154
return tables
142155

143156
def columns_used_from_sources(self, using=None):
@@ -533,22 +546,12 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
533546
s = s + ")"
534547
return s
535548

536-
def get_tables(self, tables=None):
549+
def get_tables(self, *, replacements=None):
537550
"""get a dictionary of all tables used in an operator DAG,
538551
raise an exception if the values are not consistent"""
539-
if tables is None:
540-
tables = {}
541-
if self.key in tables.keys():
542-
other = tables[self.key]
543-
if self.column_set != other.column_set:
544-
raise ValueError(
545-
"Two tables with key " + self.key + " have different column sets."
546-
)
547-
if other is not self:
548-
raise ValueError("Two different table definitions for table: " + self.key)
549-
else:
550-
tables[self.key] = self
551-
return tables
552+
if replacements is not None and self.key in replacements.keys():
553+
return {self.key: replacements[self.key]}
554+
return {self.key: self}
552555

553556
def eval_implementation(self, *, data_map, eval_env, data_model):
554557
return data_model.table_step(op=self, data_map=data_map, eval_env=eval_env)
@@ -1115,7 +1118,7 @@ class NaturalJoinNode(ViewRepresentation):
11151118

11161119
def __init__(self, a, b, *, by=None, jointype="INNER"):
11171120
a_tables = a.get_tables()
1118-
b_tables = b.get_tables()
1121+
b_tables = b.get_tables(replacements=a_tables)
11191122
common_keys = set(a_tables.keys()).intersection(b_tables.keys())
11201123
for k in common_keys:
11211124
if a_tables[k] is not b_tables[k]:
@@ -1195,7 +1198,6 @@ def eval_implementation(self, *, data_map, eval_env, data_model):
11951198

11961199

11971200
class ConvertRecordsNode(ViewRepresentation):
1198-
blocks_out_table: TableDescription
11991201

12001202
def __init__(self, source, record_map, *, blocks_out_table=None):
12011203
sources = [source]
@@ -1206,24 +1208,28 @@ def __init__(self, source, record_map, *, blocks_out_table=None):
12061208
+ [c for c in record_map.blocks_out.control_table.columns],
12071209
)
12081210
if blocks_out_table is not None:
1209-
sources = sources + [blocks_out_table]
12101211
# check blocks_out_table is a direct table
12111212
if not isinstance(blocks_out_table, TableDescription):
12121213
raise TypeError("expected blocks_out_table to be a data_algebra.data_ops.TableDescription")
1213-
# check it is the exact same definition object if already present
1214+
# ensure table is the exact same definition object if already present
12141215
a_tables = source.get_tables()
12151216
if blocks_out_table.key in a_tables.keys():
12161217
a_table = a_tables[blocks_out_table.key]
1218+
if not a_table.column_set == blocks_out_table.column_set:
1219+
raise ValueError("blocks_out_table column definition does not match table already in op DAG")
12171220
if not blocks_out_table is a_table:
1218-
raise ValueError("different definiton object for: " + blocks_out_table.key)
1221+
blocks_out_table = a_table
1222+
# check blocks_out_table is a direct table
1223+
if not isinstance(blocks_out_table, TableDescription):
1224+
raise TypeError("expected blocks_out_table to be a data_algebra.data_ops.TableDescription")
12191225
# check it has at least the columns we expect
12201226
expect = [c for c in record_map.blocks_out.record_keys] + \
12211227
[c for c in record_map.blocks_out.control_table.columns]
12221228
unknown = set(expect) - set(blocks_out_table.column_names)
12231229
if len(unknown) > 0:
12241230
raise ValueError("blocks_out_table missing columns: " + str(unknown))
1231+
sources = sources + [blocks_out_table]
12251232
self.record_map = record_map
1226-
self.blocks_out_table = blocks_out_table
12271233
unknown = set(self.record_map.columns_needed) - set(source.column_names)
12281234
if len(unknown) > 0:
12291235
raise ValueError("missing required columns: " + str(unknown))
@@ -1244,8 +1250,11 @@ def collect_representation_implementation(self, *, pipeline=None, dialect="Pytho
12441250
od["op"] = "ConvertRecords"
12451251
od["record_map"] = self.record_map.to_simple_obj()
12461252
od['blocks_out_table'] = None
1247-
if self.blocks_out_table is not None:
1248-
od['blocks_out_table'] = self.blocks_out_table.collect_representation(dialect=dialect)[0]
1253+
blocks_out_table = None
1254+
if len(self.sources) > 1:
1255+
blocks_out_table = self.sources[1]
1256+
if blocks_out_table is not None:
1257+
od['blocks_out_table'] = blocks_out_table.collect_representation(dialect=dialect)[0]
12491258
pipeline.insert(0, od)
12501259
return self.sources[0].collect_representation_implementation(
12511260
pipeline=pipeline, dialect=dialect
@@ -1261,10 +1270,13 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
12611270
)
12621271
rm_str = self.record_map.__repr__()
12631272
rm_str = re.sub("\n", "\n ", rm_str)
1264-
s = s + ("convert_record(" + rm_str +
1273+
s = s + "convert_record(" + rm_str
1274+
if len(self.sources) > 1:
1275+
s = s + (
12651276
"\n, blocks_out_table=" +
1266-
self.blocks_out_table.to_python_implementation(indent=indent+3, strict=strict) +
1267-
")")
1277+
self.sources[1].to_python_implementation(indent=indent+3, strict=strict)
1278+
)
1279+
s = s + ")"
12681280
return s
12691281

12701282
def to_sql_implementation(self, db_model, *, using, temp_id_source):
@@ -1279,7 +1291,7 @@ def to_sql_implementation(self, db_model, *, using, temp_id_source):
12791291
res = db_model.row_recs_to_blocks_query(
12801292
res,
12811293
record_spec=self.record_map.blocks_out,
1282-
record_view=self.blocks_out_table,
1294+
record_view=self.sources[1],
12831295
)
12841296
return res
12851297

data_algebra/db_model.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,8 @@ def table_def_to_sql(self, table_def, *, using=None, force_sql=False):
192192
missing = using - table_def.column_set
193193
if len(missing) > 0:
194194
raise KeyError("referred to unknown columns: " + str(missing))
195-
cols = [self.quote_identifier(ci) for ci in using]
195+
cols_using = [c for c in table_def.column_names if c in using]
196+
cols = [self.quote_identifier(ci) for ci in cols_using]
196197
sql_str = (
197198
"SELECT "
198199
+ ", ".join(cols)
@@ -252,7 +253,8 @@ def extend_to_sql(self, extend_node, *, using=None, temp_id_source=None):
252253
]
253254
origcols = [k for k in using if k not in subops.keys()]
254255
if len(origcols) > 0:
255-
derived = [self.quote_identifier(ci) for ci in set(origcols)] + derived
256+
ordered_orig = [c for c in extend_node.column_names if c in set(origcols)]
257+
derived = [self.quote_identifier(ci) for ci in ordered_orig] + derived
256258
sql_str = (
257259
"SELECT "
258260
+ ", ".join(derived)

data_algebra/pandas_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def table_step(self, op, *, data_map, eval_env):
3939
# check all columns we expect are present
4040
columns_using = op.column_names
4141
if op.columns_currently_used is not None and len(op.columns_currently_used) > 0:
42-
columns_using = [c for c in op.columns_currently_used]
42+
columns_using = [c for c in columns_using if c in op.columns_currently_used]
4343
missing = set(columns_using) - set([c for c in df.columns])
4444
if len(missing) > 0:
4545
raise ValueError("missing required columns: " + str(missing))
191 Bytes
Binary file not shown.

dist/data_algebra-0.2.1.tar.gz

191 Bytes
Binary file not shown.

0 commit comments

Comments (0)