Commit 1eb77cf

rebuild
1 parent 43e1bbb commit 1eb77cf

15 files changed: +831 -1104 lines changed


Examples/LogisticExample/ScoringExample.ipynb

Lines changed: 63 additions & 485 deletions
Large diffs are not rendered by default.

Examples/Simplification/query_simplification.ipynb

Lines changed: 34 additions & 83 deletions
@@ -2,21 +2,18 @@
  "cells": [
  {
  "cell_type": "markdown",
- "metadata": {
- "collapsed": true,
- "pycharm": {
- "name": "#%% md\n"
- }
- },
  "source": [
  "[`data_algebra`](https://github.com/WinVector/data_algebra) version of this [`rquery` example](http://www.win-vector.com/blog/2019/12/what-is-new-for-rquery-december-2019/).\n",
  "\n",
  "First lets import our modules and set up our operator pipeline."
- ]
+ ],
+ "metadata": {
+ "collapsed": false
+ }
  },
  {
  "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 1,
  "outputs": [
  {
  "name": "stdout",
@@ -28,7 +25,7 @@
  " extend({\n",
  " 'sum23': 'col2 + col3',\n",
  " 'x': '5'}) .\\\n",
- " select_columns(['x', 'sum23'])\n"
+ " select_columns(['x', 'sum23', 'col3'])\n"
  ],
  "output_type": "stream"
  }
@@ -63,7 +60,7 @@
  " extend({\n",
  " 'x': 5\n",
  " }). \\\n",
- " select_columns(['x', 'sum23'])\n",
+ " select_columns(['x', 'sum23', 'col3'])\n",
  "\n",
  "\n",
  "print(ops)\n"
@@ -85,24 +82,21 @@
  "These operations can be applied to data."
  ],
  "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
+ "collapsed": false
  }
  },
  {
  "cell_type": "code",
- "execution_count": 16,
+ "execution_count": 2,
  "outputs": [
  {
  "data": {
- "text/plain": " x sum23\n0 5 7\n1 5 9",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>x</th>\n <th>sum23</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>5</td>\n <td>7</td>\n </tr>\n <tr>\n <th>1</th>\n <td>5</td>\n <td>9</td>\n </tr>\n </tbody>\n</table>\n</div>"
+ "text/plain": " x sum23 col3\n0 5 7 4\n1 5 9 5",
+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>x</th>\n <th>sum23</th>\n <th>col3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>5</td>\n <td>7</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>5</td>\n <td>9</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n</div>"
  },
  "metadata": {},
  "output_type": "execute_result",
- "execution_count": 16
+ "execution_count": 2
  }
  ],
  "source": [
@@ -125,7 +119,7 @@
  {
  "cell_type": "markdown",
  "source": [
- "We are working on adaptors for near-`Pandas` systems such as `modin` and others.\n",
+ "We are working on adapters for near-`Pandas` systems such as `modin` and others.\n",
  "\n",
  "We can also convert the query into `SQL` query."
  ],
@@ -135,64 +129,15 @@
  },
  {
  "cell_type": "code",
- "execution_count": 17,
+ "execution_count": 3,
  "outputs": [
  {
  "name": "stdout",
  "text": [
- "SELECT \"sum23\",\n",
- " \"x\"\n",
- "FROM\n",
- " (SELECT \"col2\" + \"col3\" AS \"sum23\",\n",
- " 5 AS \"x\"\n",
- " FROM\n",
- " (SELECT \"col2\",\n",
- " \"col3\"\n",
- " FROM \"d\") \"sq_0\") \"sq_1\"\n"
- ],
- "output_type": "stream"
- }
- ],
- "source": [
- "pg_model = data_algebra.PostgreSQL.PostgreSQLModel()\n",
- "\n",
- "print(ops.to_sql(db_model=pg_model, pretty=True))"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n",
- "is_executing": false
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "The excess inner query is working around the issue that the `PostgresSQL` `SQL` dialect does not accept table names in parenthesis in some situations.\n",
- "\n",
- "When we do not have such a constraint (such as with `SQLite`) we can generate a shorter query. \n"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "outputs": [
- {
- "name": "stdout",
- "text": [
- "SELECT \"sum23\",\n",
- " \"x\"\n",
- "FROM\n",
- " (SELECT \"col2\" + \"col3\" AS \"sum23\",\n",
- " 5 AS \"x\"\n",
- " FROM (\"d\") \"SQ_0\") \"SQ_1\"\n"
+ "SELECT 5 AS \"x\",\n",
+ " \"col2\" + \"col3\" AS \"sum23\",\n",
+ " \"col3\"\n",
+ "FROM \"d\"\n"
  ],
  "output_type": "stream"
  }
@@ -213,33 +158,39 @@
  {
  "cell_type": "markdown",
  "source": [
- "One per-`SQL` dialect translations and affordances is one of the intents of the `data_algebra`.\n",
+ "Notice this query is fairly compact. `data_algebra` optimizations do not combine steps with different concerns, but they do have some nice features:\n",
+ "\n",
+ " * Queries are shortened: some steps that are not used are not preserved.\n",
+ " * Queries are narrowed: values not used in the result are not brought through intermediate queries.\n",
+ " * Non-terminal row-orders are thrown away (as they are not semantic in many data-stores).\n",
+ " * `select_column()` steps are implicit, change other steps but not translated as explicit queries.\n",
+ " * Tables are used by name when deeper in queries.\n",
+ " \n",
+ "This make for tighter query generation than the current version of [`rquery`](https://github.com/WinVector/rquery/) (which [itself one of the best query generators in `R`](http://www.win-vector.com/blog/2019/12/what-is-new-for-rquery-december-2019/)).\n",
  "\n",
  "And we can easily demonstrate the query in action."
  ],
  "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
+ "collapsed": false
  }
  },
  {
  "cell_type": "code",
- "execution_count": 19,
+ "execution_count": 4,
  "outputs": [
  {
  "data": {
- "text/plain": " sum23 x\n0 7 5\n1 9 5",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>sum23</th>\n <th>x</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>7</td>\n <td>5</td>\n </tr>\n <tr>\n <th>1</th>\n <td>9</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n</div>"
+ "text/plain": " x sum23 col3\n0 5 7 4\n1 5 9 5",
+ "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>x</th>\n <th>sum23</th>\n <th>col3</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>5</td>\n <td>7</td>\n <td>4</td>\n </tr>\n <tr>\n <th>1</th>\n <td>5</td>\n <td>9</td>\n <td>5</td>\n </tr>\n </tbody>\n</table>\n</div>"
  },
  "metadata": {},
  "output_type": "execute_result",
- "execution_count": 19
+ "execution_count": 4
  }
  ],
  "source": [
  "conn = sqlite3.connect(':memory:')\n",
+ "sql_model.prepare_connection(conn)\n",
  "sql_model.insert_table(conn, d, table_name='d')\n",
  "\n",
  "conn.execute('CREATE TABLE res AS ' + ops.to_sql(db_model=sql_model))\n",
@@ -255,7 +206,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 20,
+ "execution_count": 5,
  "outputs": [],
  "source": [
  "conn.close()\n"

build/lib/data_algebra/PostgreSQL.py

Lines changed: 0 additions & 5 deletions
@@ -31,8 +31,3 @@ def build_qualified_table_name(self, table_name, *, qualifiers=None):
         if "schema" in qualifiers.keys():
             qt = self.quote_identifier(qualifiers["schema"]) + "." + qt
         return qt
-
-    def table_def_to_sql(self, table_def, *, using=None, force_sql=False):
-        return super().table_def_to_sql(
-            table_def=table_def, using=using, force_sql=True
-        )
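The context lines retained in this hunk show the schema-qualification pattern the dialect models share: quote the table name, and prefix a quoted schema when one is supplied. A minimal standalone sketch of that logic, with `quote_identifier` written here as a stand-in for the db-model method (assuming double-quote doubling for escapes):

```python
def quote_identifier(name):
    # Stand-in for the db-model's quote_identifier: double-quote the name,
    # escaping embedded double quotes by doubling them.
    return '"' + name.replace('"', '""') + '"'


def build_qualified_table_name(table_name, qualifiers=None):
    # Quote the bare table name, then prefix a quoted schema if present,
    # mirroring the retained context lines above.
    qt = quote_identifier(table_name)
    if qualifiers is not None and "schema" in qualifiers:
        qt = quote_identifier(qualifiers["schema"]) + "." + qt
    return qt


print(build_qualified_table_name("d"))                      # "d"
print(build_qualified_table_name("d", {"schema": "public"}))  # "public"."d"
```

The deleted `table_def_to_sql` override (which forced `force_sql=True`) becomes unnecessary once the base class handles table rendering, as the `data_ops.py` changes below in this commit arrange.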

build/lib/data_algebra/SparkSQL.py

Lines changed: 0 additions & 5 deletions
@@ -31,8 +31,3 @@ def build_qualified_table_name(self, table_name, *, qualifiers=None):
         if "schema" in qualifiers.keys():
             qt = self.quote_identifier(qualifiers["schema"]) + "." + qt
         return qt
-
-    def table_def_to_sql(self, table_def, *, using=None, force_sql=False):
-        return super().table_def_to_sql(
-            table_def=table_def, using=using, force_sql=True
-        )
)

build/lib/data_algebra/arrow.py

Lines changed: 5 additions & 2 deletions
@@ -66,7 +66,10 @@ def __init__(self, pipeline, *, free_table_key=None, strict=False):
         self.outgoing_columns = pipeline.column_names.copy()
         self.outgoing_columns.sort()
         self.outgoing_types = None
-        if isinstance(pipeline, data_algebra.data_ops.TableDescription) and self.incoming_types is not None:
+        if (
+            isinstance(pipeline, data_algebra.data_ops.TableDescription)
+            and self.incoming_types is not None
+        ):
             self.outgoing_types = self.incoming_types.copy()
         Arrow.__init__(self)

@@ -104,7 +107,7 @@ def apply_to(self, b):
         new_pipeline = self.pipeline.apply_to(
             b.pipeline, target_table_key=self.free_table_key
         )
-        new_pipeline.get_tables() # check tables are compatible
+        new_pipeline.get_tables()  # check tables are compatible
         res = DataOpArrow(pipeline=new_pipeline, free_table_key=b.free_table_key)
         res.incoming_types = b.incoming_types
         res.outgoing_types = self.outgoing_types

build/lib/data_algebra/data_ops.py

Lines changed: 29 additions & 28 deletions
@@ -13,6 +13,7 @@
 import data_algebra.env
 from data_algebra.data_ops_types import *
 import data_algebra.data_ops_utils
+import data_algebra.near_sql

 _have_black = False
 try:
@@ -234,6 +235,9 @@ def to_sql(self, db_model, *, pretty=False, encoding=None, sqlparse_options=None
         sql_str = self.to_sql_implementation(
             db_model=db_model, using=None, temp_id_source=temp_id_source
         )
+        if isinstance(sql_str, str):
+            print("break")
+        sql_str = sql_str.to_sql(db_model=db_model, force_sql=True)
         if pretty and _have_sqlparse:
             try:
                 sql_str = sqlparse.format(
@@ -664,22 +668,10 @@ def eval_implementation(self, *, data_map, eval_env, data_model):
     def columns_used_from_sources(self, using=None):
         return []  # no inputs to table description

-    def to_sql(self, db_model, *, pretty=False, encoding=None, sqlparse_options=None):
-        if sqlparse_options is None:
-            sqlparse_options = {"reindent": True, "keyword_case": "upper"}
-        self.columns_used()  # for table consistency check/raise
-        temp_id_source = [0]
-        sql_str = self.to_sql_implementation(
-            db_model=db_model, using=None, temp_id_source=temp_id_source, force_sql=True
+    def to_sql_implementation(self, db_model, *, using, temp_id_source):
+        return db_model.table_def_to_sql(
+            self, using=using, temp_id_source=temp_id_source
         )
-        if pretty and _have_sqlparse:
-            sql_str = sqlparse.format(sql_str, encoding=encoding, **sqlparse_options)
-        return sql_str
-
-    def to_sql_implementation(
-        self, db_model, *, using, temp_id_source, force_sql=False
-    ):
-        return db_model.table_def_to_sql(self, using=using, force_sql=force_sql)

     # comparable to other table descriptions
     def __lt__(self, other):
@@ -768,7 +760,7 @@ def apply_to(self, a, *, target_table_key=None):
         data_map, a = self._reach_in(a)
         return WrappedOperatorPlatform(
             underlying=self.underlying.apply_to(a, target_table_key=target_table_key),
-            data_map=data_map
+            data_map=data_map,
         )

     # noinspection PyPep8Naming
@@ -1200,7 +1192,9 @@ def apply_to(self, a, *, target_table_key=None):
         new_sources = [
             s.apply_to(a, target_table_key=target_table_key) for s in self.sources
         ]
-        return new_sources[0].project_parsed(parsed_ops=self.ops, group_by=self.group_by)
+        return new_sources[0].project_parsed(
+            parsed_ops=self.ops, group_by=self.group_by
+        )

     def _equiv_nodes(self, other):
         if not self.group_by == other.group_by:
@@ -1517,9 +1511,7 @@ def apply_to(self, a, *, target_table_key=None):
             s.apply_to(a, target_table_key=target_table_key) for s in self.sources
         ]
         return new_sources[0].order_rows(
-            columns=self.order_columns,
-            reverse=self.reverse,
-            limit=self.limit,
+            columns=self.order_columns, reverse=self.reverse, limit=self.limit
        )

     def _equiv_nodes(self, other):
@@ -1948,8 +1940,7 @@ def apply_to(self, a, *, target_table_key=None):
             s.apply_to(a, target_table_key=target_table_key) for s in self.sources
         ]
         return new_sources[0].convert_records(
-            record_map=self.record_map,
-            blocks_out_table=self.blocks_out_table,
+            record_map=self.record_map, blocks_out_table=self.blocks_out_table
         )

     def _equiv_nodes(self, other):
@@ -2007,18 +1998,28 @@ def to_python_implementation(self, *, indent=0, strict=True, print_sources=True)
         return s

     def to_sql_implementation(self, db_model, *, using, temp_id_source):
-        res = self.sources[0].to_sql_implementation(
+        sub_query = self.sources[0].to_sql_implementation(
             db_model=db_model, using=using, temp_id_source=temp_id_source
         )
+        query = sub_query.to_sql(columns=using, db_model=db_model)
         if self.record_map.blocks_in is not None:
-            res = db_model.blocks_to_row_recs_query(
-                res, record_spec=self.record_map.blocks_in
+            query = db_model.blocks_to_row_recs_query(
+                query, record_spec=self.record_map.blocks_in
            )
         if self.record_map.blocks_out is not None:
-            res = db_model.row_recs_to_blocks_query(
-                res, record_spec=self.record_map.blocks_out, record_view=self.sources[1]
+            query = db_model.row_recs_to_blocks_query(
+                query,
+                record_spec=self.record_map.blocks_out,
+                record_view=self.sources[1],
             )
-        return res
+        if temp_id_source is None:
+            temp_id_source = [0]
+        view_name = "convert_records_" + str(temp_id_source[0])
+        temp_id_source[0] = temp_id_source[0] + 1
+        near_sql = data_algebra.near_sql.NearSQLq(
+            quoted_query_name=db_model.quote_identifier(view_name), query=query
+        )
+        return near_sql

     def eval_implementation(self, *, data_map, eval_env, data_model):
         if data_model is None:
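The change above moves SQL generation from returning raw SQL strings to returning named "near SQL" objects that render on demand (`to_sql_implementation` now hands back a `NearSQLq` carrying a generated view name like `convert_records_0`). Here is a minimal sketch of that pattern under assumed interfaces; only part of `NearSQLq`'s real signature is visible in this diff, so the class and method names below are illustrative, not the `data_algebra` API:

```python
class NearSQLSketch:
    """Sketch: hold a finished sub-query plus a quoted name for composition.

    Illustrative stand-in for the NearSQLq object introduced in this commit;
    not the real data_algebra.near_sql interface.
    """

    def __init__(self, quoted_query_name, query):
        self.quoted_query_name = quoted_query_name
        self.query = query

    def to_sql(self):
        # Terminal rendering: emit the stored query text. The real method
        # also takes a db_model and a force_sql flag, per the diff.
        return self.query

    def as_named_subquery(self):
        # Non-terminal rendering: wrap the query so an enclosing SELECT can
        # refer to this step by its generated view name.
        return "( " + self.query + " ) " + self.quoted_query_name


step = NearSQLSketch('"convert_records_0"', 'SELECT "col2", "col3" FROM "d"')
print(step.to_sql())
print("SELECT * FROM " + step.as_named_subquery())
```

Deferring the string-building this way lets the outer `to_sql` decide whether a step becomes a nested sub-query or is referenced by name, which is what enables the flattened queries shown in the notebook diff above.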

0 commit comments