work on composition

JohnMount · JohnMount · commit c0295cd95994 · 2019-09-28T23:43:28.000-07:00
diff --git a/Examples/LogisticExample/Logistic2.ipynb b/Examples/LogisticExample/Logistic2.ipynb
@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "is_executing": false
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": [
+      "TableDescription(table_name='d', column_names=['subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1', 'irrelevantCol2']) .\\\n   extend({'probability': '(assessmentTotal * 0.237).exp()'}) .\\\n   extend({'total': 'probability.sum()'}, partition_by=['subjectID']) .\\\n   extend({'probability': 'probability / total'}) .\\\n   extend({'sort_key': '-probability'}) .\\\n   extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['sort_key']) .\\\n   select_rows('row_number == 1') .\\\n   select_columns(['subjectID', 'surveyCategory', 'probability']) .\\\n   rename_columns({'diagnosis': 'surveyCategory'})\n"
+     ],
+     "output_type": "stream"
+    }
+   ],
+   "source": [
+    "import pandas\n",
+    "\n",
+    "from data_algebra.data_ops import *  # https://github.com/WinVector/data_algebra\n",
+    "from data_algebra.data_pipe import Locum\n",
+    "\n",
+    "d_local = pandas.DataFrame({\n",
+    "    'subjectID':[1, 1, 2, 2],\n",
+    "    'surveyCategory': [ \"withdrawal behavior\", \"positive re-framing\", \"withdrawal behavior\", \"positive re-framing\"],\n",
+    "    'assessmentTotal': [5, 2, 3, 4],\n",
+    "    'irrelevantCol1': ['irrel1']*4,\n",
+    "    'irrelevantCol2': ['irrel2']*4,\n",
+    "})\n",
+    "\n",
+    "scale = 0.237\n",
+    "\n",
+    "with data_algebra.env.Env(locals()) as env:\n",
+    "    ops = data_algebra.data_ops.describe_table(d_local, 'd'). \\\n",
+    "        extend({'probability': '(assessmentTotal * scale).exp()'}). \\\n",
+    "        extend({'total': 'probability.sum()'},\n",
+    "               partition_by='subjectID'). \\\n",
+    "        extend({'probability': 'probability/total'}). \\\n",
+    "        extend({'sort_key': '-probability'}). \\\n",
+    "        extend({'row_number': '_row_number()'},\n",
+    "               partition_by=['subjectID'],\n",
+    "               order_by=['sort_key']). \\\n",
+    "        select_rows('row_number == 1'). \\\n",
+    "        select_columns(['subjectID', 'surveyCategory', 'probability']). \\\n",
+    "        rename_columns({'diagnosis': 'surveyCategory'})\n",
+    "\n",
+    "print(ops)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": [
+      "TableDescription(table_name='d', column_names=['subjectID', 'surveyCategory', 'assessmentTotal', 'irrelevantCol1', 'irrelevantCol2']) .\\\n   extend({'probability': '(assessmentTotal * 0.237).exp()'}) .\\\n   extend({'total': 'probability.sum()'}, partition_by=['subjectID']) .\\\n   extend({'probability': 'probability / total'}) .\\\n   extend({'sort_key': '-probability'}) .\\\n   extend({'row_number': '_row_number()'}, partition_by=['subjectID'], order_by=['sort_key']) .\\\n   select_rows('row_number == 1') .\\\n   select_columns(['subjectID', 'surveyCategory', 'probability']) .\\\n   rename_columns({'diagnosis': 'surveyCategory'})\n"
+     ],
+     "output_type": "stream"
+    }
+   ],
+   "source": [
+    "prob_calculation = Locum(). \\\n",
+    "    extend({'probability': '(assessmentTotal * 0.237).exp()'}). \\\n",
+    "    extend({'total': 'probability.sum()'},\n",
+    "           partition_by='subjectID'). \\\n",
+    "    extend({'probability': 'probability/total'})\n",
+    "\n",
+    "top_rank = Locum(). \\\n",
+    "    extend({'sort_key': '-probability'}). \\\n",
+    "    extend({'row_number': '_row_number()'},\n",
+    "           partition_by=['subjectID'],\n",
+    "           order_by=['sort_key']). \\\n",
+    "    select_rows('row_number == 1')\n",
+    "\n",
+    "clean_up_columns = Locum(). \\\n",
+    "    select_columns(['subjectID', 'surveyCategory', 'probability']). \\\n",
+    "    rename_columns({'diagnosis': 'surveyCategory'})\n",
+    "\n",
+    "ops =  data_algebra.data_ops.describe_table(d_local, 'd') +\\\n",
+    "    prob_calculation +\\\n",
+    "    top_rank +\\\n",
+    "    clean_up_columns\n",
+    "\n",
+    "print(ops)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "   subjectID            diagnosis  probability\n0          1  withdrawal behavior     0.670622\n1          2  positive re-framing     0.558974",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>subjectID</th>\n      <th>diagnosis</th>\n      <th>probability</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>withdrawal behavior</td>\n      <td>0.670622</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>positive re-framing</td>\n      <td>0.558974</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "metadata": {},
+     "output_type": "execute_result",
+     "execution_count": 3
+    }
+   ],
+   "source": [
+    "d_local >> ops\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/data_algebra/data_pipe.py b/data_algebra/data_pipe.py
@@ -213,32 +213,47 @@ def __init__(self):
         data_algebra.data_ops.OperatorPlatform.__init__(self)
         self.ops = []
 
-    # noinspection PyPep8Naming
-    def realize(self, X):
-        pipeline = data_algebra.data_ops.describe_table(X, table_name="X")
+    def apply_to(self, pipeline):
+        if not isinstance(pipeline, data_algebra.data_ops.OperatorPlatform):
+            raise TypeError("Expected othter to be a data_algebra.data_ops.OperatorPlatform")
         for s in self.ops:
             # pipeline = pipeline >> s
             pipeline = s.apply(pipeline)
         return pipeline
 
+    def append(self, other):
+        if isinstance(other, Locum):
+            for o in other.ops:
+                self.ops.append(o)
+        elif isinstance(other, data_algebra.pipe.PipeStep):
+            self.ops.append(other)
+        else:
+            raise TypeError("unexpeted type for Locum + " + str(type(other)))
+        return self
+
+    def realize(self, x):
+        pipeline = data_algebra.data_ops.describe_table(x, table_name="x")
+        return self.apply_to(pipeline)
+
     # noinspection PyPep8Naming
     def transform(self, X):
+        if isinstance(X, data_algebra.data_ops.OperatorPlatform):  # X is itself an operator pipeline: compose onto it
+            return self.apply_to(X)
         pipeline = self.realize(X)
         return pipeline.transform(X)
 
     def __rrshift__(self, other):  # override other >> self
         return self.transform(other)
 
-    def __add__(self, other):
-        if not isinstance(other, Locum):
-            raise TypeError("Expected other to be of type data_algebra.data_pipe.Locum")
+    def __add__(self, other):  # override self + other: returns a new Locum; self and other are left unmodified
         res = Locum()
-        for o in self.ops:
-            res.ops.append(o)
-        for o in other.ops:
-            res.ops.append(o)
+        res.append(self)
+        res.append(other)
         return res
 
+    def __radd__(self, other):  # override other + self: apply stored steps on top of other (apply_to raises TypeError unless other is an OperatorPlatform)
+        return self.apply_to(other)
+
     # print
 
     def __repr__(self):