work on example

JohnMount · JohnMount · commit 7d6c2ad0a5bf · 2022-01-16T13:23:48.000-08:00
diff --git a/Examples/Methods/MethodWarnings.ipynb b/Examples/Methods/MethodWarnings.ipynb
@@ -28,6 +28,7 @@
    },
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "import pandas as pd\n",
     "from data_algebra.data_ops import descr\n",
     "import data_algebra.test_util\n",
@@ -153,6 +154,34 @@
     }
    }
   },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We can check that this matches expectations."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "outputs": [],
+   "source": [
+    "for group in set(d['g']):\n",
+    "    assert np.all(pandas_res.loc[pandas_res['g'] == group, 'xm']\n",
+    "                    == np.median(d.loc[d['g'] == group, 'x']))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
   {
    "cell_type": "markdown",
    "source": [
@@ -171,13 +200,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "outputs": [
     {
      "data": {
       "text/plain": "(TableDescription(table_name=\"d\", column_names=[\"id\", \"x\", \"g\"]))"
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -209,7 +238,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "outputs": [],
    "source": [
     "bigquery_sql = bigquery_handle.to_sql(ops)\n"
@@ -235,14 +264,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "outputs": [
     {
      "data": {
-      "text/plain": "   g  id     x    xm\n0  a   0   4.0   3.0\n1  b   1  50.0  26.1\n2  a   2   1.0   3.0\n3  a   3   3.0   3.0\n4  b   4   2.2  26.1",
-      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>g</th>\n      <th>id</th>\n      <th>x</th>\n      <th>xm</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>a</td>\n      <td>0</td>\n      <td>4.0</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>b</td>\n      <td>1</td>\n      <td>50.0</td>\n      <td>26.1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>a</td>\n      <td>2</td>\n      <td>1.0</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>a</td>\n      <td>3</td>\n      <td>3.0</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>b</td>\n      <td>4</td>\n      <td>2.2</td>\n      <td>26.1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+      "text/plain": "   id     x  g    xm\n0   0   4.0  a   3.0\n1   1  50.0  b  26.1\n2   2   1.0  a   3.0\n3   3   3.0  a   3.0\n4   4   2.2  b  26.1",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>x</th>\n      <th>g</th>\n      <th>xm</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>4.0</td>\n      <td>a</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>50.0</td>\n      <td>b</td>\n      <td>26.1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>1.0</td>\n      <td>a</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>3.0</td>\n      <td>a</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>2.2</td>\n      <td>b</td>\n      <td>26.1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -262,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "outputs": [],
    "source": [
     "assert data_algebra.test_util.equivalent_frames(pandas_res, db_res)"
@@ -290,7 +319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "outputs": [],
    "source": [
     "ops_p = (\n",
@@ -311,7 +340,7 @@
   {
    "cell_type": "markdown",
    "source": [
-    "This pipeline works as follows."
+    "This pipeline works in Pandas as follows."
    ],
    "metadata": {
     "collapsed": false,
@@ -322,14 +351,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "outputs": [
     {
      "data": {
       "text/plain": "   g    xm\n0  a   3.0\n1  b  26.1",
       "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>g</th>\n      <th>xm</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>a</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>b</td>\n      <td>26.1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
      },
-     "execution_count": 10,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -349,7 +378,7 @@
   {
    "cell_type": "markdown",
    "source": [
-    "But we get a warning if we attempt to convert this to BigQuery SQL."
+    "And we again see the expected results."
    ],
    "metadata": {
     "collapsed": false,
@@ -360,13 +389,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
+   "outputs": [],
+   "source": [
+    "for group in set(d['g']):\n",
+    "    assert (pandas_res_p.loc[pandas_res_p['g'] == group, 'xm'].values[0]\n",
+    "                    == np.median(d.loc[d['g'] == group, 'x']))\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "However, we get a warning if we attempt to convert this to BigQuery SQL."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/johnmount/Documents/work/data_algebra/data_algebra/db_model.py:1692: UserWarning: BigQueryModel translation doesn't fully support method context: [MethodUse(op_name='median', is_project=True, is_windowed=False, is_ordered=False)]\n",
+      "/Users/johnmount/Documents/work/data_algebra/data_algebra/db_model.py:1694: UserWarning: BigQueryModel translation doesn't fully support method context: [MethodUse(op_name='median', is_project=True, is_windowed=False, is_ordered=False)]\n",
       "  warnings.warn(f\"{self} translation doesn't fully support method context: {non_rec}\", UserWarning)\n"
      ]
     }
@@ -396,15 +453,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "caught: 400 percentile_cont aggregate function is not supported.\n",
       "\n",
-      "(job ID: 0f2cd57d-9e20-4366-a95c-109997b8c75f)\n",
+      "(job ID: f5d44be0-65d1-4860-b985-ca427b38a83b)\n",
       "\n",
       "                 -----Query Job SQL Follows-----                  \n",
       "\n",
@@ -416,8 +473,8 @@
       "   5:WITH\n",
       "   6: `table_reference_0` AS (\n",
       "   7:  SELECT\n",
-      "   8:   `g` ,\n",
-      "   9:   `x`\n",
+      "   8:   `x` ,\n",
+      "   9:   `g`\n",
       "  10:  FROM\n",
       "  11:   `data-algebra-test.test_1.d`\n",
       "  12: )\n",
@@ -463,7 +520,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 15,
    "outputs": [
     {
      "name": "stdout",
@@ -476,8 +533,8 @@
       "WITH\n",
       " `table_reference_0` AS (\n",
       "  SELECT\n",
-      "   `g` ,\n",
-      "   `x`\n",
+      "   `x` ,\n",
+      "   `g`\n",
       "  FROM\n",
       "   `data-algebra-test.test_1.d`\n",
       " )\n",
@@ -516,7 +573,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "outputs": [
     {
      "name": "stdout",
@@ -529,9 +586,9 @@
       "WITH\n",
       " `extend_0` AS (\n",
       "  SELECT  -- .extend({ 'xm': 'x.median()'}, partition_by=['g'])\n",
-      "   `g` ,\n",
       "   `id` ,\n",
       "   `x` ,\n",
+      "   `g` ,\n",
       "   PERCENTILE_CONT(`x`, 0.5) OVER ( PARTITION BY `g`  )  AS `xm`\n",
       "  FROM\n",
       "   `data-algebra-test.test_1.d`\n",
@@ -559,7 +616,7 @@
   {
    "cell_type": "markdown",
    "source": [
-    "The above failure can come as a surprise. But the new feature of the data algebra is: the \"translate to SQL\" step warned we had a potential problem. This doesn't even require a full database handle, it is data incorporated into the database model during package assembly.\n",
+    "Given how similar the two SQL queries are, the above failure can come as a surprise. But a new feature of the data algebra is that the \"translate to SQL\" step warns us that we have a potential problem. This doesn't even require a full database handle; the information is incorporated into the database model during package assembly.\n",
     "\n",
     "## Patching The Solution\n",
     "\n",
@@ -574,14 +631,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 17,
    "outputs": [
     {
      "data": {
       "text/plain": "     xm  g\n0   3.0  a\n1  26.1  b",
       "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>xm</th>\n      <th>g</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>3.0</td>\n      <td>a</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>26.1</td>\n      <td>b</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
      },
-     "execution_count": 15,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -608,7 +665,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 18,
    "outputs": [],
    "source": [
     "assert data_algebra.test_util.equivalent_frames(pandas_res_p, db_res_p)"
@@ -641,13 +698,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 19,
    "outputs": [
     {
      "data": {
       "text/plain": "(\n    TableDescription(table_name=\"d\", column_names=[\"id\", \"x\", \"g\"])\n    .extend({\"xm\": \"x.median()\"}, partition_by=[\"g\"])\n    .project({\"xm\": \"xm.mean()\"}, group_by=[\"g\"])\n)"
      },
-     "execution_count": 17,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -676,7 +733,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 20,
    "outputs": [],
    "source": [
     "# clean up\n",