#76 #44 progress on banking example data processing

drbenvincent · drbenvincent · commit 10a9c826f05d · 2022-11-19T19:49:53.000Z
diff --git a/docs/notebooks/did_pymc_banks.ipynb b/docs/notebooks/did_pymc_banks.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -22,7 +22,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -32,18 +32,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2"
@@ -60,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -89,7 +80,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -111,12 +102,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As a final processing step to make the data amenable to analysis, we will convert from wide form into long form."
+    "Just a few more processing steps are needed to make the data amenable to analysis. First, we will convert from wide form into long form. Then we will add a new column `treated` which indicates the observations where treatment has taken place."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -238,7 +229,7 @@
        "11  1934  Eighth District  110.0"
       ]
      },
-     "execution_count": 126,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -255,11 +246,226 @@
     "df_long"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>year</th>\n",
+       "      <th>district</th>\n",
+       "      <th>bib</th>\n",
+       "      <th>treated</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1929</td>\n",
+       "      <td>Sixth District</td>\n",
+       "      <td>141.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>1929</td>\n",
+       "      <td>Eighth District</td>\n",
+       "      <td>170.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1930</td>\n",
+       "      <td>Sixth District</td>\n",
+       "      <td>135.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>1930</td>\n",
+       "      <td>Eighth District</td>\n",
+       "      <td>165.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1931</td>\n",
+       "      <td>Sixth District</td>\n",
+       "      <td>121.0</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>1931</td>\n",
+       "      <td>Eighth District</td>\n",
+       "      <td>132.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1932</td>\n",
+       "      <td>Sixth District</td>\n",
+       "      <td>113.0</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>1932</td>\n",
+       "      <td>Eighth District</td>\n",
+       "      <td>120.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1933</td>\n",
+       "      <td>Sixth District</td>\n",
+       "      <td>102.0</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>1933</td>\n",
+       "      <td>Eighth District</td>\n",
+       "      <td>111.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>1934</td>\n",
+       "      <td>Sixth District</td>\n",
+       "      <td>102.0</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>1934</td>\n",
+       "      <td>Eighth District</td>\n",
+       "      <td>110.0</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    year         district    bib  treated\n",
+       "0   1929   Sixth District  141.0    False\n",
+       "6   1929  Eighth District  170.0    False\n",
+       "1   1930   Sixth District  135.0    False\n",
+       "7   1930  Eighth District  165.0    False\n",
+       "2   1931   Sixth District  121.0     True\n",
+       "8   1931  Eighth District  132.0    False\n",
+       "3   1932   Sixth District  113.0     True\n",
+       "9   1932  Eighth District  120.0    False\n",
+       "4   1933   Sixth District  102.0     True\n",
+       "10  1933  Eighth District  111.0    False\n",
+       "5   1934   Sixth District  102.0     True\n",
+       "11  1934  Eighth District  110.0    False"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# df_long.assign(treated=lambda x: True if (x.year>=1931 & x.district==\"Sixth District\") else False)\n",
+    "\n",
+    "df_long[\"treated\"] = (df_long.year >= 1931) & (df_long.district == \"Sixth District\")\n",
+    "df_long"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Run the analysis"
+    "## Analysis 1\n",
+    "\n",
+    "First we'll do an analysis looking only at data from 1930 and 1931. This way, each district has just a single pre-intervention measurement (1930) and a single post-intervention measurement (1931)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df1 = df_long[df_long.year.isin([1930, 1931])]\n",
+    "# df1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from causalpy.pymc_experiments import DifferenceInDifferences\n",
+    "from causalpy.pymc_models import LinearRegression\n",
+    "\n",
+    "result = DifferenceInDifferences(\n",
+    "    df_long[df_long.year.isin([1930, 1931])],\n",
+    "    formula=\"bib ~ 1 + district + year + district:treated\",\n",
+    "    time_variable_name=\"year\",\n",
+    "    prediction_model=LinearRegression(),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, ax = result.plot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ax = az.plot_posterior(result.causal_impact, ref_val=0)\n",
+    "ax.set(title=\"Posterior estimate of causal impact\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis 2\n",
+    "\n",
+    "Now we'll do a difference-in-differences analysis of the full dataset."
    ]
   },
   {
@@ -273,7 +479,7 @@
     "\n",
     "result = DifferenceInDifferences(\n",
     "    df_long,\n",
-    "    formula=\"bib ~ 1 + district + year + district:year\",\n",
+    "    formula=\"bib ~ 1 + district + year + district:treated\",\n",
     "    time_variable_name=\"year\",\n",
     "    prediction_model=LinearRegression(),\n",
     ")"