update data validation

NathanielF · NathanielF · commit 60dc6e2cef02 · 2024-03-17T14:30:51.000Z
Signed-off-by: Nathaniel <NathanielF@users.noreply.github.com>
diff --git a/causalpy/data_validation.py b/causalpy/data_validation.py
@@ -140,4 +140,22 @@ class PropensityDataValidator:
 
     def _input_validation(self):
         """Validate the input data and model formula for correctness"""
-        pass
+        # The propensity formula's left-hand side names the treatment column.
+        treatment = self.formula.split("~")[0].strip()
+        # Both the treatment and the outcome to be re-weighted must be columns.
+        required = (treatment, self.outcome_variable)
+        missing = [col for col in required if col not in self.data.columns]
+        if missing:
+            raise DataException(
+                f"The treatment variable '{treatment}' (the outcome of the "
+                "propensity model) and the outcome variable "
+                f"'{self.outcome_variable}' (to be re-weighted) must both "
+                f"appear in the data. Missing column(s): {missing}"
+            )
+        # Counting unique values is not enough: {1, 2} or {0, 5} has at most
+        # two levels but is not a 0-1 coding, so compare the values themselves.
+        levels = set(np.unique(self.data[treatment]))
+        if not levels <= {0, 1}:
+            raise DataException(
+                f"The treatment variable '{treatment}' is not 0-1 binary."
+            )
diff --git a/docs/source/notebooks/inv_prop_pymc.ipynb b/docs/source/notebooks/inv_prop_pymc.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 357,
+   "execution_count": 378,
    "metadata": {},
    "outputs": [
     {
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 325,
+   "execution_count": 376,
    "metadata": {},
    "outputs": [
     {
@@ -83,104 +83,53 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1.374096</td>\n",
-       "      <td>0.373163</td>\n",
-       "      <td>1</td>\n",
-       "      <td>6.054919</td>\n",
+       "      <td>-0.700611</td>\n",
+       "      <td>0.215690</td>\n",
+       "      <td>0</td>\n",
+       "      <td>-1.060506</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>1.051587</td>\n",
-       "      <td>0.834493</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2.927939</td>\n",
+       "      <td>0.880796</td>\n",
+       "      <td>1.082451</td>\n",
+       "      <td>1</td>\n",
+       "      <td>3.778433</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>-0.450553</td>\n",
-       "      <td>0.232016</td>\n",
+       "      <td>-0.121070</td>\n",
+       "      <td>0.767333</td>\n",
        "      <td>0</td>\n",
-       "      <td>-0.043942</td>\n",
+       "      <td>0.617862</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>0.720264</td>\n",
-       "      <td>-0.539953</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.739484</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0.778325</td>\n",
-       "      <td>1.534670</td>\n",
+       "      <td>0.149978</td>\n",
+       "      <td>1.146856</td>\n",
        "      <td>1</td>\n",
-       "      <td>4.425341</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
+       "      <td>2.831018</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>9995</th>\n",
-       "      <td>0.890611</td>\n",
-       "      <td>1.266610</td>\n",
+       "      <th>4</th>\n",
+       "      <td>-0.506154</td>\n",
+       "      <td>0.113415</td>\n",
        "      <td>0</td>\n",
-       "      <td>2.732242</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9996</th>\n",
-       "      <td>1.428810</td>\n",
-       "      <td>1.557557</td>\n",
-       "      <td>1</td>\n",
-       "      <td>5.068505</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9997</th>\n",
-       "      <td>1.678820</td>\n",
-       "      <td>1.254265</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4.317824</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9998</th>\n",
-       "      <td>1.341190</td>\n",
-       "      <td>1.002567</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4.527394</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9999</th>\n",
-       "      <td>1.330508</td>\n",
-       "      <td>0.702635</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2.982631</td>\n",
+       "      <td>-0.106079</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>10000 rows × 4 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "            x1        x2  trt   outcome\n",
-       "0     1.374096  0.373163    1  6.054919\n",
-       "1     1.051587  0.834493    0  2.927939\n",
-       "2    -0.450553  0.232016    0 -0.043942\n",
-       "3     0.720264 -0.539953    0  0.739484\n",
-       "4     0.778325  1.534670    1  4.425341\n",
-       "...        ...       ...  ...       ...\n",
-       "9995  0.890611  1.266610    0  2.732242\n",
-       "9996  1.428810  1.557557    1  5.068505\n",
-       "9997  1.678820  1.254265    1  4.317824\n",
-       "9998  1.341190  1.002567    1  4.527394\n",
-       "9999  1.330508  0.702635    1  2.982631\n",
-       "\n",
-       "[10000 rows x 4 columns]"
+       "         x1        x2  trt   outcome\n",
+       "0 -0.700611  0.215690    0 -1.060506\n",
+       "1  0.880796  1.082451    1  3.778433\n",
+       "2 -0.121070  0.767333    0  0.617862\n",
+       "3  0.149978  1.146856    1  2.831018\n",
+       "4 -0.506154  0.113415    0 -0.106079"
       ]
      },
-     "execution_count": 325,
+     "execution_count": 376,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -189,7 +138,7 @@
     "df1 = pd.DataFrame(np.random.multivariate_normal([0.5, 1], [[2, 1], [1, 1]], size=10000), columns=['x1', 'x2'])\n",
     "df1['trt'] = np.where(-0.5 + 0.25 * df1['x1'] + 0.75 * df1['x2'] +  np.random.normal(0, 1, size=10000) > 0, 1, 0)\n",
     "df1['outcome'] = 2 * df1['trt'] + df1['x1'] + df1['x2'] + np.random.normal(0, 1, size=10000)\n",
-    "df1"
+    "df1.head()"
    ]
   },
   {
@@ -208,7 +157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 338,
+   "execution_count": 379,
    "metadata": {},
    "outputs": [
     {
@@ -227,10 +176,10 @@
     {
      "data": {
       "text/plain": [
-       "<causalpy.pymc_experiments.InversePropensityWeighting at 0x2aebe6110>"
+       "<causalpy.pymc_experiments.InversePropensityWeighting at 0x32412ee50>"
       ]
      },
-     "execution_count": 338,
+     "execution_count": 379,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -878,7 +827,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 373,
+   "execution_count": 380,
    "metadata": {},
    "outputs": [
     {
@@ -969,7 +918,7 @@
        "4   40     0    0              20        19   4.989251"
       ]
      },
-     "execution_count": 373,
+     "execution_count": 380,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -981,7 +930,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 365,
+   "execution_count": 381,
    "metadata": {},
    "outputs": [
     {
@@ -1000,10 +949,10 @@
     {
      "data": {
       "text/plain": [
-       "<causalpy.pymc_experiments.InversePropensityWeighting at 0x2e6f4ba90>"
+       "<causalpy.pymc_experiments.InversePropensityWeighting at 0x3bdbaa4d0>"
       ]
      },
-     "execution_count": 365,
+     "execution_count": 381,
      "metadata": {},
      "output_type": "execute_result"
     }