CausalInferenceLab
diff --git a/‎.DS_Store‎
-6 KB b/‎.DS_Store‎
-6 KB
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎book/scm/backdoor_criterion.ipynb‎
Lines changed: 58 additions & 104 deletions b/‎book/scm/backdoor_criterion.ipynb‎
Lines changed: 58 additions & 104 deletions
diff --git a/‎book/scm/causal_model.png‎
81.9 KB b/‎book/scm/causal_model.png‎
81.9 KB
@@ -136,3 +136,5 @@ _build/
 
 # poetry files
 pyproject.toml
+
+.DS_Store
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "dd8821be",
    "metadata": {},
    "outputs": [],
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "3f63bbe8",
    "metadata": {},
    "outputs": [
@@ -282,7 +282,7 @@
        "[5 rows x 67 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -320,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "ff2f4b4b",
    "metadata": {},
    "outputs": [
@@ -358,23 +358,6 @@
     "print(\"처치군/대조군:\\n\", df_clean[treatment].value_counts())"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "d985c724",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 범주형 변수 원핫 인코딩\n",
-    "categorical_vars = [\"sex\", \"race\", \"education\", \"active\", \"exercise\"]\n",
-    "df_encoded = pd.get_dummies(df_clean, columns=categorical_vars, drop_first=True)\n",
-    "\n",
-    "# 연속형 변수 스케일링\n",
-    "numeric_confounders = [\"age\", \"smokeintensity\", \"smokeyrs\", \"wt71\"]\n",
-    "scaler = StandardScaler()\n",
-    "df_encoded[numeric_confounders] = scaler.fit_transform(df_encoded[numeric_confounders])"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "25ebfea3",
@@ -385,7 +368,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 5,
    "id": "65df40db",
    "metadata": {},
    "outputs": [
@@ -401,8 +384,7 @@
     }
    ],
    "source": [
-    "# 시각화용 DAG (원핫 인코딩 전 변수 기준)\n",
-    "gml_graph_viz = \"\"\"\n",
+    "gml_graph = \"\"\"\n",
     "graph [\n",
     "  directed 1\n",
     "\n",
@@ -442,41 +424,12 @@
     "]\n",
     "\"\"\"\n",
     "\n",
-    "cm_for_viz = CausalModel(data=df_clean, treatment=treatment, outcome=outcome, graph=gml_graph_viz)\n",
-    "cm_for_viz.view_model(\n",
+    "cm = CausalModel(data=df_clean, treatment=treatment, outcome=outcome, graph=gml_graph)\n",
+    "cm.view_model(\n",
     "    layout=\"dot\"\n",
     ")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "2539d612",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 추정용 DAG\n",
-    "#   - 모든 공변량(원핫 포함) -> {qsmk, wt82_71}\n",
-    "#   - qsmk -> wt82_71\n",
-    "\n",
-    "all_cols = df_encoded.columns.tolist()\n",
-    "confounder_cols = [c for c in all_cols if c not in [treatment, outcome]]\n",
-    "\n",
-    "gml_nodes = []\n",
-    "gml_nodes.append(f'  node [ id \"{treatment}\" label \"{treatment}\" ]')\n",
-    "gml_nodes.append(f'  node [ id \"{outcome}\" label \"{outcome}\" ]')\n",
-    "for c in confounder_cols:\n",
-    "    gml_nodes.append(f'  node [ id \"{c}\" label \"{c}\" ]')\n",
-    "\n",
-    "gml_edges = []\n",
-    "for c in confounder_cols:\n",
-    "    gml_edges.append(f'  edge [ source \"{c}\" target \"{treatment}\" ]')\n",
-    "    gml_edges.append(f'  edge [ source \"{c}\" target \"{outcome}\" ]')\n",
-    "gml_edges.append(f'  edge [ source \"{treatment}\" target \"{outcome}\" ]')\n",
-    "\n",
-    "gml_graph = 'graph [\\n  directed 1\\n\\n' + \"\\n\".join(gml_nodes) + \"\\n\\n\" + \"\\n\".join(gml_edges) + \"\\n]\""
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "5aa85b75",
@@ -487,7 +440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "5d714d62",
    "metadata": {},
    "outputs": [
@@ -501,13 +454,13 @@
       "Estimand name: backdoor\n",
       "Estimand expression:\n",
       "   d                                                                           ↪\n",
-      "───────(E[wt_82_71|sex_1,active_1,education_4,race_1,education_2,wt71,educatio ↪\n",
+      "───────(E[wt_82_71|sex,age,smokeyrs,race,active,wt71,education,exercise,smokei ↪\n",
       "d[qsmk]                                                                        ↪\n",
       "\n",
-      "↪                                                                             \n",
-      "↪ n_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age])\n",
-      "↪                                                                             \n",
-      "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age,U) = P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age)\n",
+      "↪           \n",
+      "↪ ntensity])\n",
+      "↪           \n",
+      "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity,U) = P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity)\n",
       "\n",
       "### Estimand : 2\n",
       "Estimand name: iv\n",
@@ -521,20 +474,20 @@
       "Estimand name: general_adjustment\n",
       "Estimand expression:\n",
       "   d                                                                           ↪\n",
-      "───────(E[wt_82_71|sex_1,active_1,education_4,race_1,education_2,wt71,educatio ↪\n",
+      "───────(E[wt_82_71|sex,age,smokeyrs,race,active,wt71,education,exercise,smokei ↪\n",
       "d[qsmk]                                                                        ↪\n",
       "\n",
-      "↪                                                                             \n",
-      "↪ n_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age])\n",
-      "↪                                                                             \n",
-      "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age,U) = P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age)\n",
+      "↪           \n",
+      "↪ ntensity])\n",
+      "↪           \n",
+      "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity,U) = P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity)\n",
       "\n"
      ]
     }
    ],
    "source": [
     "est_model = CausalModel(\n",
-    "    data=df_encoded,\n",
+    "    data=df_clean,\n",
     "    treatment=treatment,\n",
     "    outcome=outcome,\n",
     "    graph=gml_graph\n",
@@ -579,7 +532,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "id": "830fe396",
    "metadata": {},
    "outputs": [
@@ -588,7 +541,7 @@
      "output_type": "stream",
      "text": [
       "\n",
-      "[ATE] Linear Regression: 3.3811710339880983\n"
+      "[ATE] Linear Regression: 3.3811710339880823\n"
      ]
     }
    ],
@@ -610,18 +563,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "id": "a7aa0ee5",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[ATE] DR Learner: 3.893642417299098\n",
-      "[ATE] DR Learner 95% CI: [[[3.35046186]]\n",
+      "[ATE] DR Learner: 3.8479455513705227\n",
+      "[ATE] DR Learner 95% CI: [[[3.12167472]]\n",
       "\n",
-      " [[5.20724188]]]\n"
+      " [[5.29732315]]]\n"
      ]
     }
    ],
@@ -657,18 +610,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "id": "a91cdd48",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[ATE] DML: 3.464128951245904\n",
-      "[ATE] DML 95% CI: [[[2.4019656]]\n",
+      "[ATE] DML: 3.8611289087159135\n",
+      "[ATE] DML 95% CI: [[[3.06774555]]\n",
       "\n",
-      " [[4.4395689]]]\n"
+      " [[5.17092571]]]\n"
      ]
     }
    ],
@@ -696,17 +649,25 @@
     "\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "60080461",
+   "metadata": {},
+   "source": [
+    "## Refute"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "id": "869355b3",
+   "execution_count": 15,
+   "id": "18481fc5",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[ATE] DML: 3.464128951245904\n"
+      "[ATE] DML: 3.8611289087159135\n"
      ]
     }
    ],
@@ -725,14 +686,6 @@
     "print(\"[ATE] DML:\", estimate_dml.value)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "60080461",
-   "metadata": {},
-   "source": [
-    "## Refute"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "bdcda503",
@@ -749,7 +702,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 11,
    "id": "9ad86368",
    "metadata": {},
    "outputs": [
@@ -758,9 +711,9 @@
      "output_type": "stream",
      "text": [
       "Refute: Add a random common cause\n",
-      "Estimated effect:3.6087034824487034\n",
-      "New effect:3.4568539723790566\n",
-      "p value:0.42\n",
+      "Estimated effect:3.5111934415796173\n",
+      "New effect:3.4686090208687586\n",
+      "p value:0.72\n",
       "\n"
      ]
     }
@@ -785,12 +738,13 @@
     "- **기대**: New effect ≈ Estimated effect\n",
     "- **해석**:\n",
     "   - 크게 변하지 않으면 → 잠재적 누락변수(confounder) 에도 견고(robust)\n",
-    "   - 크게 변하면 → 모델이 숨은 교란에 민감, 추가 변수 고려 필요"
+    "   - 크게 변하면 → 모델이 숨은 교란에 민감, 추가 변수 고려 필요\n",
+    "- **참고**: 미관측 교란변수가 처치변수와 결과변수에 미치는 영향의 크기는 도메인 지식을 기반으로 사용자가 직접 설정해야 합니다."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 12,
    "id": "dcce63b4",
    "metadata": {},
    "outputs": [
@@ -799,8 +753,8 @@
      "output_type": "stream",
      "text": [
       "Refute: Add an Unobserved Common Cause\n",
-      "Estimated effect:3.6087034824487034\n",
-      "New effect:3.4630068223974586\n",
+      "Estimated effect:3.5111934415796173\n",
+      "New effect:2.993841050044186\n",
       "\n"
      ]
     }
@@ -834,7 +788,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 13,
    "id": "545e6293",
    "metadata": {},
    "outputs": [
@@ -843,9 +797,9 @@
      "output_type": "stream",
      "text": [
       "Refute: Use a Placebo Treatment\n",
-      "Estimated effect:3.6087034824487034\n",
-      "New effect:-0.01017831576756214\n",
-      "p value:0.4903498144600218\n",
+      "Estimated effect:3.5111934415796173\n",
+      "New effect:0.14283539321647556\n",
+      "p value:0.3866345448655441\n",
       "\n"
      ]
     }
@@ -879,7 +833,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 14,
    "id": "69481385",
    "metadata": {},
    "outputs": [
@@ -888,9 +842,9 @@
      "output_type": "stream",
      "text": [
       "Refute: Use a subset of data\n",
-      "Estimated effect:3.6087034824487034\n",
-      "New effect:3.5177935748179356\n",
-      "p value:0.36231391604195595\n",
+      "Estimated effect:3.5111934415796173\n",
+      "New effect:3.4052136015402126\n",
+      "p value:0.4004970164482444\n",
       "\n"
      ]
     }