|
29 | 29 | }, |
30 | 30 | { |
31 | 31 | "cell_type": "code", |
32 | | - "execution_count": 2, |
| 32 | + "execution_count": 1, |
33 | 33 | "id": "dd8821be", |
34 | 34 | "metadata": {}, |
35 | 35 | "outputs": [], |
|
62 | 62 | }, |
63 | 63 | { |
64 | 64 | "cell_type": "code", |
65 | | - "execution_count": 3, |
| 65 | + "execution_count": 2, |
66 | 66 | "id": "3f63bbe8", |
67 | 67 | "metadata": {}, |
68 | 68 | "outputs": [ |
|
282 | 282 | "[5 rows x 67 columns]" |
283 | 283 | ] |
284 | 284 | }, |
285 | | - "execution_count": 3, |
| 285 | + "execution_count": 2, |
286 | 286 | "metadata": {}, |
287 | 287 | "output_type": "execute_result" |
288 | 288 | } |
|
320 | 320 | }, |
321 | 321 | { |
322 | 322 | "cell_type": "code", |
323 | | - "execution_count": 4, |
| 323 | + "execution_count": 3, |
324 | 324 | "id": "ff2f4b4b", |
325 | 325 | "metadata": {}, |
326 | 326 | "outputs": [ |
|
358 | 358 | "print(\"처치군/대조군:\\n\", df_clean[treatment].value_counts())" |
359 | 359 | ] |
360 | 360 | }, |
361 | | - { |
362 | | - "cell_type": "code", |
363 | | - "execution_count": 5, |
364 | | - "id": "d985c724", |
365 | | - "metadata": {}, |
366 | | - "outputs": [], |
367 | | - "source": [ |
368 | | - "# 범주형 변수 원핫 인코딩\n", |
369 | | - "categorical_vars = [\"sex\", \"race\", \"education\", \"active\", \"exercise\"]\n", |
370 | | - "df_encoded = pd.get_dummies(df_clean, columns=categorical_vars, drop_first=True)\n", |
371 | | - "\n", |
372 | | - "# 연속형 변수 스케일링\n", |
373 | | - "numeric_confounders = [\"age\", \"smokeintensity\", \"smokeyrs\", \"wt71\"]\n", |
374 | | - "scaler = StandardScaler()\n", |
375 | | - "df_encoded[numeric_confounders] = scaler.fit_transform(df_encoded[numeric_confounders])" |
376 | | - ] |
377 | | - }, |
378 | 361 | { |
379 | 362 | "cell_type": "markdown", |
380 | 363 | "id": "25ebfea3", |
|
385 | 368 | }, |
386 | 369 | { |
387 | 370 | "cell_type": "code", |
388 | | - "execution_count": 21, |
| 371 | + "execution_count": 5, |
389 | 372 | "id": "65df40db", |
390 | 373 | "metadata": {}, |
391 | 374 | "outputs": [ |
|
401 | 384 | } |
402 | 385 | ], |
403 | 386 | "source": [ |
404 | | - "# 시각화용 DAG (원핫 인코딩 전 변수 기준)\n", |
405 | | - "gml_graph_viz = \"\"\"\n", |
| 387 | + "gml_graph = \"\"\"\n", |
406 | 388 | "graph [\n", |
407 | 389 | " directed 1\n", |
408 | 390 | "\n", |
|
442 | 424 | "]\n", |
443 | 425 | "\"\"\"\n", |
444 | 426 | "\n", |
445 | | - "cm_for_viz = CausalModel(data=df_clean, treatment=treatment, outcome=outcome, graph=gml_graph_viz)\n", |
446 | | - "cm_for_viz.view_model(\n", |
| 427 | + "cm = CausalModel(data=df_clean, treatment=treatment, outcome=outcome, graph=gml_graph)\n", |
| 428 | + "cm.view_model(\n", |
447 | 429 | " layout=\"dot\"\n", |
448 | 430 | ")" |
449 | 431 | ] |
450 | 432 | }, |
451 | | - { |
452 | | - "cell_type": "code", |
453 | | - "execution_count": 7, |
454 | | - "id": "2539d612", |
455 | | - "metadata": {}, |
456 | | - "outputs": [], |
457 | | - "source": [ |
458 | | - "# 추정용 DAG\n", |
459 | | - "# - 모든 공변량(원핫 포함) -> {qsmk, wt82_71}\n", |
460 | | - "# - qsmk -> wt82_71\n", |
461 | | - "\n", |
462 | | - "all_cols = df_encoded.columns.tolist()\n", |
463 | | - "confounder_cols = [c for c in all_cols if c not in [treatment, outcome]]\n", |
464 | | - "\n", |
465 | | - "gml_nodes = []\n", |
466 | | - "gml_nodes.append(f' node [ id \"{treatment}\" label \"{treatment}\" ]')\n", |
467 | | - "gml_nodes.append(f' node [ id \"{outcome}\" label \"{outcome}\" ]')\n", |
468 | | - "for c in confounder_cols:\n", |
469 | | - " gml_nodes.append(f' node [ id \"{c}\" label \"{c}\" ]')\n", |
470 | | - "\n", |
471 | | - "gml_edges = []\n", |
472 | | - "for c in confounder_cols:\n", |
473 | | - " gml_edges.append(f' edge [ source \"{c}\" target \"{treatment}\" ]')\n", |
474 | | - " gml_edges.append(f' edge [ source \"{c}\" target \"{outcome}\" ]')\n", |
475 | | - "gml_edges.append(f' edge [ source \"{treatment}\" target \"{outcome}\" ]')\n", |
476 | | - "\n", |
477 | | - "gml_graph = 'graph [\\n directed 1\\n\\n' + \"\\n\".join(gml_nodes) + \"\\n\\n\" + \"\\n\".join(gml_edges) + \"\\n]\"" |
478 | | - ] |
479 | | - }, |
480 | 433 | { |
481 | 434 | "cell_type": "markdown", |
482 | 435 | "id": "5aa85b75", |
|
487 | 440 | }, |
488 | 441 | { |
489 | 442 | "cell_type": "code", |
490 | | - "execution_count": 8, |
| 443 | + "execution_count": null, |
491 | 444 | "id": "5d714d62", |
492 | 445 | "metadata": {}, |
493 | 446 | "outputs": [ |
|
501 | 454 | "Estimand name: backdoor\n", |
502 | 455 | "Estimand expression:\n", |
503 | 456 | " d ↪\n", |
504 | | - "───────(E[wt_82_71|sex_1,active_1,education_4,race_1,education_2,wt71,educatio ↪\n", |
| 457 | + "───────(E[wt_82_71|sex,age,smokeyrs,race,active,wt71,education,exercise,smokei ↪\n", |
505 | 458 | "d[qsmk] ↪\n", |
506 | 459 | "\n", |
507 | | - "↪ \n", |
508 | | - "↪ n_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age])\n", |
509 | | - "↪ \n", |
510 | | - "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age,U) = P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age)\n", |
| 460 | + "↪ \n", |
| 461 | + "↪ ntensity])\n", |
| 462 | + "↪ \n", |
| 463 | + "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity,U) = P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity)\n", |
511 | 464 | "\n", |
512 | 465 | "### Estimand : 2\n", |
513 | 466 | "Estimand name: iv\n", |
|
521 | 474 | "Estimand name: general_adjustment\n", |
522 | 475 | "Estimand expression:\n", |
523 | 476 | " d ↪\n", |
524 | | - "───────(E[wt_82_71|sex_1,active_1,education_4,race_1,education_2,wt71,educatio ↪\n", |
| 477 | + "───────(E[wt_82_71|sex,age,smokeyrs,race,active,wt71,education,exercise,smokei ↪\n", |
525 | 478 | "d[qsmk] ↪\n", |
526 | 479 | "\n", |
527 | | - "↪ \n", |
528 | | - "↪ n_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age])\n", |
529 | | - "↪ \n", |
530 | | - "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age,U) = P(wt82_71|qsmk,sex_1,active_1,education_4,race_1,education_2,wt71,education_5,active_2,smokeintensity,exercise_2,exercise_1,education_3,smokeyrs,age)\n", |
| 480 | + "↪ \n", |
| 481 | + "↪ ntensity])\n", |
| 482 | + "↪ \n", |
| 483 | + "Estimand assumption 1, Unconfoundedness: If U→{qsmk} and U→wt82_71 then P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity,U) = P(wt82_71|qsmk,sex,age,smokeyrs,race,active,wt71,education,exercise,smokeintensity)\n", |
531 | 484 | "\n" |
532 | 485 | ] |
533 | 486 | } |
534 | 487 | ], |
535 | 488 | "source": [ |
536 | 489 | "est_model = CausalModel(\n", |
537 | | - " data=df_encoded,\n", |
| 490 | + " data=df_clean,\n", |
538 | 491 | " treatment=treatment,\n", |
539 | 492 | " outcome=outcome,\n", |
540 | 493 | " graph=gml_graph\n", |
|
579 | 532 | }, |
580 | 533 | { |
581 | 534 | "cell_type": "code", |
582 | | - "execution_count": 9, |
| 535 | + "execution_count": 7, |
583 | 536 | "id": "830fe396", |
584 | 537 | "metadata": {}, |
585 | 538 | "outputs": [ |
|
588 | 541 | "output_type": "stream", |
589 | 542 | "text": [ |
590 | 543 | "\n", |
591 | | - "[ATE] Linear Regression: 3.3811710339880983\n" |
| 544 | + "[ATE] Linear Regression: 3.3811710339880823\n" |
592 | 545 | ] |
593 | 546 | } |
594 | 547 | ], |
|
610 | 563 | }, |
611 | 564 | { |
612 | 565 | "cell_type": "code", |
613 | | - "execution_count": 10, |
| 566 | + "execution_count": 8, |
614 | 567 | "id": "a7aa0ee5", |
615 | 568 | "metadata": {}, |
616 | 569 | "outputs": [ |
617 | 570 | { |
618 | 571 | "name": "stdout", |
619 | 572 | "output_type": "stream", |
620 | 573 | "text": [ |
621 | | - "[ATE] DR Learner: 3.893642417299098\n", |
622 | | - "[ATE] DR Learner 95% CI: [[[3.35046186]]\n", |
| 574 | + "[ATE] DR Learner: 3.8479455513705227\n", |
| 575 | + "[ATE] DR Learner 95% CI: [[[3.12167472]]\n", |
623 | 576 | "\n", |
624 | | - " [[5.20724188]]]\n" |
| 577 | + " [[5.29732315]]]\n" |
625 | 578 | ] |
626 | 579 | } |
627 | 580 | ], |
|
657 | 610 | }, |
658 | 611 | { |
659 | 612 | "cell_type": "code", |
660 | | - "execution_count": 11, |
| 613 | + "execution_count": 9, |
661 | 614 | "id": "a91cdd48", |
662 | 615 | "metadata": {}, |
663 | 616 | "outputs": [ |
664 | 617 | { |
665 | 618 | "name": "stdout", |
666 | 619 | "output_type": "stream", |
667 | 620 | "text": [ |
668 | | - "[ATE] DML: 3.464128951245904\n", |
669 | | - "[ATE] DML 95% CI: [[[2.4019656]]\n", |
| 621 | + "[ATE] DML: 3.8611289087159135\n", |
| 622 | + "[ATE] DML 95% CI: [[[3.06774555]]\n", |
670 | 623 | "\n", |
671 | | - " [[4.4395689]]]\n" |
| 624 | + " [[5.17092571]]]\n" |
672 | 625 | ] |
673 | 626 | } |
674 | 627 | ], |
|
696 | 649 | "\n" |
697 | 650 | ] |
698 | 651 | }, |
| 652 | + { |
| 653 | + "cell_type": "markdown", |
| 654 | + "id": "60080461", |
| 655 | + "metadata": {}, |
| 656 | + "source": [ |
| 657 | + "## Refute" |
| 658 | + ] |
| 659 | + }, |
699 | 660 | { |
700 | 661 | "cell_type": "code", |
701 | | - "execution_count": 20, |
702 | | - "id": "869355b3", |
| 662 | + "execution_count": 15, |
| 663 | + "id": "18481fc5", |
703 | 664 | "metadata": {}, |
704 | 665 | "outputs": [ |
705 | 666 | { |
706 | 667 | "name": "stdout", |
707 | 668 | "output_type": "stream", |
708 | 669 | "text": [ |
709 | | - "[ATE] DML: 3.464128951245904\n" |
| 670 | + "[ATE] DML: 3.8611289087159135\n" |
710 | 671 | ] |
711 | 672 | } |
712 | 673 | ], |
|
725 | 686 | "print(\"[ATE] DML:\", estimate_dml.value)" |
726 | 687 | ] |
727 | 688 | }, |
728 | | - { |
729 | | - "cell_type": "markdown", |
730 | | - "id": "60080461", |
731 | | - "metadata": {}, |
732 | | - "source": [ |
733 | | - "## Refute" |
734 | | - ] |
735 | | - }, |
736 | 689 | { |
737 | 690 | "cell_type": "markdown", |
738 | 691 | "id": "bdcda503", |
|
749 | 702 | }, |
750 | 703 | { |
751 | 704 | "cell_type": "code", |
752 | | - "execution_count": 21, |
| 705 | + "execution_count": 11, |
753 | 706 | "id": "9ad86368", |
754 | 707 | "metadata": {}, |
755 | 708 | "outputs": [ |
|
758 | 711 | "output_type": "stream", |
759 | 712 | "text": [ |
760 | 713 | "Refute: Add a random common cause\n", |
761 | | - "Estimated effect:3.6087034824487034\n", |
762 | | - "New effect:3.4568539723790566\n", |
763 | | - "p value:0.42\n", |
| 714 | + "Estimated effect:3.5111934415796173\n", |
| 715 | + "New effect:3.4686090208687586\n", |
| 716 | + "p value:0.72\n", |
764 | 717 | "\n" |
765 | 718 | ] |
766 | 719 | } |
|
785 | 738 | "- **기대**: New effect ≈ Estimated effect\n", |
786 | 739 | "- **해석**:\n", |
787 | 740 | " - 크게 변하지 않으면 → 잠재적 누락변수(confounder) 에도 견고(robust)\n", |
788 | | - " - 크게 변하면 → 모델이 숨은 교란에 민감, 추가 변수 고려 필요" |
| 741 | + " - 크게 변하면 → 모델이 숨은 교란에 민감, 추가 변수 고려 필요\n", |
| 742 | + "- **참고**: 도메인 지식을 기반으로, 교란이 처리변수와 결과에 미치는 영향의 크기는 사용자가 직접 설정해야 합니다." |
789 | 743 | ] |
790 | 744 | }, |
791 | 745 | { |
792 | 746 | "cell_type": "code", |
793 | | - "execution_count": 22, |
| 747 | + "execution_count": 12, |
794 | 748 | "id": "dcce63b4", |
795 | 749 | "metadata": {}, |
796 | 750 | "outputs": [ |
|
799 | 753 | "output_type": "stream", |
800 | 754 | "text": [ |
801 | 755 | "Refute: Add an Unobserved Common Cause\n", |
802 | | - "Estimated effect:3.6087034824487034\n", |
803 | | - "New effect:3.4630068223974586\n", |
| 756 | + "Estimated effect:3.5111934415796173\n", |
| 757 | + "New effect:2.993841050044186\n", |
804 | 758 | "\n" |
805 | 759 | ] |
806 | 760 | } |
|
834 | 788 | }, |
835 | 789 | { |
836 | 790 | "cell_type": "code", |
837 | | - "execution_count": 23, |
| 791 | + "execution_count": 13, |
838 | 792 | "id": "545e6293", |
839 | 793 | "metadata": {}, |
840 | 794 | "outputs": [ |
|
843 | 797 | "output_type": "stream", |
844 | 798 | "text": [ |
845 | 799 | "Refute: Use a Placebo Treatment\n", |
846 | | - "Estimated effect:3.6087034824487034\n", |
847 | | - "New effect:-0.01017831576756214\n", |
848 | | - "p value:0.4903498144600218\n", |
| 800 | + "Estimated effect:3.5111934415796173\n", |
| 801 | + "New effect:0.14283539321647556\n", |
| 802 | + "p value:0.3866345448655441\n", |
849 | 803 | "\n" |
850 | 804 | ] |
851 | 805 | } |
|
879 | 833 | }, |
880 | 834 | { |
881 | 835 | "cell_type": "code", |
882 | | - "execution_count": 24, |
| 836 | + "execution_count": 14, |
883 | 837 | "id": "69481385", |
884 | 838 | "metadata": {}, |
885 | 839 | "outputs": [ |
|
888 | 842 | "output_type": "stream", |
889 | 843 | "text": [ |
890 | 844 | "Refute: Use a subset of data\n", |
891 | | - "Estimated effect:3.6087034824487034\n", |
892 | | - "New effect:3.5177935748179356\n", |
893 | | - "p value:0.36231391604195595\n", |
| 845 | + "Estimated effect:3.5111934415796173\n", |
| 846 | + "New effect:3.4052136015402126\n", |
| 847 | + "p value:0.4004970164482444\n", |
894 | 848 | "\n" |
895 | 849 | ] |
896 | 850 | } |
|
0 commit comments