TARGENE
diff --git a/‎Project.toml‎
Lines changed: 0 additions & 2 deletions b/‎Project.toml‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎docs/make.jl‎
Lines changed: 3 additions & 1 deletion b/‎docs/make.jl‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/src/assets/interaction_graph.png‎
86.6 KB b/‎docs/src/assets/interaction_graph.png‎
86.6 KB
diff --git a/‎examples/interactions_correlated.jl‎
Lines changed: 225 additions & 0 deletions b/‎examples/interactions_correlated.jl‎
Lines changed: 225 additions & 0 deletions
diff --git a/‎src/TMLE.jl‎
Lines changed: 0 additions & 1 deletion b/‎src/TMLE.jl‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/counterfactual_mean_based/estimands.jl‎
Lines changed: 8 additions & 1 deletion b/‎src/counterfactual_mean_based/estimands.jl‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎test/Project.toml‎
Lines changed: 3 additions & 1 deletion b/‎test/Project.toml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎test/counterfactual_mean_based/3points_interactions.jl‎
Lines changed: 8 additions & 2 deletions b/‎test/counterfactual_mean_based/3points_interactions.jl‎
Lines changed: 8 additions & 2 deletions
@@ -19,7 +19,6 @@ MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
 MetaGraphsNext = "fa8bd995-216d-47f1-8a91-f3b68fbeb377"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SplitApplyCombine = "03a91e81-4c3e-53e1-a0a4-9c0c8f19dd66"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -55,7 +54,6 @@ MLJModels = "0.15, 0.16, 0.17"
 MetaGraphsNext = "0.7"
 Missings = "1.0"
 OrderedCollections = "1.6.3"
-PrecompileTools = "1.1.1"
 SplitApplyCombine = "1.2.2"
 TableOperations = "1.2"
 Tables = "1.6"
 
@@ -24,6 +24,7 @@ makedocs(;
         prettyurls=get(ENV, "CI", "false") == "true",
         canonical="https://TARGENE.github.io/TMLE.jl",
         assets=String["assets/logo.ico"],
+        size_threshold=nothing
     ),
     pages=[
         "Home" => "index.md",
@@ -32,7 +33,8 @@ makedocs(;
             ("scm.md", "estimands.md", "estimation.md")],
         "Examples" => [
             joinpath("examples", "super_learning.md"),
-            joinpath("examples", "double_robustness.md")
+            joinpath("examples", "double_robustness.md"),
+            joinpath("examples", "interactions_correlated.md"),
             ],
         "Integrations" => "integrations.md",
         "Estimators' Cheat Sheet" => "estimators_cheatsheet.md",
 
@@ -0,0 +1,225 @@
+#=
+# Interaction Estimation
+
+In this example we aim to estimate the average interaction effect of two, potentially correlated, 
+treatment variables `T1` and `T2` on an outcome `Y`.
+
+## Data Generating Process
+
+Let's consider the following structural causal model where the shaded nodes represent the observed variables.
+
+![interaction-graph](../assets/interaction_graph.png)
+
+In other words, only one confounding variable is observed (`W1`). This would be a major problem if we wanted to estimate the 
+average treatment effect of `T1` or `T2` on `Y` separately. However, here, we are interested in interactions and thus `W1` is 
+a sufficient adjustment set. This artificial situation is ubiquitous in genetics, where two main sources of confounding exist: 
+ancestry, which can be estimated (here `W1`), and linkage disequilibrium, which is usually more challenging to address (here `W2`).
+
+Let us first define some helper functions and import some libraries.
+=#
+using Distributions
+using Random
+using DataFrames
+using Statistics
+using CategoricalArrays
+using TMLE
+using CairoMakie
+using MLJXGBoostInterface
+using MLJ
+using MLJLinearModels
+Random.seed!(123)
+
+## Shared latent mean of both treatments: the sum of the confounders in `w`, repeated twice.
+μT(w) = fill(sum(w), 2)
+
+## Conditional mean of `Y`: intercept, main effect of `t[1]`, and a `t[1]`-`t[2]` interaction scaled by `w`.
+μY(t, w) = 1 + 10 * t[1] - 3 * t[2] * t[1] * w
+
+#=
+We assume that `W1` and `W2`, the confounding variables, follow a uniform distribution.
+=#
+
+"""
+    generate_W(n)
+
+Draw the confounders as a `2 × n` matrix of independent `Uniform(0, 1)` samples,
+one column per observation.
+"""
+function generate_W(n)
+    return rand(Uniform(0, 1), 2, n)
+end
+
+#=
+`T1` and `T2` are generated via a copula-like method: a bivariate normal with covariance σ is sampled and thresholded, which induces some statistical dependence between the two treatments.
+=#
+
+"""
+    generate_T(W, n; σ=0.5, threshold=0)
+
+Simulate two binary treatments for `n` observations and return them as a `2 × n`
+`Bool` matrix.
+
+For observation `i`, a bivariate normal with mean `μT(W[:, i])`, unit variances and
+covariance `σ` is sampled; each treatment is `true` when its latent coordinate exceeds
+`threshold`.
+"""
+function generate_T(W, n; σ=0.5, threshold=0)
+    Σ = [1.0 σ; σ 1.0]
+    T = zeros(Bool, 2, n)
+    for i in 1:n
+        latent = rand(MultivariateNormal(μT(W[:, i]), Σ))
+        T[:, i] = latent .> threshold
+    end
+    return T
+end
+
+#=
+Finally, `Y` is generated through a simple linear model with an interaction term.
+=#
+
+"""
+    generate_Y(T, W1, n; σY=1)
+
+Draw the outcome for `n` observations: `Y[i]` is sampled from a normal distribution with
+mean `μY(T[:, i], W1[i])` and standard deviation `σY`. Returns a `Vector{Float64}`.
+"""
+function generate_Y(T, W1, n; σY=1)
+    return Float64[rand(Normal(μY(T[:, i], W1[i]), σY)) for i in 1:n]
+end
+
+#=
+Importantly, since `W1` is uniform on ``[0, 1]`` (so ``\mathbb{E}[W_1] = 0.5``), the average interaction effect between `T1` and `T2` is thus ``-3 \mathbb{E}[W_1] = -1.5``.
+
+We will generate a full dataset with the following function.
+=#
+
+"""
+    generate_dataset(; n=1000, σ=0.5, threshold=0., σY=1)
+
+Simulate `n` observations and return them as a `DataFrame` with columns `W1`, `W2`
+(confounders), `T1`, `T2` (categorical binary treatments) and `Y` (outcome).
+
+`σ` and `threshold` are forwarded to `generate_T`, and `σY` to `generate_Y`.
+"""
+function generate_dataset(;n=1000, σ=0.5, threshold=0., σY=1)
+    ## Confounders and treatments are simulated as 2 × n matrices (one column per observation).
+    W = generate_W(n)
+    T = generate_T(W, n; σ=σ, threshold=threshold)
+
+    ## Only `W1` enters the outcome model; `W2` acts through the treatments alone.
+    W1, W2 = W[1, :], W[2, :]
+    Y = generate_Y(T, W1, n; σY=σY)
+
+    return DataFrame(
+        W1=W1,
+        W2=W2,
+        T1=categorical(T[1, :]),
+        T2=categorical(T[2, :]),
+        Y=Y,
+    )
+end
+
+dataset = generate_dataset()
+
+first(dataset, 5)
+#=
+Let's verify that each treatment level is sufficiently present in the dataset (≈positivity).
+=#
+
+combine(groupby(dataset, [:T1, :T2]), proprow => :JOINT_TREATMENT_FREQ)
+
+#=
+And that `T1` and `T2` are indeed correlated.
+=#
+
+"""
+    treatment_correlation(dataset)
+
+Return the correlation between the unwrapped values of the `T1` and `T2` columns of `dataset`.
+"""
+function treatment_correlation(dataset)
+    t1 = unwrap.(dataset.T1)
+    t2 = unwrap.(dataset.T2)
+    return cor(t1, t2)
+end
+@assert treatment_correlation(dataset) > 0.2 #hide
+treatment_correlation(dataset)
+
+#=
+## Estimation
+
+We can now proceed to estimation using TMLE and default (linear) models. 
+
+Interactions are defined via the `AIE` function (note that we only set `W1` as a confounder).
+=#
+
+Ψ = AIE(
+    outcome=:Y,
+    treatment_values= (
+        T1=(case=1, control=0), 
+        T2=(case=1, control=0)
+    ),
+    treatment_confounders = [:W1]  # only W1 is adjusted for; W2 is left out of the adjustment set
+)
+linear_models = default_models(G=LogisticClassifier(lambda=0), Q_continuous=LinearRegressor())  # NOTE(review): G presumably configures the treatment models and Q_continuous the outcome model; confirm against the TMLE.jl docs
+estimator = TMLEE(models=linear_models, weighted=true)
+result, _ = estimator(Ψ, dataset; verbosity=0)  # the second return value is not needed here
+@assert pvalue(significance_test(result, -1.5)) > 0.05 #hide
+significance_test(result)
+
+#=
+The true effect size is thus covered by our confidence interval.
+
+## Varying levels of correlation
+
+We now vary the correlation level between `T1` and `T2` to observe how it affects the estimation results. 
+First, let's see how the parameter σ affects the correlation between `T1` and `T2`.
+=#
+
+"""
+    plot_correlations(; σs=0.1:0.1:0.9, n=1000, threshold=0., σY=1.)
+
+Scatter-plot the empirical correlation between `T1` and `T2` against the latent
+covariance parameter `σ`. One dataset of size `n` is simulated for each value in `σs`.
+
+Values in `σs` must stay strictly below 1 in absolute value: at `σ = 1` the latent
+covariance matrix `[1 σ; σ 1]` is singular and cannot parameterise the
+`MultivariateNormal` sampled in `generate_T`. The default range therefore stops at 0.9.
+
+Returns the Makie `Figure`.
+"""
+function plot_correlations(;σs = 0.1:0.1:0.9, n=1000, threshold=0., σY=1.)
+    fig = Figure()
+    ax = Axis(fig[1, 1], xlabel="σ", ylabel="Correlation(T1, T2)")
+    correlations = map(σs) do σ
+        simulated = generate_dataset(;n=n, σ=σ, threshold=threshold, σY=σY)
+        return treatment_correlation(simulated)
+    end
+    scatter!(ax, σs, correlations, color=:blue)
+    return fig
+end
+
+σs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
+plot_correlations(;σs=σs, n=10_000)
+
+#=
+As expected, the correlation between `T1` and `T2` increases with σ. Let's see how this affects estimation. 
+To do so, we will vary both the dataset size and the correlation level.
+=#
+
+"""
+    estimate_across_correlation_levels(σs; n=1000, estimator=TMLEE(weighted=true))
+
+For each `σ` in `σs`, simulate a fresh dataset of size `n` and estimate the global
+estimand `Ψ` with `estimator`.
+
+Returns `(Ψ̂s, errors)`: the point estimates and, for each of them, the distance from the
+estimate to the upper bound of its confidence interval (used as error-bar length).
+"""
+function estimate_across_correlation_levels(σs; n=1000, estimator=TMLEE(weighted=true))
+    ## The estimator returns the result first; anything after it is ignored here.
+    results = [first(estimator(Ψ, generate_dataset(n=n, σ=σ); verbosity=0)) for σ in σs]
+    Ψ̂s = TMLE.estimate.(results)
+    upper_bounds = last.(confint.(significance_test.(results)))
+    return Ψ̂s, upper_bounds .- Ψ̂s
+end
+
+"""
+    estimate_across_sample_sizes_and_correlation_levels(ns, σs; estimator=...)
+
+Run `estimate_across_correlation_levels(σs; n=n, estimator=estimator)` for every sample
+size `n` in `ns`.
+
+Returns a vector with one `(Ψ̂s, errors)` pair per entry of `ns`, in the same order. The
+vector is built with a comprehension so that its element type is inferred rather than
+being `Any`.
+"""
+function estimate_across_sample_sizes_and_correlation_levels(ns, σs; estimator=TMLEE(models=linear_models, weighted=true))
+    return [estimate_across_correlation_levels(σs; n=n, estimator=estimator) for n in ns]
+end
+
+"""
+    plot_across_sample_sizes_and_correlation_levels(results, ns, σs; title=...)
+
+Draw one panel per sample size in `ns` (stacked vertically), showing the estimates in
+`results[i]` against `σs` with their error bars and a dashed line at the true effect
+(-1.5). Only the bottom panel carries the `σ` axis label and tick labels.
+
+`results` is expected to hold one `(Ψ̂s, errors)` pair per entry of `ns`, in the same order.
+Returns the Makie `Figure`.
+"""
+function plot_across_sample_sizes_and_correlation_levels(results, ns, σs; title="Estimation via TMLE (GLMs)")
+    fig = Figure(size=(800, 800))
+    for (index, n) in enumerate(ns)
+        Ψ̂s, errors = results[index]
+        ## Identify the bottom panel by position: comparing `n` with `last(ns)` would also
+        ## match earlier panels whenever a sample size is repeated.
+        ax = if index == length(ns)
+            Axis(fig[index, 1], xlabel="σ", ylabel="AIE\n(n=$n)")
+        else
+            Axis(fig[index, 1], ylabel="AIE\n(n=$n)", xticklabelsvisible=false)
+        end
+        errorbars!(ax, σs, Ψ̂s, errors, color = :blue, whiskerwidth = 10)
+        scatter!(ax, σs, Ψ̂s, color=:red, markersize=10)
+        hlines!(ax, [-1.5], color=:green, linestyle=:dash)
+    end
+    Label(fig[0, :], title; tellwidth=false, fontsize=24)
+    return fig
+end
+
+ns = [100, 1000, 10_000, 100_000, 1_000_000]
+σs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.999]
+results = estimate_across_sample_sizes_and_correlation_levels(ns, σs; estimator=TMLEE(models=linear_models, weighted=true))
+plot_across_sample_sizes_and_correlation_levels(results, ns, σs; title="Estimation via TMLE (GLMs)")
+
+#=
+First, notice that only extreme correlations (>0.9) tend to blow up the size of the confidence intervals. This implies that statistical power may be limited in such circumstances.
+
+Furthermore, and perhaps unexpectedly, coverage decreases as sample size grows for larger correlations. Since we have used simple linear models until now, 
+this could be due to model misspecification. We can verify this by using a more flexible modelling strategy. Here we will use XGBoost 
+(with `tree_method="hist"` to speed things up a little). Because this model is prone to overfitting, we will also use cross-validation (this will take a few minutes).
+=#
+
+## XGBoost models for both `G` and `Q_continuous`, combined with 3-fold stratified
+## cross-validation passed to the estimator through `resampling`.
+xgboost_estimator = TMLEE(
+    models=default_models(G=XGBoostClassifier(tree_method="hist"), Q_continuous=XGBoostRegressor(tree_method="hist")),
+    weighted=true,
+    resampling=StratifiedCV(nfolds=3)
+)
+xgboost_results = estimate_across_sample_sizes_and_correlation_levels(ns, σs, estimator=xgboost_estimator)
+plot_across_sample_sizes_and_correlation_levels(xgboost_results, ns, σs; title="Estimation via TMLE (XGBoost)")
+
+#=
+As expected, XGBoost improves estimation performance in the asymptotic regime. Furthermore, 
+the correlation between `T1` and `T2` seems harmless (except when σ > 0.9, as before).
+=#
@@ -13,7 +13,6 @@ using Statistics
 using Distributions
 using Zygote
 using LogExpFunctions
-using PrecompileTools
 using Random
 using DifferentiationInterface
 using Graphs
 
@@ -124,7 +124,14 @@ outcome_mean(Ψ::StatisticalCMCompositeEstimand) = ExpectedValue(Ψ.outcome, Tup
 
 outcome_mean_key(Ψ::StatisticalCMCompositeEstimand) = variables(outcome_mean(Ψ))
 
-propensity_score(Ψ::StatisticalCMCompositeEstimand) = Tuple(ConditionalDistribution(T, Ψ.treatment_confounders[T]) for T in treatments(Ψ))
+"""
+    propensity_score(Ψ::StatisticalCMCompositeEstimand)
+
+Return a tuple with one `ConditionalDistribution` per treatment of `Ψ`.
+
+Each treatment is conditioned on its own entry in `Ψ.treatment_confounders` together with
+every treatment that comes after it in `TMLE.treatments(Ψ)`, so the last treatment is
+conditioned on its confounders only.
+"""
+function propensity_score(Ψ::StatisticalCMCompositeEstimand)
+    treatment_names = TMLE.treatments(Ψ)
+    factors = map(enumerate(treatment_names)) do (position, treatment)
+        later_treatments = treatment_names[position+1:end]
+        parents = (Ψ.treatment_confounders[treatment]..., later_treatments...)
+        ConditionalDistribution(treatment, parents)
+    end
+    return Tuple(factors)
+end
 
 propensity_score_key(Ψ::StatisticalCMCompositeEstimand) = Tuple(variables(x) for x ∈ propensity_score(Ψ))
 
 
@@ -12,6 +12,7 @@ MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
 MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
+MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -29,6 +30,7 @@ CSV = "0.10"
 DataFrames = "1.5"
 MLJLinearModels = "0.10"
 StableRNGs = "1.0"
-StatisticalMeasures = "0.1.3"
+StatisticalMeasures = "0.2"
+MLJXGBoostInterface = "0.3"
 StatsBase = "0.34"
 YAML = "0.4.9"
@@ -36,13 +36,19 @@ end
         ),
         treatment_confounders = (T₁=[:W], T₂=[:W], T₃=[:W])
     )
+    ## Check propensity score is well formed
+    propensity_score = TMLE.propensity_score(Ψ)
+    @test propensity_score[1] == TMLE.ConditionalDistribution(:T₁, (:T₂, :T₃, :W))
+    @test propensity_score[2] == TMLE.ConditionalDistribution(:T₂, (:T₃, :W))
+    @test propensity_score[3] == TMLE.ConditionalDistribution(:T₃, (:W,))
+    ## Define models
     models = Dict(
         :Y  => with_encoder(InteractionTransformer(order=3) |> LinearRegressor()),
         :T₁ => LogisticClassifier(lambda=0),
         :T₂ => LogisticClassifier(lambda=0),
         :T₃ => LogisticClassifier(lambda=0)
     )
-
+    ## Estimate
     tmle = TMLEE(models=models, machine_cache=true, max_iter=3, tol=0)
     result, cache = tmle(Ψ, dataset, verbosity=0);
     test_coverage(result, Ψ₀)
@@ -54,7 +60,7 @@ end
     test_coverage(result, Ψ₀)
     test_mean_inf_curve_almost_zero(result; atol=1e-10)
 
-    # CHecking cache accessors
+    # Checking cache accessors
     @test length(gradients(cache)) == 3
     @test length(estimates(cache)) == 3
     @test length(epsilons(cache)) == 3