Skip to content

Commit 33c3ff8

Browse files
authored
Merge pull request #185 from TARGENE/brh_data_source
Add functionality to run from_actors mode with more than one TF
2 parents 4dcf5c4 + 45af342 commit 33c3ff8

File tree

8 files changed

+124
-18
lines changed

8 files changed

+124
-18
lines changed

src/confounders.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,4 +106,4 @@ function adapt_flashpca(parsed_args)
106106
pcs = CSV.File(parsed_args["input"], drop=["FID"]) |> DataFrame
107107
rename!(pcs, :IID => :SAMPLE_ID)
108108
CSV.write(parsed_args["output"], pcs)
109-
end
109+
end

src/tmle_inputs/from_actors.jl

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,15 @@ function treatments_from_actors(bqtl_file, env_file, trans_actors_prefix)
1111
bqtls, transactors, extraT
1212
end
1313

14+
filter_snps_by_tf(df::DataFrame, tf_name::AbstractString) = filter(row -> row.TF == tf_name, df)
15+
filter_snps_by_tf(df::DataFrame, tf_name::Nothing) = df
16+
filter_snps_by_tf(df_vector, tf_name::AbstractString) = [filter_snps_by_tf(df, tf_name) for df in df_vector]
17+
filter_snps_by_tf(df_vector, tf_name::Nothing) = df_vector
1418

1519
combine_trans_actors(trans_actors::Vector{DataFrame}, extraT::DataFrame, order) = combinations([trans_actors..., extraT], order)
1620
combine_trans_actors(trans_actors::Vector{DataFrame}, extraT::Nothing, order) = combinations(trans_actors, order)
1721
combine_trans_actors(trans_actors::Nothing, extraT::DataFrame, order) = [[extraT]]
1822

19-
2023
function combine_by_bqtl(bqtls::DataFrame, trans_actors::Union{Vector{DataFrame}, Nothing}, extraT::Union{DataFrame, Nothing}, order::Int)
2124
treatment_combinations = Vector{Symbol}[]
2225
if order == 1
@@ -44,8 +47,11 @@ all_variants(bqtls::DataFrame, transactors::Vector{DataFrame}) = Set(vcat(bqtls.
4447

4548

4649
read_snps_from_csv(path::Nothing) = nothing
47-
read_snps_from_csv(path::String) = unique(CSV.read(path, DataFrame; select=[:ID, :CHR]), :ID)
48-
50+
function read_snps_from_csv(path::String)
51+
df = CSV.read(path, DataFrame)
52+
df = "TF" in names(df) ? unique(df[:, [:ID, :CHR, :TF]], [:ID, :TF]) : unique(df[:, [:ID, :CHR]], :ID)
53+
return(df)
54+
end
4955

5056
trans_actors_from_prefix(trans_actors_prefix::Nothing) = nothing
5157
function trans_actors_from_prefix(trans_actors_prefix::String)
@@ -156,7 +162,14 @@ function parameters_from_actors(bqtls, transactors, data, variables, orders, out
156162
for order in orders
157163
# First generate the `T` section
158164
treatment_combinations = TargeneCore.combine_by_bqtl(bqtls, transactors, extraT_df, order)
165+
# If there are duplicates here, remove them
166+
treatment_combinations = unique(treatment_combinations)
159167
for treatments in treatment_combinations
168+
# If RSID is duplicated in treatments, skip
169+
if length(treatments) != length(unique(treatments))
170+
continue
171+
end
172+
160173
addParameters!(parameters, treatments, variables, data; positivity_constraint=positivity_constraint)
161174

162175
if batch_size !== nothing && size(parameters, 1) >= batch_size
@@ -208,11 +221,19 @@ function tmle_inputs_from_actors(parsed_args)
208221

209222
# Parameter files
210223
variables = TargeneCore.get_variables(pcs, traits, extraW, extraC, extraT)
211-
TargeneCore.parameters_from_actors(
212-
bqtls, transactors, data, variables, orders, outprefix;
224+
225+
# Loop through each TF present in bqtls file
226+
tfs = "TF" in names(bqtls) ? unique(bqtls.TF) : [nothing]
227+
for tf in tfs
228+
outprefix_tf = tf !== nothing ? string(outprefix,".",tf) : outprefix
229+
bqtls_tf = TargeneCore.filter_snps_by_tf(bqtls, tf)
230+
transactors_tf = TargeneCore.filter_snps_by_tf(transactors, tf)
231+
TargeneCore.parameters_from_actors(
232+
bqtls_tf, transactors_tf, data, variables, orders, outprefix_tf;
213233
positivity_constraint=positivity_constraint, batch_size=batch_size
214-
)
234+
)
235+
end
215236

216237
# write data
217238
Arrow.write(string(outprefix, ".data.arrow"), data)
218-
end
239+
end

src/tmle_inputs/tmle_inputs.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
const CHR_REG = r"chr[1-9]+"
22

3-
param_batch_name(outprefix, batch_id) = string(outprefix, ".param_", batch_id, ".yaml")
3+
param_batch_name(outprefix, batch_id) = string(outprefix, ".param_", batch_id, ".yaml")
44

55

66
"""

test/confounders.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ end
5656

5757
clean(parsed_args)
5858

59-
# No qc file provided
59+
# No QC file provided
6060
parsed_args = Dict(
6161
"input" => SnpArrays.datadir("mouse"),
6262
"output" => joinpath("data", "filtered-mouse"),
@@ -138,4 +138,4 @@ end
138138

139139
end;
140140

141-
true
141+
true
File renamed without changes.

test/data/bqtls_2.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
ID,CHR,TF
2+
RSID_17,12,TF1
3+
RSID_99,12,TF1
4+
RSID_17,12,TF2
5+
RSID_198,12,TF2

test/data/trans_actors_3.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
ID,CHR,TF
2+
RSID_102,2,TF1
3+
RSID_2,chr,TF2

test/tmle_inputs/from_actors.jl

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ end
222222
@test_throws ArgumentError TargeneCore.treatments_from_actors(1, nothing, nothing)
223223
@test_throws ArgumentError TargeneCore.treatments_from_actors(nothing, 1, nothing)
224224

225-
bqtl_file = joinpath("data", "bqtls.csv")
225+
bqtl_file = joinpath("data", "bqtls_1.csv")
226226
trans_actors_prefix = joinpath("data", "trans_actors_1.csv")
227227
env_file = joinpath("data", "extra_treatments.txt")
228228
# bqtls and trans_actors
@@ -259,7 +259,7 @@ end
259259
# - Order 1,2
260260
parsed_args = Dict(
261261
"from-actors" => Dict{String, Any}(
262-
"bqtls" => joinpath("data", "bqtls.csv"),
262+
"bqtls" => joinpath("data", "bqtls_1.csv"),
263263
"trans-actors-prefix" => joinpath("data", "trans_actors_1.csv"),
264264
"extra-covariates" => joinpath("data", "extra_covariates.txt"),
265265
"extra-treatments" => joinpath("data", "extra_treatments.txt"),
@@ -333,7 +333,7 @@ end
333333
# - batched
334334
parsed_args = Dict(
335335
"from-actors" => Dict{String, Any}(
336-
"bqtls" => joinpath("data", "bqtls.csv"),
336+
"bqtls" => joinpath("data", "bqtls_1.csv"),
337337
"trans-actors-prefix" => joinpath("data", "trans_actors_2"),
338338
"extra-covariates" => nothing,
339339
"extra-treatments" => nothing,
@@ -361,10 +361,10 @@ end
361361
@test size(traits) == (490, 13)
362362

363363
# Parameter files:
364-
ouparameters_1 = parameters_from_yaml("final.param_1.yaml")
365-
@test size(ouparameters_1, 1) == 100
366-
ouparameters_2 = parameters_from_yaml("final.param_2.yaml")
367-
outparameters = vcat(ouparameters_1, ouparameters_2)
364+
outparameters_1 = parameters_from_yaml("final.param_1.yaml")
365+
@test size(outparameters_1, 1) == 100
366+
outparameters_2 = parameters_from_yaml("final.param_2.yaml")
367+
outparameters = vcat(outparameters_1, outparameters_2)
368368

369369
found_targets = Dict(
370370
:BINARY_1 => 0,
@@ -405,6 +405,83 @@ end
405405
cleanup()
406406
end
407407

408+
@testset "Test tmle_inputs from-actors: scenario 3" begin
409+
# Scenario:
410+
# - Trans-actors
411+
# - Extra Treatment
412+
# - Extra Covariates
413+
# - Order 1,2
414+
# - More than 1 TF present
415+
parsed_args = Dict(
416+
"from-actors" => Dict{String, Any}(
417+
"bqtls" => joinpath("data", "bqtls_2.csv"),
418+
"trans-actors-prefix" => joinpath("data", "trans_actors_3.csv"),
419+
"extra-covariates" => joinpath("data", "extra_covariates.txt"),
420+
"extra-treatments" => joinpath("data", "extra_treatments.txt"),
421+
"extra-confounders" => nothing,
422+
"orders" => "1,2",
423+
),
424+
"traits" => joinpath("data", "traits_1.csv"),
425+
"pcs" => joinpath("data", "pcs.csv"),
426+
"call-threshold" => 0.8,
427+
"%COMMAND%" => "from-actors",
428+
"bgen-prefix" => joinpath("data", "ukbb", "imputed" ,"ukbb"),
429+
"out-prefix" => "final",
430+
"batch-size" => nothing,
431+
"positivity-constraint" => 0.
432+
)
433+
bqtls = Symbol.(unique(CSV.read(parsed_args["from-actors"]["bqtls"], DataFrame).ID))
434+
tmle_inputs(parsed_args)
435+
436+
## Dataset file
437+
trait_data = DataFrame(Arrow.Table("final.data.arrow"))
438+
@test names(trait_data) == [
439+
"SAMPLE_ID", "BINARY_1", "BINARY_2", "CONTINUOUS_1", "CONTINUOUS_2",
440+
"COV_1", "21003", "22001", "TREAT_1", "PC1", "PC2", "RSID_2", "RSID_102",
441+
"RSID_17", "RSID_198", "RSID_99"]
442+
@test size(trait_data) == (490, 16)
443+
444+
## Parameter file:
445+
outparameters = [parameters_from_yaml("final.TF1.param_1.yaml"), parameters_from_yaml("final.TF2.param_1.yaml")]
446+
found_targets = Dict(
447+
:BINARY_1 => 0,
448+
:CONTINUOUS_2 => 0,
449+
:CONTINUOUS_1 => 0,
450+
:BINARY_2 => 0
451+
)
452+
for tf in [1,2]
453+
outparameters_tf = outparameters[tf]
454+
for Ψ in outparameters_tf
455+
if Ψ isa ATE
456+
ntreatments = length(Ψ.treatment)
457+
if ntreatments > 1
458+
@test all(Ψ.treatment[index].case == Ψ.treatment[index].control for index in 2:ntreatments)
459+
end
460+
else
461+
@test Ψ isa IATE
462+
@test all(cc.case != cc.control for cc in Ψ.treatment)
463+
end
464+
@test Ψ.covariates == [:COV_1, Symbol("21003"), Symbol("22001")]
465+
@test Ψ.confounders == [:PC1, :PC2]
466+
# The first treatment will be a bqtl
467+
@test keys(Ψ.treatment)[1] in bqtls
468+
@test Ψ.treatment[1].case isa AbstractString
469+
@test length(Ψ.treatment) in [1, 2]
470+
found_targets[Ψ.target] += 1
471+
end
472+
end
473+
# The number of parameters with various targets should be the same
474+
@test all(x == found_targets[:BINARY_1] for x in values(found_targets))
475+
# This is difficult to really check the ordering
476+
# Those correspond to the simple bQTL ATE
477+
for tf in [1,2]
478+
first_treatments = keys(outparameters[tf][1].treatment)
479+
@test all(keys(Ψ.treatment) == first_treatments for Ψ in outparameters[tf][1:12])
480+
end
481+
482+
cleanup()
483+
end
484+
408485
end
409486

410487
true

0 commit comments

Comments
 (0)