Brh - update confounders.jl for non-UKBB cohort (#176)

roskamsh · olivierlabayle · web-flow · commit 4dcf5c4d1e55 · 2023-10-19T16:38:10.000+01:00
* make the qc file flag optional for this code to execute

* add test for no qc file passed as argument

* add default value for qcfile

* change logic to evaluate whether there is no qcfile argument given

* update test function to be compatible with changes to confounders.jl

* regroup qcfile logic and add clean method for tests

* make the ldblocks file optional

---------

Co-authored-by: s2223108 <b.j.roskams-hieter@sms.ed.ac.uk>
Co-authored-by: Olivier Labayle <olabayle@gmail.com>
diff --git a/bin/prepare_confounders.jl b/bin/prepare_confounders.jl
@@ -18,6 +18,8 @@ function parse_commandline()
         "--qcfile"
             help = "Path to the UKBiobank ukb_snp_qc.txt"
             arg_type = String
+            default = nothing
+            required = false
         "--maf-threshold", "-t"
             help = "SNPs with MAF lower than this value will be filtered out"
             arg_type = Float64
diff --git a/src/confounders.jl b/src/confounders.jl
@@ -23,6 +23,35 @@ function notin_ldblocks(row, ldblocks)
     end
 end
 
+ld_blocks_filter(data, ld_blocks_file::Nothing) = data
+
+function ld_blocks_filter(data, ld_blocks_file)
+    ld_blocks = CSV.File(
+        ld_blocks_file;
+        header=["rsid", "chr", "pos", "LDblock_lower", "LDblock_upper", "LDblock_length", "lower_bound", "upper_bound"]) |> DataFrame
+    ld_blocks.chr = string.(ld_blocks.chr)
+    transform!(ld_blocks,
+        :,
+        [:pos, :lower_bound, :upper_bound] => ByRow(bounds) => [:lower_bound, :upper_bound],
+        )
+    ld_blocks = groupby(ld_blocks, :chr)
+    return filter(x -> notin_ldblocks(x, ld_blocks), data)
+end
+
+ukb_qc_filter(data, qcfile::Nothing) = data
+
+function ukb_qc_filter(data, qcfile)
+    qc_df = CSV.File(qcfile) |> DataFrame
+    fully_genotyped_snps = innerjoin(
+        data, 
+        qc_df, 
+        on = :snpid => :rs_id,
+        makeunique = true
+    )
+    # Assayed in both genotyping arrays
+    return filter(:array => ==(2), fully_genotyped_snps)
+end
+
 
 """
     filter_chromosome(parsed_args)
@@ -34,53 +63,32 @@ We filter SNPs using quality control metrics from the following resource:
     - https://biobank.ndph.ox.ac.uk/showcase/refer.cgi?id=1955
 """
 function filter_chromosome(parsed_args)
-
-    qc_df = CSV.File(parsed_args["qcfile"]) |> DataFrame
-    
     snp_data = SnpData(parsed_args["input"])
-    # Load and redefine LD bounds
-    ld_blocks = CSV.File(
-        parsed_args["ld-blocks"];
-        header=["rsid","chr","pos","LDblock_lower","LDblock_upper","LDblock_length","lower_bound","upper_bound"]) |> DataFrame
-    ld_blocks.chr = string.(ld_blocks.chr)
-    transform!(ld_blocks,
-        :,
-        [:pos, :lower_bound, :upper_bound] => ByRow(bounds) => [:lower_bound, :upper_bound],
-        )
-    ld_blocks = groupby(ld_blocks, :chr)
 
     # Remove SNP's with MAF < maf-threshold
     maf_threshold = parsed_args["maf-threshold"]
     snp_data.snp_info[!, "MAF"] = SnpArrays.maf(snp_data.snparray)
     mafpassed = filter(:MAF => >=(maf_threshold), snp_data.snp_info)
     
     # Remove LD regions specified by ld_blocks
-    ld_pruned = filter(x -> notin_ldblocks(x, ld_blocks), mafpassed)
+    ld_pruned = ld_blocks_filter(mafpassed, parsed_args["ld-blocks"])
 
     # The QC file contains information on fully genotyped SNPS
     # We only keep those
-    fully_genotyped_snps = innerjoin(
-        ld_pruned, 
-        qc_df, 
-        on = :snpid => :rs_id,
-        makeunique = true
-    )
+    qced = ukb_qc_filter(ld_pruned, parsed_args["qcfile"])
 
     # If an RSID appears multiple times, it is because it has 
     # more than 2 possible alleles: we remove them 
     # (why? maybe because the PCA then cannot tackle them)
-    duplicate_rsids = Set(fully_genotyped_snps.snpid[nonunique(fully_genotyped_snps, ["snpid"])])
-    biallelic = filter(:snpid=>∉(duplicate_rsids), fully_genotyped_snps)
+    duplicate_rsids = Set(qced.snpid[nonunique(qced, ["snpid"])])
+    biallelic = filter(:snpid=>∉(duplicate_rsids), qced)
 
     # Keep only actual SNPs and not other kinds of variants
     actual_snps = subset(biallelic, :allele1 => ByRow(issnp), :allele2 => ByRow(issnp))
 
     # All batches pass QC 
     batch_cols = [x for x in names(actual_snps) if occursin("Batch", x)]
-    batches_ok = filter(row -> all_batches_ok(row, batch_cols), actual_snps)
-
-    # Assayed in both genotyping arrays
-    final = filter(:array => ==(2), batches_ok)
+    final = filter(row -> all_batches_ok(row, batch_cols), actual_snps)
     
     rsids = Set(final.snpid)
     sample_ids = Set(CSV.read(parsed_args["traits"], DataFrame, select=["SAMPLE_ID"], types=String)[!, 1])
diff --git a/test/confounders.jl b/test/confounders.jl
@@ -6,6 +6,12 @@ using TargeneCore
 using DataFrames
 using CSV
 
+function clean(parsed_args)
+    for ext in [".bed", ".bim", ".fam"]
+        rm(parsed_args["output"]*ext)
+    end
+end
+
 @testset "Various functions" begin
     # Test issnp
     @test TargeneCore.issnp("A") == true
@@ -30,11 +36,12 @@ using CSV
 end
 
 @testset "Test filter_chromosome" begin
+    # All options provided
     parsed_args = Dict(
         "input"  => SnpArrays.datadir("mouse"),
         "output" => joinpath("data", "filtered-mouse"),
         "qcfile" => joinpath("data", "ukbb", "qcfile.txt"),
-        "ld-blocks" => joinpath("data", "VDR_LD_blocks.txt"),
+        "ld-blocks" => joinpath("data", "LD_blocks.txt"),
         "maf-threshold" => 0.31,
         "traits" => joinpath("data", "sample_ids.txt")
     )
@@ -46,10 +53,47 @@ end
     @test size(filtered.snparray) == (5, 1)
     @test filtered.person_info.iid == 
         ["A048005080", "A048006063", "A048006555", "A048007096", "A048010273"]
-    # Clean
-    for ext in [".bed", ".bim", ".fam"]
-        rm(parsed_args["output"]*ext)
-    end
+    
+    clean(parsed_args)
+
+    # No qc file provided
+    parsed_args = Dict(
+        "input"  => SnpArrays.datadir("mouse"),
+        "output" => joinpath("data", "filtered-mouse"),
+        "qcfile" => nothing,
+        "ld-blocks" => joinpath("data", "LD_blocks.txt"),
+        "maf-threshold" => 0.495,
+        "traits" => joinpath("data", "sample_ids.txt")
+    )
+    filter_chromosome(parsed_args)
+
+    filtered = SnpData(parsed_args["output"])
+
+    @test size(filtered.snparray) == (5, 88)
+    @test filtered.person_info.iid == 
+        ["A048005080", "A048006063", "A048006555", "A048007096", "A048010273"]
+    
+    clean(parsed_args)
+
+    # No ld-block file provided
+    parsed_args = Dict(
+        "input"  => SnpArrays.datadir("mouse"),
+        "output" => joinpath("data", "filtered-mouse"),
+        "qcfile" => nothing,
+        "ld-blocks" => nothing,
+        "maf-threshold" => 0.495,
+        "traits" => joinpath("data", "sample_ids.txt")
+    )
+    filter_chromosome(parsed_args)
+
+    filtered = SnpData(parsed_args["output"])
+    # More variants than the previous settings
+    @test size(filtered.snparray) == (5, 95)
+    @test filtered.person_info.iid == 
+        ["A048005080", "A048006063", "A048006555", "A048007096", "A048010273"]
+
+    clean(parsed_args)
+
 end
 
 @testset "Test merge_beds" begin
@@ -65,10 +109,7 @@ end
 
     @test length(unique(merged.snp_info.chromosome)) == 3
     # Clean
-
-    for ext in [".bed", ".bim", ".fam"]
-        rm(parsed_args["output"]*ext)
-    end
+    clean(parsed_args)
 
 end
 
diff --git a/test/data/LD_blocks.txt b/test/data/LD_blocks.txt