
Commit 4dcf5c4

Brh - update confounders.jl for non-UKBB cohort (#176)

* Make the qcfile flag optional so this code can execute without it
* Add a test for when no qc file is passed as argument
* Add a default value for qcfile
* Change the logic to evaluate whether a qcfile argument was given
* Update the test function to be compatible with the changes to confounders.jl
* Regroup the qcfile logic and add a clean method for the tests
* Add optionality of the ldblocks file

---------

Co-authored-by: s2223108 <[email protected]>
Co-authored-by: Olivier Labayle <[email protected]>
1 parent 0c4657c commit 4dcf5c4

4 files changed: 86 additions & 35 deletions
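The commit makes the UK Biobank-specific inputs optional by defaulting them to nothing and adding Nothing methods that simply pass the data through, so the calling code needs no if/else branching. Below is a minimal, hypothetical sketch of that dispatch pattern; the demo_filter name and toy data are illustrative, not part of the repository:

using DataFrames

# No-op method: when the optional file is `nothing`, return the data unchanged.
demo_filter(data, file::Nothing) = data

# Regular method: when a file path is given, apply the filter.
function demo_filter(data, file::AbstractString)
    keep = Set(readlines(file))
    return filter(:snpid => in(keep), data)
end

df = DataFrame(snpid = ["rs1", "rs2"])
@assert demo_filter(df, nothing) === df   # untouched when no file is provided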

bin/prepare_confounders.jl

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,8 @@ function parse_commandline()
         "--qcfile"
             help = "Path to the UKBiobank ukb_snp_qc.txt"
             arg_type = String
+            default = nothing
+            required = false
         "--maf-threshold", "-t"
             help = "SNPs with MAF lower than this value will be filtered out"
             arg_type = Float64
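For context, a short sketch of how the omitted flag now parses, assuming the script builds its argument table with ArgParse.jl (which the help/arg_type/default fields suggest); everything except the --qcfile entry shown in the diff is illustrative:

using ArgParse

function parse_commandline_sketch()
    s = ArgParseSettings()
    @add_arg_table! s begin
        "--qcfile"
            help = "Path to the UKBiobank ukb_snp_qc.txt"
            arg_type = String
            default = nothing   # an omitted --qcfile parses to `nothing`, which downstream code now handles
            required = false
    end
    return parse_args(s)
end

# julia prepare_confounders.jl                   -> parsed["qcfile"] === nothing
# julia prepare_confounders.jl --qcfile qc.txt   -> parsed["qcfile"] == "qc.txt"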

src/confounders.jl

Lines changed: 34 additions & 26 deletions
@@ -23,6 +23,35 @@ function notin_ldblocks(row, ldblocks)
     end
 end
 
+ld_blocks_filter(data, ld_blocks_file::Nothing) = data
+
+function ld_blocks_filter(data, ld_blocks_file)
+    ld_blocks = CSV.File(
+        ld_blocks_file;
+        header=["rsid", "chr", "pos", "LDblock_lower", "LDblock_upper", "LDblock_length", "lower_bound", "upper_bound"]) |> DataFrame
+    ld_blocks.chr = string.(ld_blocks.chr)
+    transform!(ld_blocks,
+        :,
+        [:pos, :lower_bound, :upper_bound] => ByRow(bounds) => [:lower_bound, :upper_bound],
+    )
+    ld_blocks = groupby(ld_blocks, :chr)
+    return filter(x -> notin_ldblocks(x, ld_blocks), data)
+end
+
+ukb_qc_filter(data, qcfile::Nothing) = data
+
+function ukb_qc_filter(data, qcfile)
+    qc_df = CSV.File(qcfile) |> DataFrame
+    fully_genotyped_snps = innerjoin(
+        data,
+        qc_df,
+        on = :snpid => :rs_id,
+        makeunique = true
+    )
+    # Assayed in both genotyping arrays
+    return filter(:array => ==(2), fully_genotyped_snps)
+end
+
 
 """
     filter_chromosome(parsed_args)
@@ -34,53 +63,32 @@ We filter SNPs using quality control metrics from the following resource:
 - https://biobank.ndph.ox.ac.uk/showcase/refer.cgi?id=1955
 """
 function filter_chromosome(parsed_args)
-
-    qc_df = CSV.File(parsed_args["qcfile"]) |> DataFrame
-
     snp_data = SnpData(parsed_args["input"])
-    # Load and redefine LD bounds
-    ld_blocks = CSV.File(
-        parsed_args["ld-blocks"];
-        header=["rsid","chr","pos","LDblock_lower","LDblock_upper","LDblock_length","lower_bound","upper_bound"]) |> DataFrame
-    ld_blocks.chr = string.(ld_blocks.chr)
-    transform!(ld_blocks,
-        :,
-        [:pos, :lower_bound, :upper_bound] => ByRow(bounds) => [:lower_bound, :upper_bound],
-    )
-    ld_blocks = groupby(ld_blocks, :chr)
 
     # Remove SNP's with MAF < maf-threshold
     maf_threshold = parsed_args["maf-threshold"]
     snp_data.snp_info[!, "MAF"] = SnpArrays.maf(snp_data.snparray)
     mafpassed = filter(:MAF => >=(maf_threshold), snp_data.snp_info)
 
     # Remove LD regions specified by ld_blocks
-    ld_pruned = filter(x -> notin_ldblocks(x, ld_blocks), mafpassed)
+    ld_pruned = ld_blocks_filter(mafpassed, parsed_args["ld-blocks"])
 
     # The QC file contains information on fully genotyped SNPS
     # We only keep those
-    fully_genotyped_snps = innerjoin(
-        ld_pruned,
-        qc_df,
-        on = :snpid => :rs_id,
-        makeunique = true
-    )
+    qced = ukb_qc_filter(ld_pruned, parsed_args["qcfile"])
 
     # If an RSID appears multiple times, it is because it has
     # more than 2 possible alleles: we remove them
     # (why? maybe because the PCA then cannot tackle them)
-    duplicate_rsids = Set(fully_genotyped_snps.snpid[nonunique(fully_genotyped_snps, ["snpid"])])
-    biallelic = filter(:snpid=>∉(duplicate_rsids), fully_genotyped_snps)
+    duplicate_rsids = Set(qced.snpid[nonunique(qced, ["snpid"])])
+    biallelic = filter(:snpid=>∉(duplicate_rsids), qced)
 
     # Keep only actual SNPs and not other kinds of variants
     actual_snps = subset(biallelic, :allele1 => ByRow(issnp), :allele2 => ByRow(issnp))
 
     # All batches pass QC
     batch_cols = [x for x in names(actual_snps) if occursin("Batch", x)]
-    batches_ok = filter(row -> all_batches_ok(row, batch_cols), actual_snps)
-
-    # Assayed in both genotyping arrays
-    final = filter(:array => ==(2), batches_ok)
+    final = filter(row -> all_batches_ok(row, batch_cols), actual_snps)
 
     rsids = Set(final.snpid)
     sample_ids = Set(CSV.read(parsed_args["traits"], DataFrame, select=["SAMPLE_ID"], types=String)[!, 1])
test/confounders.jl

Lines changed: 50 additions & 9 deletions
@@ -6,6 +6,12 @@ using TargeneCore
 using DataFrames
 using CSV
 
+function clean(parsed_args)
+    for ext in [".bed", ".bim", ".fam"]
+        rm(parsed_args["output"]*ext)
+    end
+end
+
 @testset "Various functions" begin
     # Test issnp
     @test TargeneCore.issnp("A") == true
@@ -30,11 +36,12 @@ using CSV
 end
 
 @testset "Test filter_chromosome" begin
+    # All options provided
     parsed_args = Dict(
         "input" => SnpArrays.datadir("mouse"),
         "output" => joinpath("data", "filtered-mouse"),
         "qcfile" => joinpath("data", "ukbb", "qcfile.txt"),
-        "ld-blocks" => joinpath("data", "VDR_LD_blocks.txt"),
+        "ld-blocks" => joinpath("data", "LD_blocks.txt"),
         "maf-threshold" => 0.31,
         "traits" => joinpath("data", "sample_ids.txt")
     )
@@ -46,10 +53,47 @@ end
     @test size(filtered.snparray) == (5, 1)
     @test filtered.person_info.iid ==
         ["A048005080", "A048006063", "A048006555", "A048007096", "A048010273"]
-    # Clean
-    for ext in [".bed", ".bim", ".fam"]
-        rm(parsed_args["output"]*ext)
-    end
+
+    clean(parsed_args)
+
+    # No qc file provided
+    parsed_args = Dict(
+        "input" => SnpArrays.datadir("mouse"),
+        "output" => joinpath("data", "filtered-mouse"),
+        "qcfile" => nothing,
+        "ld-blocks" => joinpath("data", "LD_blocks.txt"),
+        "maf-threshold" => 0.495,
+        "traits" => joinpath("data", "sample_ids.txt")
+    )
+    filter_chromosome(parsed_args)
+
+    filtered = SnpData(parsed_args["output"])
+
+    @test size(filtered.snparray) == (5, 88)
+    @test filtered.person_info.iid ==
+        ["A048005080", "A048006063", "A048006555", "A048007096", "A048010273"]
+
+    clean(parsed_args)
+
+    # No ld-block file provided
+    parsed_args = Dict(
+        "input" => SnpArrays.datadir("mouse"),
+        "output" => joinpath("data", "filtered-mouse"),
+        "qcfile" => nothing,
+        "ld-blocks" => nothing,
+        "maf-threshold" => 0.495,
+        "traits" => joinpath("data", "sample_ids.txt")
+    )
+    filter_chromosome(parsed_args)
+
+    filtered = SnpData(parsed_args["output"])
+    # More variants than the previous settings
+    @test size(filtered.snparray) == (5, 95)
+    @test filtered.person_info.iid ==
+        ["A048005080", "A048006063", "A048006555", "A048007096", "A048010273"]
+
+    clean(parsed_args)
+
 end
 
 @testset "Test merge_beds" begin
@@ -65,10 +109,7 @@ end
 
     @test length(unique(merged.snp_info.chromosome)) == 3
     # Clean
-
-    for ext in [".bed", ".bim", ".fam"]
-        rm(parsed_args["output"]*ext)
-    end
+    clean(parsed_args)
 
 end
 

File renamed without changes.
