@@ -23,6 +23,35 @@ function notin_ldblocks(row, ldblocks)
23
23
end
24
24
end
25
25
26
"""
    ld_blocks_filter(data, ld_blocks_file)

Keep the rows of `data` for which `notin_ldblocks(row, ld_blocks)` is `true`.

When `ld_blocks_file` is `nothing`, no filtering is applied and `data` is returned
unchanged.

Otherwise `ld_blocks_file` is read into a `DataFrame` with explicitly supplied column
names (`rsid`, `chr`, `pos`, `LDblock_lower`, `LDblock_upper`, `LDblock_length`,
`lower_bound`, `upper_bound`). The `chr` column is converted to strings, the
`lower_bound` / `upper_bound` columns are recomputed row by row with
`bounds(pos, lower_bound, upper_bound)`, and the table is grouped by `chr` before
being handed to `notin_ldblocks`.

NOTE(review): `bounds` and `notin_ldblocks` are defined elsewhere in this file; their
exact semantics are not restated here.
"""
ld_blocks_filter(data, ld_blocks_file::Nothing) = data

function ld_blocks_filter(data, ld_blocks_file)
    # These names must match the column selectors used below (:chr, :pos, ...).
    header = [
        "rsid",
        "chr",
        "pos",
        "LDblock_lower",
        "LDblock_upper",
        "LDblock_length",
        "lower_bound",
        "upper_bound",
    ]
    ld_blocks = CSV.read(ld_blocks_file, DataFrame; header = header)
    ld_blocks.chr = string.(ld_blocks.chr)
    transform!(
        ld_blocks,
        :,
        [:pos, :lower_bound, :upper_bound] => ByRow(bounds) => [:lower_bound, :upper_bound],
    )
    # Separate binding: a variable that is reassigned and then captured by a closure
    # gets boxed, so the grouped frame is not stored back into `ld_blocks`.
    blocks_by_chr = groupby(ld_blocks, :chr)
    return filter(row -> notin_ldblocks(row, blocks_by_chr), data)
end
40
+
41
"""
    ukb_qc_filter(data, qcfile)

Restrict `data` using the table stored in `qcfile`.

When `qcfile` is `nothing`, `data` is returned unchanged. Otherwise the table is read
from `qcfile` and inner-joined to `data`, matching the `snpid` column of `data`
against the `rs_id` column of the table (clashing column names are made unique).
Only the joined rows whose `array` column equals `2` are returned.
"""
ukb_qc_filter(data, qcfile::Nothing) = data

function ukb_qc_filter(data, qcfile)
    qc_table = DataFrame(CSV.File(qcfile))
    joined = innerjoin(data, qc_table; on = :snpid => :rs_id, makeunique = true)
    # Assayed in both genotyping arrays
    return filter(:array => ==(2), joined)
end
54
+
26
55
27
56
"""
28
57
filter_chromosome(parsed_args)
@@ -34,53 +63,32 @@ We filter SNPs using quality control metrics from the following resource:
34
63
- https://biobank.ndph.ox.ac.uk/showcase/refer.cgi?id=1955
35
64
"""
36
65
function filter_chromosome (parsed_args)
37
-
38
- qc_df = CSV. File (parsed_args[" qcfile" ]) |> DataFrame
39
-
40
66
snp_data = SnpData (parsed_args[" input" ])
41
- # Load and redefine LD bounds
42
- ld_blocks = CSV. File (
43
- parsed_args[" ld-blocks" ];
44
- header= [" rsid" ," chr" ," pos" ," LDblock_lower" ," LDblock_upper" ," LDblock_length" ," lower_bound" ," upper_bound" ]) |> DataFrame
45
- ld_blocks. chr = string .(ld_blocks. chr)
46
- transform! (ld_blocks,
47
- :,
48
- [:pos , :lower_bound , :upper_bound ] => ByRow (bounds) => [:lower_bound , :upper_bound ],
49
- )
50
- ld_blocks = groupby (ld_blocks, :chr )
51
67
52
68
# Remove SNP's with MAF < maf-threshold
53
69
maf_threshold = parsed_args[" maf-threshold" ]
54
70
snp_data. snp_info[! , " MAF" ] = SnpArrays. maf (snp_data. snparray)
55
71
mafpassed = filter (:MAF => >= (maf_threshold), snp_data. snp_info)
56
72
57
73
# Remove LD regions specified by ld_blocks
58
- ld_pruned = filter (x -> notin_ldblocks (x, ld_blocks), mafpassed )
74
+ ld_pruned = ld_blocks_filter (mafpassed, parsed_args[ " ld-blocks " ] )
59
75
60
76
# The QC file contains information on fully genotyped SNPS
61
77
# We only keep those
62
- fully_genotyped_snps = innerjoin (
63
- ld_pruned,
64
- qc_df,
65
- on = :snpid => :rs_id ,
66
- makeunique = true
67
- )
78
+ qced = ukb_qc_filter (ld_pruned, parsed_args[" qcfile" ])
68
79
69
80
# If an RSID appears multiple times, it is because it has
70
81
# more than 2 possible alleles: we remove them
71
82
# (why? maybe because the PCA then cannot tackle them)
72
- duplicate_rsids = Set (fully_genotyped_snps . snpid[nonunique (fully_genotyped_snps , [" snpid" ])])
73
- biallelic = filter (:snpid => ∉ (duplicate_rsids), fully_genotyped_snps )
83
+ duplicate_rsids = Set (qced . snpid[nonunique (qced , [" snpid" ])])
84
+ biallelic = filter (:snpid => ∉ (duplicate_rsids), qced )
74
85
75
86
# Keep only actual SNPs and not other kinds of variants
76
87
actual_snps = subset (biallelic, :allele1 => ByRow (issnp), :allele2 => ByRow (issnp))
77
88
78
89
# All batches pass QC
79
90
batch_cols = [x for x in names (actual_snps) if occursin (" Batch" , x)]
80
- batches_ok = filter (row -> all_batches_ok (row, batch_cols), actual_snps)
81
-
82
- # Assayed in both genotyping arrays
83
- final = filter (:array => == (2 ), batches_ok)
91
+ final = filter (row -> all_batches_ok (row, batch_cols), actual_snps)
84
92
85
93
rsids = Set (final. snpid)
86
94
sample_ids = Set (CSV. read (parsed_args[" traits" ], DataFrame, select= [" SAMPLE_ID" ], types= String)[! , 1 ])
0 commit comments