Skip to content

Commit 219d313

Browse files
Remove_asint (#183)
* remove genotype_asint
* add Arrow writing instead of CSV
1 parent 214122d commit 219d313

14 files changed

+219
-330
lines changed

Manifest.toml

Lines changed: 182 additions & 115 deletions
Large diffs are not rendered by default.

Project.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ version = "0.1.0"
55

66
[deps]
77
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
8+
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
89
BGEN = "6db4b851-9beb-4b83-9d64-eb1cfb37721d"
910
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
1011
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -23,5 +24,5 @@ CategoricalArrays = "0.10"
2324
Combinatorics = "1.0"
2425
DataFrames = "1.2"
2526
SnpArrays = "0.3"
26-
YAML = "0.4"
2727
TMLE = "0.11"
28+
YAML = "0.4"

bin/tmle_inputs.jl

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,6 @@ function parse_commandline()
8282
required = false
8383
help = "Interaction orders to be estimated"
8484
default = "1,2"
85-
86-
"--genotypes-as-int"
87-
action = :store_true
88-
help = "If true, genotypes are encoded as the number of minor alleles."
8985
end
9086

9187
@add_arg_table s["from-param-file"] begin

src/TargeneCore.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ using Mmap
88
using YAML
99
using Combinatorics
1010
using TMLE
11+
using Arrow
1112

1213

1314
###############################################################################

src/tmle_inputs/from_actors.jl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@ function tmle_inputs_from_actors(parsed_args)
194194
orders = TargeneCore.parse_orders(parsed_args["from-actors"]["orders"])
195195
extraW = TargeneCore.read_txt_file(parsed_args["from-actors"]["extra-confounders"])
196196
extraC = TargeneCore.read_txt_file(parsed_args["from-actors"]["extra-covariates"])
197-
genotypes_asint = parsed_args["from-actors"]["genotypes-as-int"]
198197

199198
# Retrieve SNPs and environmental treatments
200199
bqtls, transactors, extraT = TargeneCore.treatments_from_actors(
@@ -204,7 +203,7 @@ function tmle_inputs_from_actors(parsed_args)
204203
)
205204
# Genotypes and final dataset
206205
variants = TargeneCore.all_variants(bqtls, transactors)
207-
genotypes = TargeneCore.call_genotypes(bgen_prefix, variants, call_threshold; asint=genotypes_asint)
206+
genotypes = TargeneCore.call_genotypes(bgen_prefix, variants, call_threshold)
208207
data = TargeneCore.merge(traits, pcs, genotypes)
209208

210209
# Parameter files
@@ -215,5 +214,5 @@ function tmle_inputs_from_actors(parsed_args)
215214
)
216215

217216
# write data
218-
CSV.write(string(outprefix, ".data.csv"), data)
217+
Arrow.write(string(outprefix, ".data.arrow"), data)
219218
end

src/tmle_inputs/from_param_files.jl

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,33 +26,6 @@ end
2626
check_genotypes_encoding(val::T, type) where T =
2727
T <: type || throw(MismatchedCaseControlEncodingError())
2828

29-
get_genotype_encoding(val::NamedTuple) = typeof(val.case) <: Real ? Real : String
30-
get_genotype_encoding(val::T) where {T<:String} = String
31-
get_genotype_encoding(val::T) where {T<:Real} = Real
32-
get_genotype_encoding(val) = throw(ArgumentError(string("Genotype value(s):", val, " not allowed")))
33-
34-
"""
35-
We enforce that all genotypes are encoded in the same way.
36-
That is either with an integer representing the number of minor alleles
37-
or the string representation.
38-
"""
39-
function get_genotype_encoding(parameters, variants::Set{Symbol})
40-
genotypes_encoding = nothing
41-
for Ψ in parameters
42-
for (T, val) in zip(keys(Ψ.treatment), Ψ.treatment)
43-
if T in variants
44-
if genotypes_encoding === nothing
45-
genotypes_encoding = get_genotype_encoding(val)
46-
else
47-
check_genotypes_encoding(val, genotypes_encoding)
48-
end
49-
end
50-
end
51-
end
52-
return genotypes_encoding <: Real
53-
end
54-
55-
5629

5730
function get_variables(parameters, traits, pcs)
5831
variants = Set{Symbol}()
@@ -75,9 +48,6 @@ end
7548
fix_mismatch(variant, allele::T, actual_alleles) where T =
7649
throw(ArgumentError(string("Can't deal with ", variant, "'s allele ", allele, " of type: ", T)))
7750

78-
fix_mismatch(variant, allele::Real, actual_alleles) =
79-
allele ∈ actual_alleles || throw(AbsentAlleleError(variant, allele))
80-
8151
function fix_mismatch(variant, allele::String, actual_alleles)
8252
length(allele) == 2 || throw(NotSNPAndWontFixError(variant, allele))
8353
rev_allele = reverse(allele)
@@ -203,12 +173,10 @@ function tmle_inputs_from_param_files(parsed_args)
203173

204174
# Genotypes and full data
205175
variables = TargeneCore.get_variables(parameters, traits, pcs)
206-
genotypes_asint = TargeneCore.get_genotype_encoding(parameters, variables.variants)
207176
genotypes = TargeneCore.call_genotypes(
208177
bgen_prefix,
209178
Set(string.(variables.variants)),
210-
call_threshold;
211-
asint=genotypes_asint
179+
call_threshold
212180
)
213181
data = TargeneCore.merge(traits, pcs, genotypes)
214182

src/tmle_inputs/tmle_inputs.jl

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ read_data(filepath) = CSV.read(filepath, DataFrame, types=Dict(:SAMPLE_ID => Str
1212

1313
function write_tmle_inputs(outprefix, final_dataset, parameters; batch_size=nothing)
1414
# Write final_dataset
15-
CSV.write(string(outprefix, ".data.csv"), final_dataset)
15+
Arrow.write(string(outprefix, ".data.arrow"), final_dataset)
1616
# Write param_files
1717
if batch_size !== nothing
1818
for (batch_id, batch) in enumerate(Iterators.partition(parameters, batch_size))
@@ -56,22 +56,13 @@ all_snps_called(found_variants::Set{<:AbstractString}, variants::Set{<:AbstractS
5656
variants == found_variants
5757

5858
"""
59-
genotypes_encoding(variant; asint=true)
59+
genotypes_encoding(variant)
6060
61-
If asint is true then the number of minor alleles is reported, otherwise string genotypes are reported.
61+
String genotypes are reported.
6262
"""
63-
function genotypes_encoding(variant; asint=true)
64-
minor = minor_allele(variant)
63+
function genotypes_encoding(variant)
6564
all₁, all₂ = alleles(variant)
66-
if asint
67-
if all₁ == minor
68-
return [2, 1, 0]
69-
else
70-
return [0, 1, 2]
71-
end
72-
else
73-
return [all₁*all₁, all₁*all₂, all₂*all₂]
74-
end
65+
return [all₁*all₁, all₁*all₂, all₂*all₂]
7566
end
7667

7768
NotAllVariantsFoundError(found_snps, snp_list) =
@@ -83,7 +74,7 @@ NotBiAllelicOrUnphasedVariantError(rsid) = ArgumentError(string("Variant: ", rsi
8374
8475
This function assumes the UK-Biobank structure
8576
"""
86-
function call_genotypes(bgen_prefix::String, variants::Set{<:AbstractString}, threshold::Real; asint=true)
77+
function call_genotypes(bgen_prefix::String, variants::Set{<:AbstractString}, threshold::Real)
8778
chr_dir_, prefix_ = splitdir(bgen_prefix)
8879
chr_dir = chr_dir_ == "" ? "." : chr_dir_
8980
genotypes = nothing
@@ -102,7 +93,7 @@ function call_genotypes(bgen_prefix::String, variants::Set{<:AbstractString}, th
10293
continue
10394
end
10495
minor_allele_dosage!(bgenfile, variant)
105-
variant_genotypes = genotypes_encoding(variant; asint=asint)
96+
variant_genotypes = genotypes_encoding(variant)
10697
probabilities = probabilities!(bgenfile, variant)
10798
size(probabilities, 1) != 3 && throw(NotBiAllelicOrUnphasedVariantError(rsid_))
10899
chr_genotypes[!, rsid_] = call_genotypes(probabilities, variant_genotypes, threshold)

test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
[deps]
2+
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
23
BGEN = "6db4b851-9beb-4b83-9d64-eb1cfb37721d"
34
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
45
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
Parameters:
22
- type: IATE
33
target: "BINARY_1"
4-
treatment: (RSID_2 = (case = 1, control = 0), TREAT_1 = (case = 1, control = 0))
4+
treatment: (RSID_2 = (case = "AA", control = "GG"), TREAT_1 = (case = 1, control = 0))
55
confounders: []
66
- type: ATE
77
target: "CONTINUOUS_2"
8-
treatment: (RSID_2 = (case = 1, control = 0),)
8+
treatment: (RSID_2 = (case = "AA", control = "GG"),)
99
confounders: [22001]
1010
covariates: [COV_1, 21003]
1111
- type: CM
1212
target: "*"
13-
treatment: (RSID_2 = 1, )
13+
treatment: (RSID_2 = "AA", )
1414
confounders: [22001]
1515
covariates: [COV_1, 21003]
File renamed without changes.

0 commit comments

Comments
 (0)