Skip to content

Commit 911f473

Browse files
Merge branch 'string-treatments' into gwis
2 parents 5e575f7 + f90e3af commit 911f473

File tree

6 files changed

+93
-18
lines changed

6 files changed

+93
-18
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ dev/
3131
# My personal trash file
3232
sandbox.jl
3333

34+
# MY (Joshua) personal trash file
35+
test/data/scratch/string_encodings.jl
36+
3437
# Jupyter
3538
*checkpoints
3639
src/generate_results.jl

Manifest.toml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -938,9 +938,9 @@ version = "1.0.2"
938938

939939
[[deps.HTTP]]
940940
deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"]
941-
git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed"
941+
git-tree-sha1 = "bc3f416a965ae61968c20d0ad867556367f2817d"
942942
uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
943-
version = "1.10.8"
943+
version = "1.10.9"
944944

945945
[[deps.HarfBuzz_jll]]
946946
deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"]
@@ -1155,9 +1155,9 @@ version = "0.21.4"
11551155

11561156
[[deps.JSON3]]
11571157
deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"]
1158-
git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b"
1158+
git-tree-sha1 = "1d322381ef7b087548321d3f878cb4c9bd8f8f9b"
11591159
uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1160-
version = "1.14.0"
1160+
version = "1.14.1"
11611161
weakdeps = ["ArrowTypes"]
11621162

11631163
[deps.JSON3.extensions]
@@ -1589,6 +1589,12 @@ version = "0.3.4"
15891589
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
15901590
version = "2023.1.10"
15911591

1592+
[[deps.MultipleTesting]]
1593+
deps = ["Distributions", "SpecialFunctions", "StatsBase"]
1594+
git-tree-sha1 = "1e98f8f732e7035c4333135b75605b74f3462b9b"
1595+
uuid = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
1596+
version = "0.6.0"
1597+
15921598
[[deps.NLSolversBase]]
15931599
deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"]
15941600
git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c"

Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,22 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
1313
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
1414
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
1515
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
16+
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
1617
HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
1718
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
1819
LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
20+
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1921
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
2022
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
23+
MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
2124
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
2225
PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
2326
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2427
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
2528
SnpArrays = "4e780e97-f5bf-4111-9dc4-b70aaf691b06"
2629
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
2730
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
31+
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
2832
TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf"
2933
TMLECLI = "2573d147-4098-46ba-9db2-8608d210ccac"
3034
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

src/inputs_from_config.jl

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ end
193193
"""
194194
function treatments_from_variant(variant::String, dataset::DataFrame)
195195
variant_levels = sort(levels(dataset[!, variant], skipmissing=true))
196-
return Dict{Symbol, Vector{UInt8}}(Symbol(variant)=>variant_levels)
196+
return Dict{Symbol, Vector{String}}(Symbol(variant)=>variant_levels)
197197
end
198198

199199
function estimands_from_gwas(dataset, variants, outcomes, confounders;
@@ -244,14 +244,35 @@ function read_bed_chromosome(bedprefix)
244244
return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
245245
end
246246

247+
function map_allele(value, allele1, allele2)
248+
if value == 0x00
249+
return "$allele1$allele1"
250+
elseif value == 0x01
251+
return missing
252+
elseif value == 0x02
253+
return "$allele1$allele2"
254+
elseif value == 0x03
255+
return "$allele2$allele2"
256+
end
257+
end
258+
259+
function convert_string(snpdata)
260+
genotypes_data = []
261+
for col in 1:snpdata.snps
262+
allele_col = snpdata.snparray[:,col]
263+
allele1 = snpdata.snp_info[col, "allele1"]
264+
allele2 = snpdata.snp_info[col, "allele2"]
265+
mapped_col = map(value -> map_allele(value, allele1, allele2), allele_col)
266+
push!(genotypes_data, mapped_col)
267+
end
268+
return DataFrame(genotypes_data, snpdata.snp_info."snpid", makeunique=true)
269+
end
270+
247271
function get_genotypes_from_beds(bedprefix)
248272
snpdata = read_bed_chromosome(bedprefix)
249-
genotypes = DataFrame(convert(Matrix{UInt8}, snpdata.snparray), snpdata.snp_info."snpid")
250-
genotype_map = Union{UInt8, Missing}[0, missing, 1, 2]
251-
for col in names(genotypes)
252-
genotypes[!, col] = [genotype_map[x+1] for x in genotypes[!, col]]
253-
end
273+
genotypes = convert_string(snpdata)
254274
insertcols!(genotypes, 1, :SAMPLE_ID => snpdata.person_info."iid")
275+
255276
return genotypes
256277
end
257278

test/inputs_from_gwas_config.jl

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,27 @@ function get_summary_stats(estimands)
1919
return sort(combine(groupby(results, :OUTCOME), nrow), :OUTCOME)
2020
end
2121

22-
function check_estimands_levels_order(estimands)
22+
function check_estimands_levels_order(estimands, snp_info)
2323
for Ψ in estimands
2424
# If the two components are present, the first is the 0 -> 1 and the second is the 1 -> 2
2525
variant = only(keys.args[1].treatment_values))
26+
variant_info = filter(:snpid=>x->x==String(variant),snp_info)
27+
allele1, allele2 = variant_info.allele1[1], variant_info.allele2[1]
28+
29+
# Here, we check if the order is sufficient to be able to compute non-linear effects any of these combinations will do
2630
if length.args) == 2
27-
@test Ψ.args[1].treatment_values[variant] == (control = 0x00, case = 0x01)
28-
@test Ψ.args[2].treatment_values[variant] == (control = 0x01, case = 0x02)
31+
@test.args[1].treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) &&
32+
Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2)) ||
33+
.args[1].treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) &&
34+
Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1))
2935
else
3036
# Otherwise we check they are one or the other
3137
arg = only.args)
32-
@test arg.treatment_values[variant]==(control = 0x00, case = 0x01) ||
33-
arg.treatment_values[variant]==( control = 0x01, case = 0x02)
38+
@test arg.treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) ||
39+
arg.treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) ||
40+
arg.treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2) ||
41+
arg.treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1)
42+
3443
end
3544
end
3645
end
@@ -48,6 +57,9 @@ end
4857
"--positivity-constraint=0"
4958
])
5059
TargeneCore.julia_main()
60+
# Define SNP information to check string allele defintions
61+
snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
62+
snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
5163
# Check dataset
5264
dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
5365
@test size(dataset) == (1940, 886)
@@ -68,7 +80,7 @@ end
6880
nrow = repeat([875], 5)
6981
)
7082

71-
check_estimands_levels_order(estimands)
83+
check_estimands_levels_order(estimands, snp_info)
7284
end
7385

7486
@testset "Test inputs_from_config gwas: positivity constraint" begin
@@ -85,6 +97,9 @@ end
8597
"--positivity-constraint=0.2"
8698
])
8799
TargeneCore.julia_main()
100+
# Define SNP information to check string allele defintions
101+
snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
102+
snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
88103
# Check dataset
89104
dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
90105
@test size(dataset) == (1940, 886)
@@ -103,7 +118,7 @@ end
103118
nrow = repeat([777], 5)
104119
)
105120

106-
check_estimands_levels_order(estimands)
121+
check_estimands_levels_order(estimands, snp_info)
107122
end
108123

109124
end

test/testutils.jl

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,4 +202,30 @@ function make_estimands_configuration_file(config_generator=make_estimands_confi
202202
filename = joinpath(dir, "configuration.yaml")
203203
TMLE.write_yaml(filename, config)
204204
return filename
205-
end
205+
end
206+
207+
# Additional helper functions for testing GWAS treatment strings
208+
209+
get_only_file_with_suffix(files, suffix) = files[only(findall(x -> endswith(x, suffix), files))]
210+
211+
function files_matching_prefix(prefix)
212+
directory, _prefix = splitdir(prefix)
213+
_directory = directory == "" ? "." : directory
214+
215+
return map(
216+
f -> joinpath(directory, f),
217+
filter(
218+
f -> startswith(f, _prefix),
219+
readdir(_directory)
220+
)
221+
)
222+
end
223+
224+
function read_bed_chromosome(bedprefix)
225+
bed_files = files_matching_prefix(bedprefix)
226+
fam_file = get_only_file_with_suffix(bed_files, "fam")
227+
bim_file = get_only_file_with_suffix(bed_files, "bim")
228+
bed_file = get_only_file_with_suffix(bed_files, "bed")[1:end-4]
229+
return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
230+
end
231+

0 commit comments

Comments
 (0)