Skip to content

Commit 74a4239

Browse files
make targene specific outputs (#194)
* make targene specific outputs * add missing test dep * fix outputs to remove SAMPLE_IDS * add svp into TargeneCore * add MKL * fix missing test dep * add missing cmd function map * fix test * add new pvalue section per component
1 parent 50a79ad commit 74a4239

17 files changed

+1307
-189
lines changed

Manifest.toml

Lines changed: 97 additions & 74 deletions
Large diffs are not rendered by default.

Project.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
1515
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
1616
HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
1717
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
18+
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
19+
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
20+
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
1821
PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
1922
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2023
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -34,10 +37,12 @@ CairoMakie = "0.12"
3437
CategoricalArrays = "0.10"
3538
Combinatorics = "1.0"
3639
DataFrames = "1.2"
40+
OrderedCollections = "1.6.3"
3741
PackageCompiler = "2.1.17"
3842
SnpArrays = "0.3"
3943
StableRNGs = "1.0.1"
4044
Statistics = "1.10"
4145
TMLE = "0.17"
46+
MKL = "0.7"
4247
YAML = "0.4.9"
4348
julia = "1.10, 1"

src/TargeneCore.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
module TargeneCore
22

3+
4+
if occursin("Intel", Sys.cpu_info()[1].model)
5+
using MKL
6+
end
7+
38
using ArgParse
49
using DataFrames
510
using CSV
@@ -18,6 +23,8 @@ using JLD2
1823
using CairoMakie
1924
using TMLECLI
2025
using HypothesisTests
26+
using OrderedCollections
27+
using Mmap
2128

2229
###############################################################################
2330
### INCLUDES ###
@@ -26,10 +33,11 @@ using HypothesisTests
2633
include("utils.jl")
2734
include("confounders.jl")
2835
include("dataset.jl")
29-
include("plots.jl")
36+
include("outputs.jl")
3037
include("inputs_from_estimands.jl")
3138
include("inputs_from_config.jl")
3239
include("estimation_inputs.jl")
40+
include("sieve_variance.jl")
3341
include("cli.jl")
3442

3543
###############################################################################
@@ -38,7 +46,7 @@ include("cli.jl")
3846

3947
export estimation_inputs
4048
export make_dataset
41-
export summary_plots
49+
export make_outputs
4250
export filter_chromosome, merge_beds, adapt_flashpca
4351

4452
export get_outcome, get_treatments, get_outcome_extra_covariates, get_confounders, get_all_confounders

src/cli.jl

Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
function cli_settings()
22
s = ArgParseSettings(description="TargeneCore CLI.")
33

4-
@add_arg_table s begin
4+
@add_arg_table! s begin
55
"estimation-inputs"
66
action = :command
77
help = "Generate estimation inputs"
@@ -10,7 +10,7 @@ function cli_settings()
1010
action = :command
1111
help = "Generate a dataset."
1212

13-
"summary-plots"
13+
"make-outputs"
1414
action = :command
1515
help = "Generate summary plots."
1616

@@ -25,9 +25,13 @@ function cli_settings()
2525
"adapt-flashpca"
2626
action = :command
2727
help = "Adapts FlashPCA output."
28+
29+
"svp"
30+
action = :command
31+
help = "Run Sieve Variance Plateau."
2832
end
2933

30-
@add_arg_table s["estimation-inputs"] begin
34+
@add_arg_table! s["estimation-inputs"] begin
3135
"config-file"
3236
arg_type = String
3337
help = "Configuration file (.yaml)."
@@ -72,7 +76,7 @@ function cli_settings()
7276
default = 0
7377
end
7478

75-
@add_arg_table s["make-dataset"] begin
79+
@add_arg_table! s["make-dataset"] begin
7680
"--genotypes-prefix"
7781
help = "Prefix path to BGEN/BED chromosomes."
7882
required = true
@@ -108,12 +112,12 @@ function cli_settings()
108112
default = 0
109113
end
110114

111-
@add_arg_table s["summary-plots"] begin
112-
"results-file"
113-
help = "Path to the results file."
115+
@add_arg_table! s["make-outputs"] begin
116+
"input-prefix"
117+
help = "Input prefix to the results file."
114118
required = true
115119

116-
"--outprefix"
120+
"--output-prefix"
117121
help = "Prefix to output plots."
118122
default = "."
119123

@@ -123,7 +127,7 @@ function cli_settings()
123127
default = 0
124128
end
125129

126-
@add_arg_table s["filter-chromosome"] begin
130+
@add_arg_table! s["filter-chromosome"] begin
127131
"input"
128132
help = "Prefix to bed files."
129133

@@ -154,22 +158,58 @@ function cli_settings()
154158
required = false
155159
end
156160

157-
@add_arg_table s["merge-beds"] begin
161+
@add_arg_table! s["merge-beds"] begin
158162
"input-prefix"
159163
help = "Prefix to bed files."
160164

161165
"output"
162166
help = "Output file."
163167
end
164168

165-
@add_arg_table s["adapt-flashpca"] begin
169+
@add_arg_table! s["adapt-flashpca"] begin
166170
"input"
167171
help = "Input file."
168172

169173
"output"
170174
help = "Output file."
171175
end
172176

177+
@add_arg_table! s["svp"] begin
178+
"input-prefix"
179+
arg_type = String
180+
help = "Input prefix to HDF5 files generated by the tmle CLI."
181+
182+
"--out"
183+
arg_type = String
184+
help = "Output filename."
185+
default = "svp.hdf5"
186+
187+
"--grm-prefix"
188+
arg_type = String
189+
help = "Prefix to the aggregated GRM."
190+
default = "GRM"
191+
192+
"--verbosity"
193+
arg_type = Int
194+
default = 0
195+
help = "Verbosity level"
196+
197+
"--n-estimators"
198+
arg_type = Int
199+
default = 10
200+
help = "Number of variance estimators to build for each estimate."
201+
202+
"--max-tau"
203+
arg_type = Float64
204+
default = 0.8
205+
help = "Maximum distance between any two individuals."
206+
207+
"--estimator-key"
208+
arg_type = String
209+
help = "Estimator to use to proceed with sieve variance correction."
210+
default = "1"
211+
end
212+
173213
return s
174214
end
175215

@@ -199,10 +239,10 @@ function julia_main()::Cint
199239
call_threshold=cmd_settings["call-threshold"],
200240
verbosity=cmd_settings["verbosity"]
201241
)
202-
elseif cmd == "summary-plots"
203-
summary_plots(
204-
cmd_settings["results-file"],
205-
outprefix=cmd_settings["outprefix"],
242+
elseif cmd == "make-outputs"
243+
make_outputs(
244+
cmd_settings["input-prefix"];
245+
output_prefix=cmd_settings["output-prefix"],
206246
verbosity=cmd_settings["verbosity"],
207247
)
208248
elseif cmd == "filter-chromosome"
@@ -218,6 +258,15 @@ function julia_main()::Cint
218258
adapt_flashpca(
219259
cmd_settings["input"], cmd_settings["output"]
220260
)
261+
elseif cmd =="svp"
262+
sieve_variance_plateau(cmd_settings["input-prefix"];
263+
out=cmd_settings["out"],
264+
grm_prefix=cmd_settings["grm-prefix"],
265+
verbosity=cmd_settings["verbosity"],
266+
n_estimators=cmd_settings["n-estimators"],
267+
max_tau=cmd_settings["max-tau"],
268+
estimator_key=cmd_settings["estimator-key"]
269+
)
221270
else
222271
throw(ArgumentError(string("No function matching command:", cmd)))
223272
end

src/outputs.jl

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
function add_pvalue_columns!(results)
2+
for colname names(results)
3+
results[!, Symbol(colname, :_PVALUE)] = [pvalue_or_nan(Ψ̂) for Ψ̂ results[!, colname]]
4+
end
5+
end
6+
7+
TMLE.estimate(Ψ̂::TMLECLI.FailedEstimate) = "Failed"
8+
9+
get_treatment_changes(Ψ) = Dict(treatment_name => join(treatment_values, " => ") for (treatment_name, treatment_values) Ψ.treatment_values)
10+
get_treatment_changes::TMLE.JointEstimand) = [get_treatment_changes(Ψᵢ) for Ψᵢ Ψ.args]
11+
12+
get_estimand_type(Ψ) = replace(string(typeof(Ψ)), "TMLE.Statistical" => "")
13+
get_estimand_type::TMLE.JointEstimand) = get_estimand_type(first.args))
14+
15+
log10_uniform_quantiles(n) = -log10.(collect(LinRange(0., 1., n + 1))[2:end])
16+
17+
log10_beta_quantiles(n, alpha) = -log10.([quantile(Beta(k, n + 1 k), alpha) for k in 1:n])
18+
19+
function qqplot(output, results)
20+
fig = Figure()
21+
ax = Axis(fig[1, 1],
22+
title="Q-Q Plot",
23+
xlabel = "-log₁₀(Uniform)",
24+
ylabel = "-log₁₀(Observed)"
25+
)
26+
# QQ plots
27+
pvalue_cols = [colname for colname names(results) if endswith(colname, "PVALUE")]
28+
for pvalue_col pvalue_cols
29+
pvalues = -log10.(filter(x -> !isnan(x), results[!, pvalue_col]))
30+
n = length(pvalues)
31+
unif_quantiles = log10_uniform_quantiles(n)
32+
qqplot!(ax, unif_quantiles, pvalues, qqline=:identity, label=replace(string(pvalue_col), "_PVALUE" => ""))
33+
end
34+
# Confidence bands (from: https://lbelzile.github.io/lineaRmodels/qqplot.html)
35+
n = size(results, 1)
36+
unif_quantiles = log10_uniform_quantiles(n)
37+
ub = log10_beta_quantiles(n, 0.025)
38+
lb = log10_beta_quantiles(n, 0.975)
39+
band!(ax, unif_quantiles, lb, ub, color=(:grey, 0.6))
40+
# Legend
41+
axislegend(position=:lt)
42+
# Save figure
43+
CairoMakie.save(output, fig)
44+
return fig
45+
end
46+
47+
summary_statistics(Ψ̂) = OrderedDict(
48+
:PVALUE => pvalue_or_nan(Ψ̂),
49+
:EFFECT_SIZE => TMLE.estimate(Ψ̂)
50+
)
51+
52+
function summary_statistics(Ψ̂::TMLE.JointEstimate)
53+
return OrderedDict(
54+
:PVALUE => pvalue_or_nan(Ψ̂),
55+
:COMPONENTS => [summary_statistics(Ψ̂ᵢ) for Ψ̂ᵢ Ψ̂.estimates]
56+
)
57+
end
58+
59+
function save_summary_yaml(output, results)
60+
estimator_names = Symbol.(filter(x -> !endswith(x, "PVALUE"), names(results)))
61+
estimator_1 = first(estimator_names)
62+
summary_dicts = []
63+
for row in eachrow(results)
64+
Ψ = row[estimator_1].estimand
65+
estimand_results = OrderedDict(
66+
:EFFECT_TYPE => get_estimand_type(Ψ),
67+
:OUTCOME => get_outcome(Ψ),
68+
:TREATMENTS => get_treatment_changes(Ψ),
69+
)
70+
for estimator_name in estimator_names
71+
estimand_results[estimator_name] = summary_statistics(row[estimator_name])
72+
end
73+
push!(summary_dicts, estimand_results)
74+
end
75+
YAML.write_file(output, summary_dicts)
76+
end
77+
78+
function load_results_as_df(input_prefix)
79+
# Load from all files
80+
input_files = files_matching_prefix(input_prefix)
81+
results = DataFrame(TMLECLI.read_results_from_files(input_files))
82+
# Drop potential SAMPLE_IDS columns
83+
hasproperty(results, :SAMPLE_IDS) && select!(results, Not(:SAMPLE_IDS))
84+
# Add p-values columns for each estimator
85+
add_pvalue_columns!(results)
86+
return results
87+
end
88+
89+
function make_outputs(input_prefix;
90+
output_prefix="final",
91+
verbosity=1
92+
)
93+
# Load results
94+
verbosity > 0 && @info("Loading results from files.")
95+
results = load_results_as_df(input_prefix)
96+
# Write full results
97+
verbosity > 0 && @info("Saving aggregated results.")
98+
jldsave(make_filepath_from_prefix(output_prefix; filename="results.hdf5"); results)
99+
# Write Summary YAML file
100+
verbosity > 0 && @info("Saving summary YAML file.")
101+
save_summary_yaml(make_filepath_from_prefix(output_prefix; filename="results.summary.yaml"), results)
102+
# Save QQ plot
103+
verbosity > 0 && @info("Making QQ plot.")
104+
qqplot(make_filepath_from_prefix(output_prefix; filename="QQ.png"), results)
105+
106+
verbosity > 0 && @info("Done.")
107+
return 0
108+
end

src/plots.jl

Lines changed: 0 additions & 50 deletions
This file was deleted.

0 commit comments

Comments
 (0)