Skip to content

Commit d8d9ed8

Browse files
committed
Add an option to use different correlation methods
1 parent 69ca5bf commit d8d9ed8

File tree

6 files changed

+33
-8
lines changed

6 files changed

+33
-8
lines changed

.test/targets.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ DEMIX:
3333
COV_CUTOFF: 0
3434
MIN_ABUNDANCE: 0.0001
3535
USE_BIONJ: false
36+
COR:
37+
METHOD: "pearson"
38+
EXACT: null
3639
PLOT_GENOME_REGIONS:
3740
"config/nsp_annotation.csv"
3841
REPORT_QMD:

config/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ All of the following variables are pre-defined in [config.yaml](/config/config.y
140140
- `ACCESSION_COLUMN`: name of the column that contains GISAID EPI identifiers in the input target metadata.
141141
- `DIVERSITY_REPS`: number of random sample subsets of the context dataset for the nucleotide diversity comparison.
142142
- `USE_BIONJ`: use the BIONJ algorithm ([Gascuel, 1997](https://doi.org/10.1093/oxfordjournals.molbev.a025808)) instead of NJ (neighbor-joining; [Saitou & Nei, 1987](https://doi.org/10.1093/oxfordjournals.molbev.a040454)) to reconstruct phylogenetic trees from pairwise distances.
143+
- `COR`: configuration for correlation analyses of allele frequency data over time and between variants. This parameter controls how correlation tests are performed using R's `cor.test` and `cor` functions (see [R documentation](https://search.r-project.org/CRAN/refmans/correlation/html/cor_test.html)).
144+
- `METHOD`: correlation method to use. Valid options are "pearson" (default), "kendall", or "spearman".
145+
- `EXACT`: boolean flag indicating whether to compute an exact p-value when possible. This option applies only to certain methods and may be set to `null` (default) to let R decide automatically.
143146
- `LOG_PY_FMT`: logging format string for Python scripts.
144147
- `PLOTS`: path of the R script that sets the design and style of data visualizations.
145148
- `PLOT_GENOME_REGIONS`: path of a CSV file containing genome regions, e.g. SARS-CoV-2 non-structural protein (NSP) coordinates, for data visualization.

config/config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ GISAID:
3333
DIVERSITY_REPS:
3434
1000
3535
USE_BIONJ: false
36+
COR:
37+
METHOD: "pearson"
38+
EXACT: null
3639
LOG_PY_FMT:
3740
"%(asctime)s - %(name)s - %(levelname)-8s - %(message)s"
3841
PLOTS:

template.qmd

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ params:
2323
min_ivar_freq: ""
2424
workflow_version: ""
2525
use_bionj: ""
26+
cor_method: ""
2627
div: ""
2728
freyja: ""
2829
tree: ""
@@ -81,8 +82,17 @@ n.samples <- table %>% pull(Sample) %>% unique() %>% length()
8182
vcf <- read_csv(params$heat_tab)
8283
row.names(vcf) <- vcf$`...1`
8384
vcf <- vcf[-1]
84-
cor.mat <- cor(vcf)
85+
cor.mat <- cor(vcf, method = params$cor_method)
8586
cor.mat[is.na(cor.mat)] <- 0
87+
if (params$cor_method == "pearson") {
88+
cor.method.name <- "Pearson's"
89+
} else if (params$cor_method == "spearman") {
90+
cor.method.name <- "Spearman's"
91+
} else if (params$cor_method == "kendall") {
92+
cor.method.name <- "Kendall's"
93+
} else {
94+
cor.method.name <- "Undetermined"
95+
}
8696
8797
# Distance tree
8898
if (params$use_bionj) {
@@ -217,7 +227,7 @@ at the bottom, and the latest, at the top.](`r params$SNV_s`){#fig-SNV_s}
217227
The correlation of the allele frequency of each NV with the time since the
218228
initial sampling has been calculated (@fig-volcano).
219229

220-
![Pearson’s correlation coefficients and adjusted p-values of
230+
![`r cor.method.name` correlation coefficients and adjusted p-values of
221231
allele frequencies with time. Red dashed line indicates adjusted $p = 0.05$.
222232
Labeled dots represent nucleotide variants correlated with time
223233
(adjusted $p < 0.05$).](`r params$volcano`){#fig-volcano}
@@ -253,7 +263,7 @@ zooming in on specific regions.
253263

254264
```{r heatmap, echo = F, message = F, fig.align = 'center'}
255265
#| label: fig-heatmap
256-
#| fig-cap: "Interactive hierarchically clustered heatmap of the pairwise Pearson’s correlation coefficients between the time series of allele frequencies in the case study."
266+
#| fig-cap: "Interactive hierarchically clustered heatmap of the pairwise correlation coefficients between the time series of allele frequencies in the case study."
257267
258268
heatmaply_cor(
259269
cor.mat,

workflow/rules/report.smk

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,9 @@ rule evo_plots:
140140
rule snp_plots:
141141
conda: "../envs/renv.yaml"
142142
params:
143-
design = config["PLOTS"]
143+
design = config["PLOTS"],
144+
cor_method = config["COR"]["METHOD"],
145+
cor_exact = config["COR"]["EXACT"]
144146
input:
145147
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
146148
metadata = config["METADATA"]
@@ -196,7 +198,8 @@ rule report:
196198
ufboot_reps = config["UFBOOT_REPS"],
197199
shalrt_reps = config["SHALRT_REPS"],
198200
name = config["OUTPUT_NAME"],
199-
use_bionj = config["USE_BIONJ"]
201+
use_bionj = config["USE_BIONJ"],
202+
cor_method = config["COR"]["METHOD"]
200203
output:
201204
html = report(OUTDIR/f"{OUTPUT_NAME}.report.html")
202205
log:
@@ -211,6 +214,7 @@ rule report:
211214
min_ivar_freq='{params.min_ivar_freq}',\
212215
workflow_version='{params.workflow_version}',\
213216
use_bionj='{params.use_bionj}',\
217+
cor_method='{params.cor_method}',\
214218
div='{input.diversity}',\
215219
freyja ='{input.freyja}',\
216220
tree = '{input.tree}',\

workflow/scripts/report/snp_plots.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ cor.df.fill <- lapply(
9393

9494
test <- cor.test(
9595
df$ALT_FREQ,
96-
df$interval
96+
df$interval,
97+
method = snakemake@params$cor_method,
98+
exact = snakemake@params$cor_exact
9799
)
98100

99101
pvalue <- p.adjust(
@@ -174,7 +176,7 @@ subset <- c(sign, dup) %>%
174176
# Plot height depending on the number of SNPs assuming 4 columns in the plot
175177
plot.height <- ceiling(length(subset) / 4) * 42
176178

177-
log_info("PLotting SNPs trends in time")
179+
log_info("Plotting SNPs trends in time")
178180
panel <- vcf %>%
179181
filter(variant %in% subset) %>%
180182
ggplot() +
@@ -221,7 +223,7 @@ log_info("Saving coorelation table")
221223
cor.df.fill %>%
222224
transmute(
223225
NV = snp,
224-
PearsonCor = cor,
226+
correlation_coef = cor,
225227
adj_pvalue = p.value
226228
) %>%
227229
write.csv(

0 commit comments

Comments
 (0)