Skip to content

Commit ccdc068

Browse files
committed
Get variant name for plots from SnpEff annotation
1 parent ab82680 commit ccdc068

File tree

10 files changed

+64
-66
lines changed

10 files changed

+64
-66
lines changed

workflow/rules/distances.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ rule weighted_distances:
55
samples = expand("{sample}", sample = iter_samples()),
66
mask_class = ["mask"]
77
input:
8-
tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
8+
tsv = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
99
vcf = lambda wildcards: select_problematic_vcf(),
1010
ancestor = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
1111
reference = OUTDIR/"reference.fasta"

workflow/rules/report.smk

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ rule demix_plot:
3030
rule heatmap_plot_data:
3131
conda: "../envs/renv.yaml"
3232
input:
33-
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
33+
vcf = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
3434
metadata = config["METADATA"]
3535
output:
3636
table = report(REPORT_DIR_TABLES/"heatmap.csv")
@@ -46,7 +46,7 @@ rule window:
4646
window = config["WINDOW"]["WIDTH"],
4747
step = config["WINDOW"]["STEP"]
4848
input:
49-
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
49+
vcf = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
5050
gb = OUTDIR/"reference.gb",
5151
features = config["FEATURES_JSON"]
5252
output:
@@ -105,7 +105,7 @@ rule general_NV_description:
105105
coordinates = config["COORDINATES_JSON"],
106106
regions = config["PLOT_GENOME_REGIONS"],
107107
window = OUTDIR/f"{OUTPUT_NAME}.window.csv",
108-
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
108+
vcf = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
109109
metadata = config["METADATA"]
110110
output:
111111
fig = report(REPORT_DIR_PLOTS/"figure_5a.png"),
@@ -230,7 +230,7 @@ rule evo_plots:
230230
design = config["PLOTS"]
231231
input:
232232
N_S = OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv",
233-
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
233+
vcf = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
234234
metadata = config["METADATA"]
235235
output:
236236
plot = report(REPORT_DIR_PLOTS/"figure_11.png"),
@@ -249,7 +249,7 @@ rule snp_plots:
249249
cor_method = config["COR"]["METHOD"],
250250
cor_exact = config["COR"]["EXACT"]
251251
input:
252-
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
252+
vcf = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
253253
metadata = config["METADATA"]
254254
output:
255255
pseudovolcano = report(REPORT_DIR_PLOTS/"figure_6.png"),

workflow/rules/vaf.smk

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -193,26 +193,26 @@ rule compile_vcf_fields_longer:
193193
"../scripts/compile_vcf_fields_longer.R"
194194

195195

196-
rule vcf_to_tsv:
196+
rule merge_annotation:
197197
threads: 1
198198
conda: "../envs/renv.yaml"
199199
input:
200-
ann_vcf = OUTDIR/"vaf"/"{sample}.annotated.vcf",
201-
pre_tsv = OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv"
200+
tsv = OUTDIR/"vaf"/"{sample}.masked.tsv",
201+
annot = OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv",
202202
output:
203-
tsv = OUTDIR/"vaf"/"{sample}.masked.filtered.tsv"
203+
tsv = OUTDIR/"vaf"/"{sample}.variants.tsv"
204204
log:
205-
LOGDIR / "vcf_to_tsv" / "{sample}.log.txt"
205+
LOGDIR / "merge_annotation" / "{sample}.log.txt"
206206
script:
207-
"../scripts/vcf_to_tsv.R"
207+
"../scripts/merge_annotation.R"
208208

209209

210210
rule compile_variants:
211211
threads: 1
212212
conda: "../envs/renv.yaml"
213-
input: expand(OUTDIR/"vaf"/"{sample}.masked.filtered.tsv", sample=iter_samples())
213+
input: expand(OUTDIR/"vaf"/"{sample}.variants.tsv", sample=iter_samples())
214214
output:
215-
tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv"
215+
tsv = OUTDIR/f"{OUTPUT_NAME}.variants.tsv"
216216
log:
217217
LOGDIR / "compile_variants" / "log.txt"
218218
script:

workflow/scripts/compile_variants.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ tables <- lapply(
1414
snakemake@input,
1515
function(path) {
1616
read_tsv(path) %>%
17-
mutate(SAMPLE = sub('\\.masked.filtered.tsv$', '', basename(path)))
17+
mutate(SAMPLE = sub('\\.variants.tsv$', '', basename(path)))
1818
}
1919
)
2020

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env Rscript
2+
3+
# Write stdout and stderr to log file
4+
log <- file(snakemake@log[[1]], open = "wt")
5+
sink(log, type = "message")
6+
sink(log, type = "output")
7+
8+
library(tidyverse)
9+
library(logger)
10+
log_threshold(INFO)
11+
12+
log_info("Reading tables")
13+
variants <- read_tsv(snakemake@input$tsv)
14+
annotation <- read_tsv(
15+
snakemake@input$annot,
16+
col_select = c("CHROM", "POS", "REF", "ALT", "VARIANT_NAME")
17+
)
18+
19+
log_info("Merging tables")
20+
merged <- left_join(
21+
variants,
22+
annotation,
23+
by = c("CHROM", "POS", "REF", "ALT")
24+
)
25+
26+
log_info("Saving results")
27+
write_tsv(
28+
tsv,
29+
snakemake@output$tsv
30+
)

workflow/scripts/report/NV_description.R

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ date_order <- metadata %>%
4040
empty_vcf <- tibble(
4141
SAMPLE = date_order,
4242
REGION = as.character(NA),
43-
variant = as.character(NA),
43+
VARIANT_NAME = as.character(NA),
4444
ALT_FREQ = as.numeric(NA),
4545
GB_FEATURE = as.character(NA),
4646
synonimous = as.character(NA),
@@ -55,7 +55,7 @@ vcf <- vcf %>%
5555
dplyr::select(
5656
SAMPLE,
5757
REGION,
58-
variant,
58+
VARIANT_NAME,
5959
ALT_FREQ,
6060
GB_FEATURE,
6161
synonimous,
@@ -414,14 +414,14 @@ vcf %>%
414414
select(
415415
SAMPLE,
416416
POS,
417-
variant,
417+
VARIANT_NAME,
418418
ALT_FREQ,
419419
NV_class,
420420
group
421421
) %>%
422422
rename(
423423
sample = SAMPLE,
424-
Variant = variant,
424+
Variant = VARIANT_NAME,
425425
Class = group
426426
) %>%
427427
filter(ALT_FREQ > 0) %>%
@@ -464,11 +464,11 @@ vcf_snp %>%
464464
# Stats for reporting
465465
n_indels <- vcf %>%
466466
filter(NV_class == "INDEL") %>%
467-
pull(variant) %>%
467+
pull(VARIANT_NAME) %>%
468468
unique() %>%
469469
length()
470470

471-
n_snv <- length(unique(vcf$variant)) - n_indels
471+
n_snv <- length(unique(vcf$VARIANT_NAME)) - n_indels
472472
model <- lm(n ~ CollectionDate, data = figur_SNP_table)
473473

474474
# Calculate correlation, if possible

workflow/scripts/report/evo_plots.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ N_S_position <- read_delim(snakemake@input[["N_S"]])
2222
vcf <- vcf %>%
2323
dplyr::select(
2424
SAMPLE,
25-
variant,
25+
VARIANT_NAME,
2626
REGION,
2727
ALT_FREQ,
2828
GB_FEATURE,

workflow/scripts/report/heatmap.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ date_order <- read_csv(snakemake@input[["metadata"]]) %>%
1919

2020
# Create SNP variable and select useful variables from vcf
2121
vcf <- vcf %>%
22-
dplyr::select(variant, SAMPLE, ALT_FREQ)
22+
dplyr::select(VARIANT_NAME, SAMPLE, ALT_FREQ)
2323

2424
vcf <- vcf %>%
2525
pivot_wider(
26-
names_from = variant,
26+
names_from = VARIANT_NAME,
2727
values_from = ALT_FREQ,
2828
values_fill = 0,
2929
values_fn = sum

workflow/scripts/report/snp_plots.R

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ date_order <- metadata %>%
4343
# Simplify features names and create SNP variable
4444
vcf <- vcf %>%
4545
dplyr::select(
46-
variant,
46+
VARIANT_NAME,
4747
SAMPLE,
4848
REGION,
4949
ALT_FREQ,
@@ -52,7 +52,7 @@ vcf <- vcf %>%
5252

5353
# Fill positions without alt frequency with 0
5454
vcf <- vcf %>%
55-
complete(nesting(variant, POS), SAMPLE, fill = list(ALT_FREQ = 0))
55+
complete(nesting(VARIANT_NAME, POS), SAMPLE, fill = list(ALT_FREQ = 0))
5656

5757
# Join variants file and metadata file
5858
vcf <- left_join(vcf, data, by = c("SAMPLE" = "ID"))
@@ -72,7 +72,7 @@ vcf <- arrange(
7272
# Get list with all different polymorphisms
7373
SNPs <- pull(
7474
vcf,
75-
variant
75+
VARIANT_NAME
7676
) %>%
7777
unique()
7878

@@ -89,7 +89,7 @@ cor.df.fill <- lapply(
8989
function(snp) {
9090
df <- filter(
9191
vcf,
92-
variant == snp
92+
VARIANT_NAME == snp
9393
)
9494

9595
test <- cor.test(
@@ -161,14 +161,14 @@ sign <- filter(
161161
# SNPs which are in positions with more than one alternative allele
162162
dup <- vcf %>%
163163
select(
164-
variant,
164+
VARIANT_NAME,
165165
POS
166166
) %>%
167167
unique() %>%
168168
group_by(POS) %>%
169169
filter(n() > 1) %>%
170170
ungroup() %>%
171-
pull(variant) %>%
171+
pull(VARIANT_NAME) %>%
172172
unique()
173173

174174
subset <- c(sign, dup) %>%
@@ -179,12 +179,12 @@ plot.height <- ceiling(length(subset) / 4) * 42
179179

180180
log_info("Plotting SNPs trends in time")
181181
panel <- vcf %>%
182-
filter(variant %in% subset) %>%
182+
filter(VARIANT_NAME %in% subset) %>%
183183
ggplot() +
184184
aes(
185185
x = interval,
186186
y = ALT_FREQ,
187-
color = variant
187+
color = VARIANT_NAME
188188
) +
189189
scale_color_manual(values = sample(color, length(subset))) +
190190
geom_point() +
@@ -234,11 +234,11 @@ cor.df.fill %>%
234234

235235
log_info("Saving SNPs trends table")
236236
vcf %>%
237-
filter(variant %in% subset) %>%
237+
filter(VARIANT_NAME %in% subset) %>%
238238
transmute(
239239
sample = SAMPLE,
240240
POS = POS,
241-
NV = variant,
241+
NV = VARIANT_NAME,
242242
ALT_FREQ = ALT_FREQ,
243243
DaysSinceFirst = interval
244244
) %>%

workflow/scripts/vcf_to_tsv.R

Lines changed: 0 additions & 32 deletions
This file was deleted.

0 commit comments

Comments
 (0)