Skip to content

Commit 4bf29a6

Browse files
committed
Format extracted VCF fields from the SnpEff annotation
1 parent 93dda40 commit 4bf29a6

File tree

3 files changed

+41
-7
lines changed

3 files changed

+41
-7
lines changed

config/config.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,26 @@ VC:
2020
IVAR_QUALITY: 20
2121
IVAR_FREQ: 0.05
2222
IVAR_DEPTH: 30
23+
ANNOTATION:
24+
SNPEFF_COLS:
25+
CHROM: CHROM
26+
POS: POS
27+
REF: REF
28+
ALT: ALT
29+
EFFECT: "ANN[*].EFFECT"
30+
IMPACT: "ANN[*].IMPACT"
31+
BIOTYPE: "ANN[*].BIOTYPE"
32+
GENE: "ANN[*].GENE"
33+
GENEID: "ANN[*].GENEID"
34+
FEATURE: "ANN[*].FEATURE"
35+
FEATUREID: "ANN[*].FEATUREID"
36+
HGVS_P: "ANN[*].HGVS_P"
37+
HGVS_C: "ANN[*].HGVS_C"
38+
FILTER_INCLUDE: # see: https://pcingola.github.io/SnpEff/adds/VCFannotationformat_v1.0.pdf
39+
# IMPACT: [HIGH, MODERATE, LOW]
40+
FILTER_EXCLUDE:
41+
# EFFECT: [upstream_gene_variant, downstream_gene_variant]
42+
VARIANT_NAME_PATTERN: "{GENE}:{coalesce(HGVS_P, HGVS_C)}" # coalesce finds the first non-missing element
2343
DEMIX:
2444
PATHOGEN: "SARS-CoV-2"
2545
MIN_QUALITY: 20

workflow/rules/vaf.smk

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,8 @@ rule extract_vcf_fields:
151151
threads: 1
152152
conda: "../envs/snpeff.yaml"
153153
params:
154-
extract_columns = [
155-
"CHROM", "POS", "REF", "ALT",
156-
'"ANN[*].EFFECT"', '"ANN[*].IMPACT"', '"ANN[*].BIOTYPE"',
157-
'"ANN[*].GENE"', '"ANN[*].GENEID"', '"ANN[*].FEATURE"', '"ANN[*].FEATUREID"', '"ANN[*].HGVS_P"', '"ANN[*].HGVS_C"'
158-
],
159-
sep = ","
154+
extract_columns = [f"'{col}'" for col in config["ANNOTATION"]["SNPEFF_COLS"].values()],
155+
sep = ",",
160156
input:
161157
vcf = OUTDIR/"vaf"/"{sample}.annotated.vcf"
162158
output:
@@ -170,7 +166,10 @@ rule extract_vcf_fields:
170166
rule format_vcf_fields_longer:
171167
conda: "../envs/renv.yaml"
172168
params:
173-
sep = ","
169+
colnames_mapping = config["ANNOTATION"]["SNPEFF_COLS"],
170+
# Forward both filters: the R script reads params$filter_include and
# params$filter_exclude; an omitted param is NULL there and silently disables that filter.
filter_include = config["ANNOTATION"]["FILTER_INCLUDE"],
filter_exclude = config["ANNOTATION"]["FILTER_EXCLUDE"],
171+
variant_name_pattern = config["ANNOTATION"]["VARIANT_NAME_PATTERN"],
172+
sep = ",",
174173
input:
175174
tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.tsv"
176175
output:

workflow/scripts/format_vcf_fields_longer.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,19 @@ library(tidyverse)
99

1010
# Reshape the extracted VCF fields table: split multi-valued annotation columns
# into one row per value, rename columns, apply the include/exclude filters,
# drop duplicate rows, add a VARIANT_NAME column and write the result as TSV.
read_tsv(snakemake@input$tsv) %>%
1111
separate_rows(contains("[*]"), sep = snakemake@params$sep) %>% # split every column whose name contains "[*]" on `sep`
12+
rename(all_of(unlist(snakemake@params$colnames_mapping))) %>% # mapping names become the new column names, values are the current ones
13+
filter(
14+
!!!map2( # one condition per filter_include entry: keep rows whose column value is in the listed set
15+
names(snakemake@params$filter_include),
16+
snakemake@params$filter_include,
17+
~ expr(.data[[!!.x]] %in% !!.y)
18+
),
19+
!!!map2( # one condition per filter_exclude entry: drop rows whose column value is in the listed set
20+
names(snakemake@params$filter_exclude), # NOTE(review): if this param is absent or NULL, map2() yields no conditions and nothing is excluded
21+
snakemake@params$filter_exclude,
22+
~ expr(!(.data[[!!.x]] %in% !!.y))
23+
)
24+
) %>%
25+
distinct() %>% # remove duplicated rows
26+
mutate(VARIANT_NAME = str_glue(snakemake@params$variant_name_pattern)) %>% # pattern is a glue template evaluated against the renamed columns
1227
write_tsv(snakemake@output$tsv)

0 commit comments

Comments
 (0)