Skip to content

Commit 3c97af8

Browse files
committed
Filter out errors in SnpEff
Also replaces empty values ("" in config YAML) with NA in R and adds code comments
1 parent 087e845 commit 3c97af8

File tree

2 files changed

+29
-4
lines changed

2 files changed

+29
-4
lines changed

config/config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ ANNOTATION:
3535
FEATUREID: "ANN[*].FEATUREID"
3636
HGVS_P: "ANN[*].HGVS_P"
3737
HGVS_C: "ANN[*].HGVS_C"
38+
ERRORS: "ANN[*].ERRORS"
3839
FILTER_INCLUDE: # see: https://pcingola.github.io/SnpEff/adds/VCFannotationformat_v1.0.pdf
40+
ERRORS: [""]
3941
# IMPACT: [HIGH, MODERATE, LOW]
4042
FILTER_EXCLUDE:
4143
EFFECT: [upstream_gene_variant, downstream_gene_variant]

workflow/scripts/format_vcf_fields_longer.R

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,45 @@ sink(log, type = "output")
77

88
library(tidyverse)
99

10+
# Replace "" values with NA in the filter lists so that they match empty table fields, which are converted to NA below
11+
# Snakemake passes filters like: list(ERRORS = c(""))
12+
# Convert empty-string entries of a filter vector to NA; all other entries
# are returned unchanged.
empty.to.na <- function(x) {
  is_blank <- x == ""
  replace(x, is_blank, NA)
}
16+
filter.include <- lapply(snakemake@params$filter_include, empty.to.na)
17+
filter.exclude <- lapply(snakemake@params$filter_exclude, empty.to.na)
18+
19+
# Process input table
1020
read_tsv(snakemake@input$tsv) %>%
21+
# Separate <sep>-delimited "...[*]..." columns (e.g. ANN[*].EFFECT)
1122
separate_longer_delim(contains("[*]"), delim = snakemake@params$sep) %>%
23+
# Replace empty ("") fields with NA
1224
mutate(across(contains("[*]"), ~ na_if(., ""))) %>%
25+
# Rename "...[*]..." columns using the provided lookup via Snakemake config
1326
rename(all_of(unlist(snakemake@params$colnames_mapping))) %>%
27+
# Apply dynamic filters from the Snakemake config:
28+
# map2 pairs column names (.x) with value vectors (.y) and builds boolean expressions.
29+
# Inside the expr call, !! injects the current column name (.x) and its value vector (.y) into each expression.
30+
# The resulting list of expressions is spliced with !!! so each becomes its
31+
# own condition as if written directly inside the filter call.
1432
filter(
33+
# Keep only variants whose value in each listed field is among the allowed values (e.g. empty ERRORS, represented as NA)
1534
!!!map2(
16-
names(snakemake@params$filter_include),
17-
snakemake@params$filter_include,
35+
names(filter.include),
36+
filter.include,
1837
~ expr(.data[[!!.x]] %in% !!.y)
1938
),
39+
# Keep only variants whose value in each listed field is not among the excluded values (e.g. drop EFFECT == "upstream_gene_variant")
2040
!!!map2(
21-
names(snakemake@params$filter_exclude),
22-
snakemake@params$filter_exclude,
41+
names(filter.exclude),
42+
filter.exclude,
2343
~ expr(!(.data[[!!.x]] %in% !!.y))
2444
)
2545
) %>%
46+
# Keep unique rows
2647
distinct() %>%
48+
# Assign variant name using the pattern defined via Snakemake config
2749
mutate(VARIANT_NAME = str_glue(snakemake@params$variant_name_pattern)) %>%
50+
# Write output file
2851
write_tsv(snakemake@output$tsv)

0 commit comments

Comments
 (0)