Skip to content

Commit fb690a5

Browse files
committed
temp: consider filtered sites before imputing zero alt frequency
1 parent bd911f1 commit fb690a5

File tree

3 files changed

+34
-9
lines changed

3 files changed

+34
-9
lines changed

workflow/rules/report.smk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ rule af_time_correlation_data:
347347
input:
348348
variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
349349
metadata = config["METADATA"],
350+
sites = OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv",
350351
output:
351352
fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"),
352353
correlations = report(REPORT_DIR_TABLES/"af_time_correlation.csv"),

workflow/rules/vaf.smk

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -252,23 +252,31 @@ rule filter_mpileup_all_sites:
252252
input:
253253
OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.query.tsv",
254254
output:
255-
OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.filtered.tsv",
255+
temp(OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.filtered_sites.tsv"),
256256
log:
257257
LOGDIR / "filter_mpileup_all_sites" / "{sample}.txt"
258258
run:
259259
import pandas as pd
260260
df = pd.read_csv(input[0], sep="\t")
261-
df["ref_AD"] = df.AD.str.split(",").apply(lambda values: int(values[0]))
262-
df["total_AD"] = df.AD.str.split(",").apply(lambda values: sum(int(n) for n in values))
263-
df["total_ADF"] = df.ADF.str.split(",").apply(lambda values: sum(int(n) for n in values))
264-
df["total_ADR"] = df.ADR.str.split(",").apply(lambda values: sum(int(n) for n in values))
261+
df["SAMPLE"] = wildcards.sample
262+
df["REF_AD"] = df.AD.str.split(",").apply(lambda values: int(values[0]))
263+
df["TOTAL_AD"] = df.AD.str.split(",").apply(lambda values: sum(int(n) for n in values))
264+
df["TOTAL_ADF"] = df.ADF.str.split(",").apply(lambda values: sum(int(n) for n in values))
265+
df["TOTAL_ADR"] = df.ADR.str.split(",").apply(lambda values: sum(int(n) for n in values))
265266
df[
266-
(df.total_AD >= params.min_total_AD) &
267-
(df.total_ADF >= params.min_total_ADF) &
268-
(df.total_ADR >= params.min_total_ADR)
267+
(df.TOTAL_AD >= params.min_total_AD) &
268+
(df.TOTAL_ADF >= params.min_total_ADF) &
269+
(df.TOTAL_ADR >= params.min_total_ADR)
269270
].to_csv(output[0], sep="\t", index=False)
270271

271272

273+
use rule concat_vcf_fields as merge_filtered_mpileup_all_sites with:
274+
input:
275+
expand(OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.filtered_sites.tsv", sample=iter_samples()),
276+
output:
277+
OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv",
278+
279+
272280
rule pairwise_trajectory_correlation:
273281
conda: "../envs/renv.yaml"
274282
input:

workflow/scripts/report/af_time_correlation_data.R

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ library(logger)
1313
log_threshold(INFO)
1414

1515
# Read data
16+
log_info("Reading filtered sites")
17+
sites <- read_tsv(
18+
snakemake@input[["sites"]],
19+
col_select = c("SAMPLE", "POS")
20+
) %>%
21+
mutate(SELECTED_SITE = TRUE)
22+
1623
log_info("Reading variants table")
1724
variants <- read_delim(
1825
snakemake@input[["variants"]],
@@ -24,11 +31,20 @@ variants <- read_delim(
2431
"POS"
2532
)
2633
) %>%
34+
# Annotate sites selected after filter
35+
left_join(sites, by = c("SAMPLE", "POS")) %>%
36+
replace_na(list(SELECTED_SITE = FALSE)) %>%
2737
# Fill positions without alt frequency with 0
38+
# TODO: add a flag to choose behavior
2839
complete(
2940
nesting(REGION, VARIANT_NAME, POS),
3041
SAMPLE,
31-
fill = list(ALT_FREQ = 0)
42+
fill = list(ALT_FREQ = NA)
43+
) %>%
44+
mutate(
45+
ALT_FREQ = case_when(
46+
is.na(ALT_FREQ) & SELECTED_POS ~ 0
47+
)
3248
)
3349

3450
log_info("Reading metadata")

0 commit comments

Comments
 (0)