Skip to content

Commit 4917f21

Browse files
committed
Extract NV panel regions from GenBank file
Removes the need of a JSON file with coordinates, but is flexible enough to allow its usage
1 parent de1e170 commit 4917f21

File tree

6 files changed

+66
-32
lines changed

6 files changed

+66
-32
lines changed

.test/targets.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,5 @@ PLOT_GENOME_REGIONS:
4040
"config/nsp_annotation.csv"
4141
REPORT_QMD:
4242
"template.qmd"
43-
COORDINATES_JSON:
44-
"config/sarscov2_coordinates.json"
4543
GENETIC_CODE_JSON:
4644
"config/standard_genetic_code.json"

config/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ All of the following variables are pre-defined in [config.yaml](/config/config.y
130130

131131
- `ALIGNMENT_REFERENCE`: NCBI accession number of the reference record for sequence alignment.
132132
- `PROBLEMATIC_VCF`: URL or path of a VCF file containing problematic genome positions for masking.
133-
- `COORDINATES_JSON`: path of a JSON file containing genome feature coordinates for data visualization.
134133
- `GENETIC_CODE_JSON`: path of a JSON file containing a genetic code for gene translation.
135134
- `TREE_MODEL`: substitution model used by IQTREE (see [docs](http://www.iqtree.org/doc/Substitution-Models)).
136135
- `UFBOOT` & `SHALRT`: settings for ultrafast bootstrap (see [UFBoot](https://doi.org/10.1093/molbev/msx281))

config/config.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ ALIGNMENT_REFERENCE:
22
"NC_045512.2"
33
PROBLEMATIC_VCF:
44
"https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
5-
COORDINATES_JSON:
6-
"config/sarscov2_coordinates.json"
75
GENETIC_CODE_JSON:
86
"config/standard_genetic_code.json"
97
TREE_MODEL:

config/sarscov2_coordinates.json

Lines changed: 0 additions & 23 deletions
This file was deleted.

workflow/rules/report.smk

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,20 @@ rule diversity_plot:
9595
"../scripts/report/diversity_plot.R"
9696

9797

98+
rule extract_genbank_regions:
99+
conda: "../envs/biopython.yaml"
100+
params:
101+
gb_qualifier = "gene",
102+
input:
103+
gb = OUTDIR/"reference.cds.gb",
104+
output:
105+
regions = temp(REPORT_DIR_TABLES/"genbank_regions.json"),
106+
log:
107+
LOGDIR / "extract_genbank_regions" / "log.txt"
108+
script:
109+
"../scripts/report/extract_genbank_regions.py"
110+
111+
98112
rule general_NV_description:
99113
conda: "../envs/renv.yaml"
100114
params:
@@ -104,15 +118,15 @@ rule general_NV_description:
104118
step = config["WINDOW"]["STEP"],
105119
max_alt_freq = 1.0 - config["VC"]["IVAR_FREQ"]
106120
input:
107-
coordinates = config["COORDINATES_JSON"],
121+
coordinates = REPORT_DIR_TABLES/"genbank_regions.json",
108122
regions = config["PLOT_GENOME_REGIONS"],
109123
window = OUTDIR/f"{OUTPUT_NAME}.window.csv",
110124
vcf = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
111125
metadata = config["METADATA"]
112126
output:
113-
fig = report(REPORT_DIR_PLOTS/"figure_5a.png"),
114-
fig_s = report(REPORT_DIR_PLOTS/"figure_5b.png"),
115-
fig_cor = report(REPORT_DIR_PLOTS/"figure_4.png"),
127+
fig = report(REPORT_DIR_PLOTS/"figure_5a.png"), # panel
128+
fig_s = report(REPORT_DIR_PLOTS/"figure_5b.png"), # panel spike
129+
fig_cor = report(REPORT_DIR_PLOTS/"figure_4.png"), # SNPs vs time
116130
json = temp(OUTDIR/"summary_nv.json"),
117131
table_1 = report(REPORT_DIR_TABLES/"figure_5a.csv"),
118132
table_2 = report(REPORT_DIR_TABLES/"figure_5b.csv"),
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python3
2+
3+
import logging
4+
import json
5+
6+
from Bio import SeqIO
7+
from Bio.SeqFeature import ExactPosition
8+
9+
10+
if __name__ == "__main__":
11+
logging.basicConfig(
12+
filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
13+
level=logging.INFO
14+
)
15+
16+
logging.info("Reading GenBank file")
17+
gb = SeqIO.read(snakemake.input.gb, format="gb")
18+
19+
logging.info("Calculating CDS regions")
20+
regions = {}
21+
for feature in gb.features:
22+
identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier, []))
23+
if identifier == "":
24+
logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}'")
25+
elif identifier in regions:
26+
logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}")
27+
else:
28+
logging.debug(f"Adding feature")
29+
if type(feature.location.start) is not ExactPosition or type(feature.location.start) is not ExactPosition:
30+
logging.warning(f"Feature '{identifier}' location is not exact but will be treated as such: {feature.location}")
31+
regions[identifier] = (
32+
int(feature.location.start.real + 1),
33+
int(feature.location.end.real)
34+
)
35+
36+
logging.info("Calculating intergenic regions")
37+
cds_names = tuple(regions.keys())
38+
intergenic_count = 0
39+
for i in range(1, len(cds_names)):
40+
start1, end1 = regions[cds_names[i-1]]
41+
start2, end2 = regions[cds_names[i]]
42+
if (start2 - end1) > 1:
43+
intergenic_count += 1
44+
regions[f"Intergenic_{intergenic_count}"] = (end1 + 1, start2 - 1)
45+
46+
logging.info("Writing regions to JSON file")
47+
with open(snakemake.output.regions, "w") as fw:
48+
json.dump(regions, fw, indent=2)

0 commit comments

Comments
 (0)