Skip to content

Commit de1e170

Browse files
committed
Filter GenBank features in a separate rule
1 parent bd15d31 commit de1e170

File tree

3 files changed

+102
-62
lines changed

3 files changed

+102
-62
lines changed

workflow/rules/evolution.smk

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,27 @@
1+
rule filter_genbank_features:
2+
threads: 1
3+
conda: "../envs/biopython.yaml"
4+
params:
5+
included = config.get("GB_FEATURES", {}).get("INCLUDE", {}),
6+
excluded = config.get("GB_FEATURES", {}).get("EXCLUDE", {}),
7+
input:
8+
gb = OUTDIR/"reference.gb",
9+
output:
10+
gb = OUTDIR/"reference.cds.gb",
11+
log:
12+
LOGDIR / "filter_genbank_features" / "log.txt"
13+
script:
14+
"../scripts/filter_genbank_features.py"
15+
16+
117
rule N_S_sites:
218
threads: 1
319
conda: "../envs/biopython.yaml"
420
params:
5-
features = config.get("GB_FEATURES", {}),
621
gb_qualifier_display = "gene",
722
input:
823
fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
9-
gb = OUTDIR/"reference.gb",
24+
gb = OUTDIR/"reference.cds.gb",
1025
genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(),
1126
output:
1227
csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv"),

workflow/scripts/N_S_sites_from_fasta.py

Lines changed: 2 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33
import logging
44
import json
55
import itertools as it
6-
from typing import Dict, Iterable
6+
from typing import Dict
77

88
import pandas as pd
99
from Bio import SeqIO
1010
from Bio.SeqRecord import SeqRecord
11-
from Bio.SeqFeature import SeqFeature
1211
from Bio.Seq import Seq
1312

1413

@@ -69,54 +68,6 @@ def calculate_ns_sites(codons: dict, genetic_code: dict) -> pd.DataFrame:
6968
return pd.DataFrame({"feature": features, "N": N_sites, "S": S_sites})
7069

7170

72-
def iter_features_filtering(features: Iterable[SeqFeature], included: Dict[str, str], excluded: Dict[str, str]) -> Iterable[SeqFeature]:
73-
# No filters
74-
if len(included) == 0 and len(excluded) == 0:
75-
logging.debug("Selecting all features")
76-
return iter(features)
77-
# Only inclusion filter
78-
elif len(included) == 0 and len(excluded) != 0:
79-
logging.debug(f"Selecting features excluding all of {excluded}")
80-
return (
81-
feature for feature in features
82-
if all(
83-
(qualifier_value not in excluded.get(qualifier_key, []))
84-
for qualifier_key in excluded.keys()
85-
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
86-
)
87-
)
88-
# Only exclusion filter
89-
elif len(included) != 0 and len(excluded) == 0:
90-
logging.debug(f"Selecting features including any of {included}")
91-
return (
92-
feature for feature in features
93-
if any(
94-
(qualifier_value in included.get(qualifier_key, []))
95-
for qualifier_key in included.keys()
96-
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
97-
)
98-
)
99-
# Inclusion then exclusion filter
100-
else:
101-
logging.debug(f"Selecting features including any of {included} and then excluding all of {excluded}")
102-
included_features = (
103-
feature for feature in features
104-
if any(
105-
(qualifier_value in included.get(qualifier_key, []))
106-
for qualifier_key in included.keys()
107-
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
108-
)
109-
)
110-
return (
111-
feature for feature in included_features
112-
if all(
113-
(qualifier_value not in excluded.get(qualifier_key, []))
114-
for qualifier_key in excluded.keys()
115-
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
116-
)
117-
)
118-
119-
12071
def main():
12172

12273
logging.basicConfig(
@@ -136,16 +87,7 @@ def main():
13687

13788
logging.info("Extracting CDS")
13889
coding_records = {}
139-
included = snakemake.params.features.get("INCLUDE", {})
140-
excluded = snakemake.params.features.get("EXCLUDE", {})
141-
for feature in iter_features_filtering(gb.features, included, excluded):
142-
logging.debug(
143-
"Processing SeqFeature: "
144-
f"ID={feature.id} type={feature.type} location={feature.location} "
145-
f"gene={feature.qualifiers.get('gene', [])} "
146-
f"locus_tag={feature.qualifiers.get('locus_tag', [])} "
147-
f"product={feature.qualifiers.get('product', [])}"
148-
)
90+
for feature in gb.features:
14991
identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier_display, []))
15092
if identifier == "":
15193
logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}'")
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env python3
2+
3+
import logging
4+
from typing import Dict, Iterable
5+
6+
from Bio import SeqIO
7+
from Bio.SeqFeature import SeqFeature
8+
9+
10+
def iter_features_filtering(features: Iterable[SeqFeature], included: Dict[str, str], excluded: Dict[str, str]) -> Iterable[SeqFeature]:
11+
# No filters
12+
if len(included) == 0 and len(excluded) == 0:
13+
logging.debug("Selecting all features")
14+
return iter(features)
15+
# Only inclusion filter
16+
elif len(included) == 0 and len(excluded) != 0:
17+
logging.debug(f"Selecting features excluding all of {excluded}")
18+
return (
19+
feature for feature in features
20+
if all(
21+
(qualifier_value not in excluded.get(qualifier_key, []))
22+
for qualifier_key in excluded.keys()
23+
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
24+
)
25+
)
26+
# Only exclusion filter
27+
elif len(included) != 0 and len(excluded) == 0:
28+
logging.debug(f"Selecting features including any of {included}")
29+
return (
30+
feature for feature in features
31+
if any(
32+
(qualifier_value in included.get(qualifier_key, []))
33+
for qualifier_key in included.keys()
34+
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
35+
)
36+
)
37+
# Inclusion then exclusion filter
38+
else:
39+
logging.debug(f"Selecting features including any of {included} and then excluding all of {excluded}")
40+
included_features = (
41+
feature for feature in features
42+
if any(
43+
(qualifier_value in included.get(qualifier_key, []))
44+
for qualifier_key in included.keys()
45+
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
46+
)
47+
)
48+
return (
49+
feature for feature in included_features
50+
if all(
51+
(qualifier_value not in excluded.get(qualifier_key, []))
52+
for qualifier_key in excluded.keys()
53+
for qualifier_value in feature.qualifiers.get(qualifier_key, [])
54+
)
55+
)
56+
57+
58+
if __name__ == "__main__":
59+
logging.basicConfig(
60+
filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
61+
level=logging.INFO
62+
)
63+
64+
logging.info("Reading GenBank file")
65+
gb = SeqIO.read(snakemake.input.gb, format="gb")
66+
67+
logging.info("Extracting CDS")
68+
features = []
69+
for feature in iter_features_filtering(gb.features, snakemake.params.included, snakemake.params.excluded):
70+
logging.debug(
71+
"Processing SeqFeature: "
72+
f"ID={feature.id} type={feature.type} location={feature.location} "
73+
f"gene={feature.qualifiers.get('gene', [])} "
74+
f"locus_tag={feature.qualifiers.get('locus_tag', [])} "
75+
f"product={feature.qualifiers.get('product', [])}"
76+
)
77+
features.append(feature)
78+
79+
logging.info("Replacing features")
80+
gb.features = features
81+
82+
logging.info("Writing filtered GenBank files")
83+
SeqIO.write(gb, snakemake.output.gb, "gb")

0 commit comments

Comments
 (0)