Skip to content

Commit 4718cb3

Browse files
authored
Merge pull request #261 from XFF227/dev
Support for SNP queries for Brassica napus; new endpoint for homologous protein queries
2 parents 57d9c4f + bf53773 commit 4718cb3

File tree

10 files changed

+612
-34
lines changed

10 files changed

+612
-34
lines changed

api/models/canola_nssnp.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from typing import Optional
2+
from api import db
3+
# from sqlalchemy.ext.declarative import declarative_base
4+
5+
6+
# CanolaBase = declarative_base()
7+
class CanolaProteinReference(db.Model):
8+
__bind_key__ = "canola_nssnp"
9+
__tablename__ = "protein_reference"
10+
11+
protein_reference_id: db.Mapped[int] = db.mapped_column(db.Integer(), primary_key=True, autoincrement=True)
12+
gene_identifier: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
13+
gene_name: db.Mapped[Optional[str]] = db.mapped_column(db.String(45), nullable=True)
14+
15+
proteinsJoin = db.relationship("CanolaSnpsToProtein", backref="protein", cascade="all, delete-orphan")
16+
17+
18+
class CanolaSnpsToProtein(db.Model):
19+
__bind_key__ = "canola_nssnp"
20+
__tablename__ = "snps_to_protein"
21+
22+
snps_reference_id: db.Mapped[int] = db.mapped_column(db.Integer(), primary_key=True, autoincrement=True)
23+
protein_reference_id: db.Mapped[int] = db.mapped_column(
24+
db.Integer(), db.ForeignKey("protein_reference.protein_reference_id", ondelete="CASCADE"), primary_key=True
25+
)
26+
transcript_pos: db.Mapped[int] = db.mapped_column(db.Integer(), nullable=False)
27+
chromosome: db.Mapped[str] = db.mapped_column(db.String(25), nullable=False)
28+
chromosomal_loci: db.Mapped[int] = db.mapped_column(db.Integer(), nullable=False)
29+
ref_DNA: db.Mapped[str] = db.mapped_column(db.String(1), nullable=False)
30+
alt_DNA: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
31+
aa_pos: db.Mapped[int] = db.mapped_column(db.Integer(), nullable=False)
32+
ref_aa: db.Mapped[str] = db.mapped_column(db.String(3), nullable=False)
33+
alt_aa: db.Mapped[str] = db.mapped_column(db.String(3), nullable=False)
34+
type: db.Mapped[str] = db.mapped_column(db.String(50), nullable=False)
35+
effect_impact: db.Mapped[str] = db.mapped_column(db.String(50), nullable=False)
36+
transcript_biotype: db.Mapped[Optional[str]] = db.mapped_column(db.String(45), nullable=True)
37+
alt_freq: db.Mapped[float] = db.mapped_column(db.Numeric(10, 5), nullable=False)

api/models/homologs_db.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from api import db
2+
3+
4+
class homologs(db.Model):
5+
__bind_key__ = "homologs_db"
6+
__tablename__ = 'homologs'
7+
8+
homologs_id: db.Mapped[int] = db.mapped_column(db.Integer(), primary_key=True, autoincrement=True)
9+
search_protein_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
10+
result_protein_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
11+
search_species_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
12+
result_species_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
13+
Percent_id: db.Mapped[float] = db.mapped_column(db.Numeric(10, 5), nullable=False)
14+
e_score: db.Mapped[str] = db.mapped_column(db.String(10), nullable=False)

api/resources/snps.py

Lines changed: 117 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
SnpsReference as SoybeanSnpsReference,
1818
SamplesLookup as SoybeanSampleNames,
1919
)
20+
from api.models.canola_nssnp import (
21+
CanolaProteinReference as CanolaProteinReference,
22+
CanolaSnpsToProtein as CanolaSnpsToProtein
23+
)
24+
from api.models.homologs_db import homologs as HomologsDB
2025
from api.utils.bar_utils import BARUtils
2126
from flask import request
2227
import re
@@ -141,7 +146,7 @@ class GeneNameAlias(Resource):
141146
def get(self, species="", gene_id=""):
142147
"""Endpoint returns annotated SNP poplar data in order of (to match A th API format):
143148
AA pos (zero-indexed), sample id, 'missense_variant','MODERATE', 'MISSENSE', codon/DNA base change,
144-
AA change (DH), pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype
149+
AA change (DH), pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype (allele frequency for canola)
145150
values with single quotes are fixed"""
146151
results_json = []
147152

@@ -160,43 +165,76 @@ def get(self, species="", gene_id=""):
160165
protein_reference = SoybeanProteinReference
161166
snps_to_protein = SoybeanSnpsToProtein
162167
snps_reference = SoybeanSnpsReference
168+
elif species == "canola" and BARUtils.is_canola_gene_valid(gene_id):
169+
protein_reference = CanolaProteinReference
170+
snps_to_protein = CanolaSnpsToProtein
163171
else:
164172
return BARUtils.error_exit("Invalid gene id"), 400
165173

166-
rows = (
167-
db.session.execute(
168-
db.select(protein_reference, snps_to_protein, snps_reference)
169-
.select_from(protein_reference)
170-
.join(snps_to_protein)
171-
.join(snps_reference)
172-
.where(protein_reference.gene_identifier == gene_id)
174+
if species == "canola" and BARUtils.is_canola_gene_valid(gene_id):
175+
rows = (
176+
db.session.execute(
177+
db.select(protein_reference, snps_to_protein)
178+
.select_from(protein_reference)
179+
.join(snps_to_protein)
180+
.where(protein_reference.gene_identifier == gene_id)
181+
)
182+
.tuples()
183+
.all()
184+
)
185+
for protein, snptoprotein in rows:
186+
itm_lst = [
187+
snptoprotein.chromosome,
188+
snptoprotein.aa_pos - 1, # zero index-ed
189+
None,
190+
"missense_variant",
191+
"MODERATE",
192+
"MISSENSE",
193+
str(snptoprotein.transcript_pos) + snptoprotein.ref_DNA + ">" + snptoprotein.alt_DNA,
194+
snptoprotein.ref_aa + snptoprotein.alt_aa,
195+
None,
196+
gene_id,
197+
"protein_coding",
198+
"CODING",
199+
protein.gene_name,
200+
float(snptoprotein.alt_freq),
201+
]
202+
results_json.append(itm_lst)
203+
else:
204+
rows = (
205+
db.session.execute(
206+
db.select(protein_reference, snps_to_protein, snps_reference)
207+
.select_from(protein_reference)
208+
.join(snps_to_protein)
209+
.join(snps_reference)
210+
.where(protein_reference.gene_identifier == gene_id)
211+
)
212+
.tuples()
213+
.all()
173214
)
174-
.tuples()
175-
.all()
176-
)
177215

178-
# BAR A Th API format is chr, AA pos (zero-indexed), sample id, 'missense_variant',
179-
# 'MODERATE', 'MISSENSE', codon/DNA base change, AA change (DH),
180-
# pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype
181-
for protein, snpsjoin, snpstbl in rows:
182-
itm_lst = [
183-
snpstbl.chromosome,
184-
# snpstbl.chromosomal_loci,
185-
snpsjoin.aa_pos - 1, # zero index-ed
186-
snpstbl.sample_id,
187-
"missense_variant",
188-
"MODERATE",
189-
"MISSENSE",
190-
str(snpsjoin.transcript_pos) + snpsjoin.ref_DNA + ">" + snpsjoin.alt_DNA,
191-
snpsjoin.ref_aa + snpsjoin.alt_aa,
192-
None,
193-
re.sub(r".\d$", "", protein.gene_identifier),
194-
"protein_coding",
195-
"CODING",
196-
protein.gene_identifier,
197-
None,
198-
]
199-
results_json.append(itm_lst)
216+
# BAR A Th API format is chr, AA pos (zero-indexed), sample id, 'missense_variant',
217+
# 'MODERATE', 'MISSENSE', codon/DNA base change, AA change (DH),
218+
# pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype
219+
for protein, snpsjoin, snpstbl in rows:
220+
itm_lst = [
221+
snpstbl.chromosome,
222+
# snpstbl.chromosomal_loci,
223+
snpsjoin.aa_pos - 1, # zero index-ed
224+
snpstbl.sample_id,
225+
"missense_variant",
226+
"MODERATE",
227+
"MISSENSE",
228+
str(snpsjoin.transcript_pos) + snpsjoin.ref_DNA + ">" + snpsjoin.alt_DNA,
229+
snpsjoin.ref_aa + snpsjoin.alt_aa,
230+
None,
231+
re.sub(r".\d$", "", protein.gene_identifier),
232+
"protein_coding",
233+
"CODING",
234+
protein.gene_identifier,
235+
None,
236+
]
237+
results_json.append(itm_lst)
200238

201239
# Return results if there are data
202240
if len(results_json) > 0:
@@ -502,3 +540,49 @@ def get(self, pval="", araid="", popid=""):
502540
pop_both_sig_idx = HotspotUtils.get_sig_index(pop_both_sig)
503541
output = {"ara_id": araid, "pop_id": popid, "ara_hotspots": ara_both_sig_idx, "pop_hotspots": pop_both_sig_idx}
504542
return BARUtils.success_exit(output)
543+
544+
545+
@snps.route("/homologs/<string:search_species>/<string:search_gene>/<string:target_species>")
546+
class Homologs(Resource):
547+
@snps.param("search_species", _in="path", default="canola")
548+
@snps.param("search_gene", _in="path", default="BnaA07g31480D")
549+
@snps.param("target_species", _in="path", default="arabidopsis")
550+
def get(self, search_species="", search_gene="", target_species=""):
551+
"""This endpoint shows the homologs proteins of search_gene in target_species.
552+
The endpoint returns a list of homologous pairs of proteins in following format:
553+
Percent_id(percent identity get by blast); e score
554+
"""
555+
# Escape input
556+
search_species = escape(search_species)
557+
gene_id = escape(search_gene)
558+
target_species = escape(target_species)
559+
supported = ["arabidopsis", "canola"]
560+
if (search_species not in supported) or (target_species not in supported):
561+
return BARUtils.error_exit("Species not supported"), 400
562+
elif (search_species == "arabidopsis" and BARUtils.is_arabidopsis_gene_valid(gene_id)) or (search_species == "canola" and BARUtils.is_canola_gene_valid(gene_id)):
563+
results = HomologsDB.query.filter_by(
564+
search_protein_name=gene_id,
565+
search_species_name=search_species,
566+
result_species_name=target_species
567+
).all()
568+
if not results:
569+
return BARUtils.error_exit("No homologs found for the given query"), 400
570+
571+
homologs_list = [
572+
{
573+
"search_species_name": search_species,
574+
"search_protein_name": gene_id,
575+
"result_species_name": target_species,
576+
"result_protein_name": homolog.result_protein_name,
577+
"Percent_id": float(homolog.Percent_id),
578+
"e_score": float(homolog.e_score)
579+
}
580+
for homolog in results
581+
]
582+
homologs_list.sort(key=lambda x: x["e_score"])
583+
if len(homologs_list) >= 5:
584+
homologs_list = homologs_list[:5]
585+
response = {"homologs": homologs_list}
586+
return BARUtils.success_exit(response), 200
587+
else:
588+
return BARUtils.error_exit("Invalid gene id"), 400

api/utils/bar_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,17 @@ def is_cannabis_gene_valid(gene):
8383
else:
8484
return False
8585

86+
@staticmethod
87+
def is_canola_gene_valid(gene):
88+
"""This function verifies if canola gene (BnaC07g42830D) is valid
89+
:param gene:
90+
:return:
91+
"""
92+
if re.search(r"^Bna[AC]\d{2}g\d{5}[A-D]?$", gene, re.I):
93+
return True
94+
else:
95+
return False
96+
8697
@staticmethod
8798
def is_arachis_gene_valid(gene):
8899
"""This function verifies if arachis gene is valid: Adur10000_comp0_c0_seq1

config/BAR_API.cfg

100644100755
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,8 @@ SQLALCHEMY_BINDS = {
4040
'striga' : 'mysql://root:root@localhost/striga',
4141
'tomato_nssnp' : 'mysql://root:root@localhost/tomato_nssnp',
4242
'tomato_sequence' : 'mysql://root:root@localhost/tomato_sequence',
43-
'triphysaria' : 'mysql://root:root@localhost/triphysaria'
43+
'triphysaria' : 'mysql://root:root@localhost/triphysaria',
44+
'canola_nssnp' : 'mysql://root:root@localhost/canola_nssnp',
45+
'homologs_db' : 'mysql://root:root@localhost/homologs_db'
46+
4447
}

0 commit comments

Comments
 (0)