Skip to content

Commit 26f57a1

Browse files
authored
feat: get cds overlap given chromosome, start, + stop (#217)
* Add class (`FeatureOverlap`) for getting cds overlap * Only supports GRCh38 input * Returns VRS Sequence Locations * `get_mane_summary` --> `get_mane` * Now can additionally download MANE RefSeq GFF file
1 parent e502616 commit 26f57a1

File tree

9 files changed

+787
-23
lines changed

9 files changed

+787
-23
lines changed

Pipfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ pyliftover = "*"
1111
pandas = "*"
1212
hgvs = "*"
1313
"biocommons.seqrepo" = "*"
14-
pydantic = "*"
14+
pydantic = "==1.*"
1515
fastapi = "*"
1616
uvicorn = "*"
17-
gene-normalizer = ">=0.1.34, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8"
17+
gene-normalizer = "==0.1.*"
1818
"ga4gh.vrs" = "*"
19+
"ga4gh.vrsatile.pydantic" = "==0.0.*"
1920

2021
[dev-packages]
2122
cool_seq_tool = {editable = true, path = "."}

cool_seq_tool/data/data_downloads.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,34 +23,38 @@ def __init__(self) -> None:
2323
"""Initialize downloadable data locations."""
2424
self._data_dir = APP_ROOT / "data"
2525

26-
def get_mane(self, is_summary: bool = True) -> Path:
    """Identify latest MANE summary or refseq gff data. If unavailable locally,
    download from source.

    The file list of the NCBI MANE ``current`` release directory is always
    queried (to learn the latest file name); the file itself is only downloaded
    and decompressed when no decompressed copy exists in the data directory.

    :param is_summary: `True` if getting summary data. `False` if getting refseq
        gff data.
    :raise Exception: If no file with the expected suffix is listed on the
        FTP server
    :return: path to the decompressed MANE summary file or MANE refseq gff file
    """
    fn_end = ".summary.txt.gz" if is_summary else ".refseq_genomic.gff.gz"

    with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
        ftp.login()
        ftp.cwd("/refseq/MANE/MANE_human/current")
        files = ftp.nlst()

        mane_file = next((f for f in files if f.endswith(fn_end)), None)
        if mane_file is None:
            raise Exception(f"Unable to find MANE {fn_end[1:]} data")

        # Local target is the remote name without the trailing ".gz"
        self._mane_path = self._data_dir / mane_file[:-3]
        mane_data_path = self._data_dir / mane_file

        if not self._mane_path.exists():
            logger.info(f"Downloading {mane_file}...")
            try:
                with open(mane_data_path, "wb") as fp:
                    ftp.retrbinary(f"RETR {mane_file}", fp.write)
                with gzip.open(mane_data_path, "rb") as f_in:
                    with open(self._mane_path, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
            except Exception:
                # A truncated output would pass the `exists()` check on the
                # next call and be used as if it were a complete file.
                if self._mane_path.exists():
                    remove(self._mane_path)
                raise
            finally:
                # The compressed download is only a temporary artifact.
                if mane_data_path.exists():
                    remove(mane_data_path)
            logger.info(f"{mane_file} download complete.")

    return self._mane_path
5458

5559
def get_lrg_refseq_gene_data(self) -> Path:
5660
"""Identify latest LRG RefSeq Gene file. If unavailable locally, download from

cool_seq_tool/data_sources/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@
66
from .gene_normalizer import GeneNormalizer
77
from .mane_transcript import MANETranscript
88
from .alignment_mapper import AlignmentMapper
9+
from .feature_overlap import FeatureOverlap
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
"""Module for getting feature (gene/exon) overlap"""
2+
import re
3+
from pathlib import Path
4+
from typing import Dict, Optional
5+
6+
import pandas as pd
7+
from ga4gh.core import ga4gh_identify
8+
from ga4gh.vrs import models
9+
10+
from cool_seq_tool.data_sources import SeqRepoAccess
11+
from cool_seq_tool.paths import MANE_REFSEQ_GFF_PATH
12+
from cool_seq_tool.schemas import Assembly, CdsOverlap, ResidueMode
13+
14+
15+
# Pattern for chromosome: X, Y, or 1..22 (no 'chr' prefix).
# The alternatives are wrapped in a non-capturing group so the pattern can be
# embedded between anchors (e.g. f"^{CHR_PATTERN}$") without the anchors binding
# to only the first/last alternative. The inner capturing group is kept so
# group numbering is unchanged for existing users of the pattern.
CHR_PATTERN = r"(?:X|Y|([1-9]|1[0-9]|2[0-2]))"


class FeatureOverlapError(Exception):
    """Custom exception for the Feature Overlap class.

    Raised for invalid or missing input and for identifiers that cannot be
    resolved while computing feature overlap.
    """
21+
22+
23+
class FeatureOverlap:
    """The class for getting feature overlap.

    The MANE RefSeq GFF file is parsed once, at construction time, into
    ``self.df``; every query filters that DataFrame in memory.
    """

    def __init__(
        self,
        seqrepo_access: SeqRepoAccess,
        mane_refseq_gff_path: Path = MANE_REFSEQ_GFF_PATH,
    ) -> None:
        """Initialize the FeatureOverlap class. Will load RefSeq data and store as df.

        :param seqrepo_access: Client for accessing SeqRepo data
        :param mane_refseq_gff_path: Path to the MANE RefSeq GFF file
        """
        self.seqrepo_access = seqrepo_access
        self.mane_refseq_gff_path = mane_refseq_gff_path
        self.df = self._load_mane_refseq_gff_data()

    def _load_mane_refseq_gff_data(self) -> pd.core.frame.DataFrame:
        """Load MANE RefSeq GFF data file into DataFrame.

        :return: DataFrame containing MANE RefSeq GFF data for CDS. Columns include
            `type`, `chromosome` (chromosome without 'chr' prefix), `cds_start`,
            `cds_stop`, `info_name` (name of record), and `gene`. `cds_start` and
            `cds_stop` use inter-residue coordinates.
        """
        df = pd.read_csv(
            self.mane_refseq_gff_path,
            sep="\t",
            header=None,
            # NOTE(review): presumably the leading GFF pragma/comment lines;
            # confirm the count is stable across MANE releases.
            skiprows=9,
            usecols=[0, 2, 3, 4, 8],
        )
        df.columns = ["chromosome", "type", "cds_start", "cds_stop", "info"]

        # Restrict to only feature of interest: CDS (which has gene info)
        df = df[df["type"] == "CDS"].copy()

        # Get name from the info field
        df["info_name"] = df["info"].apply(
            lambda info: re.findall(r"Name=([^;]+)", info)[0]
        )

        # Get gene from the info field
        df["gene"] = df["info"].apply(
            lambda info: re.findall(r"gene=([^;]+)", info)[0]
        )

        # Get chromosome names without prefix and without suffix for alternate
        # transcripts. The prefix is removed with an anchored substitution:
        # `str.strip("chr")` would strip any of the characters 'c', 'h', 'r'
        # from BOTH ends of the value rather than the literal prefix.
        df["chromosome"] = df["chromosome"].apply(
            lambda chromosome: re.sub(r"^chr", "", chromosome).split("_")[0]
        )
        df["chromosome"] = df["chromosome"].astype(str)

        # Convert start and stop to ints
        df["cds_start"] = df["cds_start"].astype(int)
        df["cds_stop"] = df["cds_stop"].astype(int)

        # Convert to inter-residue coordinates
        df["cds_start"] -= 1

        # Only retain certain columns
        df = df[["type", "chromosome", "cds_start", "cds_stop", "info_name", "gene"]]

        return df

    def _get_chr_from_alt_ac(self, identifier: str) -> str:
        """Get chromosome given genomic identifier

        :param identifier: Genomic identifier on GRCh38 assembly
        :raises FeatureOverlapError: If unable to find associated GRCh38 chromosome
        :return: Chromosome. 1..22, X, Y. No 'chr' prefix.
        """
        aliases, error_msg = self.seqrepo_access.translate_identifier(
            identifier, Assembly.GRCH38.value
        )

        if error_msg:
            raise FeatureOverlapError(str(error_msg))

        if not aliases:
            raise FeatureOverlapError(
                f"Unable to find {Assembly.GRCH38.value} aliases for: {identifier}"
            )

        assembly_chr_pattern = rf"^{Assembly.GRCH38.value}:(?P<chromosome>{CHR_PATTERN})$"  # noqa: E501
        # The first alias of the form "<assembly>:<chromosome>" wins
        for alias in aliases:
            chr_match = re.match(assembly_chr_pattern, alias)
            if chr_match:
                return chr_match.group("chromosome")

        raise FeatureOverlapError(
            f"Unable to find {Assembly.GRCH38.value} chromosome for: {identifier}"
        )

    @staticmethod
    def _get_seq_loc(start_pos: int, stop_pos: int, ga4gh_sequence_id: str) -> Dict:
        """Get VRS Sequence Location represented as a dict

        :param start_pos: Start position
        :param stop_pos: Stop position
        :param ga4gh_sequence_id: ga4gh sequence identifier
        :return: VRS Sequence Location represented as dictionary with the ga4gh ID
            included
        """
        _sl = models.SequenceLocation(
            sequence_id=ga4gh_sequence_id,
            interval=models.SequenceInterval(
                start=models.Number(value=start_pos),
                end=models.Number(value=stop_pos),
            ),
        )
        _sl._id = ga4gh_identify(_sl)
        return _sl.as_dict()

    def get_grch38_mane_gene_cds_overlap(
        self,
        start: int,
        end: int,
        chromosome: Optional[str] = None,
        identifier: Optional[str] = None,
        residue_mode: ResidueMode = ResidueMode.RESIDUE,
    ) -> Optional[Dict[str, CdsOverlap]]:
        """Given GRCh38 genomic data, find the overlapping MANE features (gene and cds).
        The genomic data is specified as a sequence location by `chromosome`, `start`,
        `end`. All CDS regions with which the input sequence location has nonzero base
        pair overlap will be returned.

        :param start: GRCh38 start position
        :param end: GRCh38 end position
        :param chromosome: Chromosome. 1..22, X, or Y. If not provided, must provide
            `identifier`. If both `chromosome` and `identifier` are provided,
            `chromosome` will be used.
        :param identifier: Genomic identifier on GRCh38 assembly. If not provided, must
            provide `chromosome`. If both `chromosome` and `identifier` are provided,
            `chromosome` will be used.
        :param residue_mode: Residue mode for `start` and `end`
        :raise FeatureOverlapError: If missing required fields or unable to find
            associated ga4gh identifier
        :return: `None` if no CDS region matches. Otherwise, MANE feature (gene/cds)
            overlap data represented as a dict. The dictionary is keyed by the genes
            which overlap the input sequence location. Each gene maps to a list with
            one entry per overlapping CDS region; each entry holds the CDS region and
            the part of the input sequence location that overlaps it:
            {
                gene: [
                    {
                        'cds': VRS Sequence Location
                        'overlap': VRS Sequence Location
                    }
                ]
            }
        """
        ga4gh_seq_id = None
        if chromosome:
            # `fullmatch` is required here: CHR_PATTERN is an alternation, so
            # wrapping it as f"^{CHR_PATTERN}$" would anchor only the first and
            # last alternatives and accept values such as "X1".
            if not re.fullmatch(CHR_PATTERN, chromosome):
                raise FeatureOverlapError("`chromosome` must be 1, ..., 22, X, or Y")
        elif identifier:
            chromosome = self._get_chr_from_alt_ac(identifier)
            # A ga4gh sequence digest can be reused directly as the sequence id
            if identifier.startswith("ga4gh:SQ."):
                ga4gh_seq_id = identifier
        else:
            raise FeatureOverlapError(
                "Must provide either `chromosome` or `identifier`"
            )

        # Convert residue to inter-residue
        if residue_mode == ResidueMode.RESIDUE:
            start -= 1

        # Get feature dataframe (df uses inter-residue)
        # NOTE(review): the comparisons are inclusive, so a CDS that merely
        # abuts the query (cds_start == end or cds_stop == start) is returned
        # with a zero-length overlap; confirm whether that is intended given
        # the "nonzero base pair overlap" wording above.
        feature_df = self.df[
            (self.df["chromosome"] == chromosome)
            & (self.df["cds_start"] <= end)  # noqa: W503
            & (self.df["cds_stop"] >= start)  # noqa: W503
        ].copy()

        if feature_df.empty:
            return None

        # Add overlap columns: intersection of [start, end] with each CDS
        feature_df["overlap_start"] = feature_df["cds_start"].clip(lower=start)
        feature_df["overlap_stop"] = feature_df["cds_stop"].clip(upper=end)

        # Get ga4gh identifier for chromosome
        if not ga4gh_seq_id:
            grch38_chr = f"{Assembly.GRCH38.value}:{chromosome}"
            ga4gh_aliases, error_msg = self.seqrepo_access.translate_identifier(
                grch38_chr, "ga4gh"
            )

            # Errors should never happen but catching just in case
            if error_msg:
                raise FeatureOverlapError(str(error_msg))
            if not ga4gh_aliases:
                raise FeatureOverlapError(
                    f"Unable to find ga4gh identifier for: {grch38_chr}"
                )

            ga4gh_seq_id = ga4gh_aliases[0]

        resp = {}
        for gene, group in feature_df.groupby("gene"):
            resp[gene] = [
                CdsOverlap(
                    cds=self._get_seq_loc(
                        cds_row.cds_start, cds_row.cds_stop, ga4gh_seq_id
                    ),
                    overlap=self._get_seq_loc(
                        cds_row.overlap_start, cds_row.overlap_stop, ga4gh_seq_id
                    ),
                ).dict(by_alias=True)
                for cds_row in group.itertuples()
            ]

        return resp

cool_seq_tool/paths.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,17 @@
1212

1313
d = DataDownload()
1414

15+
provided_mane_refseq_gff_path = environ.get("MANE_REFSEQ_GFF_PATH")
16+
if provided_mane_refseq_gff_path:
17+
MANE_REFSEQ_GFF_PATH = Path(provided_mane_refseq_gff_path)
18+
else:
19+
MANE_REFSEQ_GFF_PATH = d.get_mane(is_summary=False)
20+
1521
provided_mane_summary_path = environ.get("MANE_SUMMARY_PATH", "")
1622
if provided_mane_summary_path:
1723
MANE_SUMMARY_PATH = Path(provided_mane_summary_path)
1824
else:
19-
MANE_SUMMARY_PATH = d.get_mane_summary()
25+
MANE_SUMMARY_PATH = d.get_mane(is_summary=True)
2026

2127
provided_lrg_refseq_path = environ.get("LRG_REFSEQGENE_PATH", "")
2228
if provided_lrg_refseq_path:

cool_seq_tool/schemas.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import re
55
from typing import Literal, Optional, List, Tuple, Union, Dict, Any, Type
66

7+
from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation
78
from pydantic import BaseModel, root_validator, validator
89
from pydantic.main import Extra
910
from pydantic.types import StrictStr, StrictInt
@@ -616,3 +617,43 @@ def schema_extra(schema: Dict[str, Any],
616617
"url": "https://github.com/GenomicMedLab/cool-seq-tool"
617618
}
618619
}
620+
621+
622+
class CdsOverlap(BaseModelForbidExtra):
    """Model pairing a CDS region with the part of a query that overlaps it.

    Both fields are VRS Sequence Locations.
    """

    cds: SequenceLocation
    overlap: SequenceLocation

    class Config(BaseModelForbidExtra.Config):
        """Configure model."""

        @staticmethod
        def schema_extra(schema: Dict[str, Any], model: Type["CdsOverlap"]) -> None:
            """Configure OpenAPI schema.

            Drops the `title` entries from the schema and from each of its
            properties, then attaches an example payload.
            """

            def _example_location() -> Dict[str, Any]:
                # The cds and overlap examples are the same location; a fresh
                # dict is built per use so the two entries stay independent.
                return {
                    "_id": "ga4gh:VSL._H2ST69A4RkWCSRHOoMv-edt-R45fPdq",
                    "type": "SequenceLocation",
                    "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
                    "interval": {
                        "type": "SequenceInterval",
                        "start": {"value": 140726493, "type": "Number"},
                        "end": {"value": 140726516, "type": "Number"},
                    },
                }

            # `pop` with a default is already a no-op when the key is absent
            schema.pop("title", None)
            for field_schema in schema.get("properties", {}).values():
                field_schema.pop("title", None)

            schema["example"] = {
                "cds": _example_location(),
                "overlap": _example_location(),
            }

cool_seq_tool/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.1.14-dev1"
1+
__version__ = "0.1.14-dev2"

0 commit comments

Comments
 (0)