Skip to content

Commit d559ca0

Browse files
authored
Merge pull request #33 from galaxy-genome-annotation/fix_bulk_gff3_feature_import
begin to refactor gff3
2 parents 7128414 + 193bd46 commit d559ca0

23 files changed

+3859
-128
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ cover
4141
docs/_build
4242

4343
# Python virtualenv
44-
.venv
44+
.venv*
4545

4646
# test harness
4747
test_harness.py

README.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ Or with the Arrow client:
8585
History
8686
-------
8787

88+
- 4.2.3
89+
- Fixed `load_gff3` to load transcripts (including their CDS) and non-coding feature types more accurately.
8890
- 4.2.2
8991
- Drastically speed up load_gff3
9092
- `load_gff3` now uses the Apollo `add_transcript` method if it is a gene or mRNA type

apollo/annotations/__init__.py

Lines changed: 147 additions & 111 deletions
Large diffs are not rendered by default.

apollo/util.py

Lines changed: 147 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,139 @@ def AssertAdmin(user):
8989

9090

9191
def _tnType(feature):
92-
if feature.type in ('gene', 'mRNA', 'exon', 'CDS', 'terminator', 'tRNA'):
92+
if feature.type in ('gene', 'mRNA', 'exon', 'CDS', 'terminator', 'tRNA', 'snRNA', 'snoRNA', 'ncRNA', 'rRNA', 'miRNA', 'repeat_region', 'transposable_element', 'pseudogene', 'transcript'):
9393
return feature.type
9494
else:
9595
return 'exon'
9696

9797

98+
def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False):
    """Build the Apollo payload for a gene-level feature.

    The gene itself is described with ``_yieldSubFeatureData``. Each entry of
    ``gene.sub_features`` whose ``_tnType`` is in ``coding_transcript_types``
    or ``noncoding_transcript_types`` is converted with the matching
    transcript builder; sub-features of any other type are ignored.

    :param gene: feature exposing ``type`` and ``sub_features`` (plus whatever
        ``_yieldSubFeatureData`` reads).
    :param disable_cds_recalculation: forwarded to every builder called here.
    :param use_name: forwarded to every builder called here.
    :return: the list of transcript dicts when ``gene.type == 'gene'`` and at
        least one transcript was built; otherwise the gene dict (which has a
        ``'children'`` list whenever ``gene.sub_features`` is non-empty).
    """
    current = _yieldSubFeatureData(
        gene, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name)

    if gene.sub_features:
        children = []
        for sf in gene.sub_features:
            sf_type = _tnType(sf)
            if sf_type in coding_transcript_types:
                children.append(_yieldCodingTranscriptData(
                    sf, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name))
            elif sf_type in noncoding_transcript_types:
                children.append(_yieldNonCodingTranscriptData(
                    sf, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name))
        current['children'] = children

    # TODO: handle comments, dbxrefs, attributes, aliases, description,
    # GO, Gene Product, Provenance

    if current.get('children') and gene.type == 'gene':
        # Only sending mRNA level as apollo is more comfortable with orphan mRNAs
        return current['children']

    # No transcript children were built (or this is not a plain gene): return
    # the generic gene feature instead of an empty list, so the feature is not
    # silently dropped.
    return current
122+
123+
124+
def _yieldSubFeatureData(f, disable_cds_recalculation=False, use_name=False):
    """Describe a single feature (without its children) as an Apollo dict.

    :param f: feature exposing ``strand``, ``location.start``,
        ``location.end``, ``type``, ``id`` and a ``qualifiers`` mapping
        (presumably a Biopython ``SeqFeature`` parsed by bcbio-gff -- confirm
        against the caller).
    :param disable_cds_recalculation: when truthy, ``'use_cds'`` is set to the
        string ``'true'``.
    :param use_name: when truthy, ``'use_name'`` is set to ``True``.
    :return: dict with ``'location'`` and ``'type'`` and, depending on the
        feature and flags, ``'use_cds'``, ``'name'``, ``'gff_id'`` and
        ``'use_name'``.
    """
    location = {
        'strand': f.strand,
        'fmin': int(f.location.start),
        'fmax': int(f.location.end),
    }
    type_info = {
        'name': _tnType(f),
        'cv': {
            'name': 'sequence',
        },
    }
    payload = {'location': location, 'type': type_info}

    if disable_cds_recalculation:
        payload['use_cds'] = 'true'

    # Only these feature categories get an explicit name; it comes from the
    # 'Name' qualifier, falling back to the feature id.
    named_types = (coding_transcript_types + noncoding_transcript_types + gene_types
                   + pseudogenes_types + single_level_feature_types)
    if f.type in named_types:
        payload['name'] = f.qualifiers.get('Name', [f.id])[0]

    if 'ID' in f.qualifiers:
        payload['gff_id'] = f.qualifiers['ID'][0]

    if use_name:
        payload['use_name'] = True

    # if OGS:
    # TODO: handle comments
    # TODO: handle dbxrefs
    # TODO: handle attributes
    # TODO: handle aliases
    # TODO: handle description
    # TODO: handle GO, Gene Product, Provenance
    return payload
159+
160+
161+
def _yieldCodingTranscriptData(f, disable_cds_recalculation=False, use_name=False):
    """Describe a transcript and its direct sub-features as an Apollo dict.

    The transcript itself is built by ``_yieldSubFeatureData`` so that it
    carries the same ``location``/``type``/``name``/``gff_id`` fields and the
    same ``use_cds``/``use_name`` flags as every other feature, instead of
    duplicating that logic here without the flags.

    :param f: transcript feature exposing ``sub_features`` (plus whatever
        ``_yieldSubFeatureData`` reads).
    :param disable_cds_recalculation: forwarded to ``_yieldSubFeatureData``
        for the transcript and for each child.
    :param use_name: forwarded to ``_yieldSubFeatureData`` for the transcript
        and for each child.
    :return: the transcript dict; a ``'children'`` list is present only when
        ``f.sub_features`` is non-empty.
    """
    current = _yieldSubFeatureData(
        f, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name)

    if f.sub_features:
        current['children'] = [
            _yieldSubFeatureData(
                sf, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name)
            for sf in f.sub_features
        ]

    return current
190+
191+
192+
def _yieldNonCodingTranscriptData(features, disable_cds_recalculation=False, use_name=False):
    """Describe a non-coding transcript as an Apollo dict.

    Thin wrapper: non-coding transcripts are built exactly like coding ones.

    :param features: the transcript feature (a single feature, despite the
        plural parameter name).
    :param disable_cds_recalculation: forwarded to
        ``_yieldCodingTranscriptData``.
    :param use_name: forwarded to ``_yieldCodingTranscriptData``.
    :return: whatever ``_yieldCodingTranscriptData`` returns for the feature.
    """
    return _yieldCodingTranscriptData(
        features,
        disable_cds_recalculation=disable_cds_recalculation,
        use_name=use_name,
    )
194+
195+
196+
# def _yieldSingleLevelFeatureData(features):
197+
# return _yieldSubFeatureData(features[0])
198+
199+
200+
def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False):
    """Convert one top-level feature into its Apollo representation.

    The builder is chosen from the feature's ``_tnType``: gene and pseudogene
    types go through ``_yieldGeneData``, coding and non-coding transcript
    types through their transcript builders, and everything else (including
    single-level feature types) through ``_yieldSubFeatureData``.

    :param feature: the feature to convert.
    :param use_name: forwarded to the selected builder.
    :param disable_cds_recalculation: forwarded to the selected builder.
    :return: the selected builder's result -- a dict, or for genes with
        transcripts whatever ``_yieldGeneData`` returns.
    """
    feature_type = _tnType(feature)
    if feature_type in gene_types or feature_type in pseudogenes_types:
        builder = _yieldGeneData
    elif feature_type in coding_transcript_types:
        builder = _yieldCodingTranscriptData
    elif feature_type in noncoding_transcript_types:
        builder = _yieldNonCodingTranscriptData
    else:
        # Single-level feature types and unknown types share the generic builder.
        builder = _yieldSubFeatureData

    # Forward the caller's flags; previously they were accepted but dropped,
    # so every builder ran with its defaults.
    return builder(
        feature,
        disable_cds_recalculation=disable_cds_recalculation,
        use_name=use_name,
    )
215+
216+
# # if OGS:
217+
# # TODO: handle comments
218+
# # TODO: handle dbxrefs
219+
# # TODO: handle attributes
220+
# # TODO: handle aliases
221+
# # TODO: handle description
222+
# # TODO: handle GO, Gene Product, Provenance
223+
224+
98225
def _yieldFeatData(features, use_name=False, disable_cds_recalculation=False):
99226
for f in features:
100227
current = {
@@ -110,14 +237,17 @@ def _yieldFeatData(features, use_name=False, disable_cds_recalculation=False):
110237
}
111238
},
112239
}
113-
if disable_cds_recalculation is True:
240+
if disable_cds_recalculation:
114241
current['use_cds'] = 'true'
115242

116243
if f.type in (coding_transcript_types + noncoding_transcript_types + gene_types + pseudogenes_types
117244
+ single_level_feature_types):
118245
current['name'] = f.qualifiers.get('Name', [f.id])[0]
119246

120-
if use_name is True:
247+
if 'ID' in f.qualifiers:
248+
current['gff_id'] = f.qualifiers['ID'][0]
249+
250+
if use_name:
121251
current['use_name'] = True
122252

123253
# if OGS:
@@ -150,6 +280,20 @@ def add_property_to_feature(feature, property_key, property_value):
150280
return feature
151281

152282

283+
def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False):
    """Convert an iterable of features into a list of Apollo payloads.

    :param features: iterable of features, each passed to ``yieldApolloData``.
    :param use_name: forwarded to ``yieldApolloData`` for every feature.
    :param disable_cds_recalculation: forwarded to ``yieldApolloData`` for
        every feature.
    :return: list holding one ``yieldApolloData`` result per input feature,
        in input order.
    """
    return [
        yieldApolloData(f, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation)
        for f in features
    ]
295+
296+
153297
def features_to_feature_schema(features, use_name=False, disable_cds_recalculation=False):
154298
"""
155299

arrow/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '4.2.2'
1+
__version__ = '4.2.3'

arrow/commands/annotations/load_gff3.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,6 @@
3333
help="Disable CDS recalculation and instead use the one provided",
3434
is_flag=True
3535
)
36-
@click.option(
37-
"--verbose",
38-
help="Verbose mode",
39-
is_flag=True
40-
)
4136
@click.option(
4237
"--timing",
4338
help="Output loading performance metrics",
@@ -46,11 +41,11 @@
4641
@pass_context
4742
@custom_exception
4843
@str_output
49-
def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, verbose=False, timing=False):
44+
def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, timing=False):
5045
"""Load a full GFF3 into annotation track
5146
5247
Output:
5348
5449
Loading report
5550
"""
56-
return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, verbose=verbose, timing=timing)
51+
return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, timing=timing)

docs/commands/annotations.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -531,7 +531,6 @@ Load a full GFF3 into annotation track
531531
--use_name Use the given name instead of generating one.
532532
--disable_cds_recalculation Disable CDS recalculation and instead use the one
533533
provided
534-
--verbose Verbose mode
535534
--timing Output loading performance metrics
536535
-h, --help Show this message and exit.
537536

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
requests
2-
biopython
2+
biopython==1.77
33
cachetools<4
44
click>=6.7
55
wrapt
66
pyyaml
77
decorator
8-
bcbio-gff
8+
bcbio-gff==0.6.6
99
pytest-timeit

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
setup(
1818
name="apollo",
19-
version='4.2.2',
19+
version='4.2.3',
2020
description="Apollo API library",
2121
long_description=readme,
2222
author="Helena Rasche;Anthony Bretaudeau;Nathan Dunn",

0 commit comments

Comments
 (0)