Skip to content

Commit f95b1a5

Browse files
committed
issue #83 - switch to GTF for Ensembl
1 parent 923e452 commit f95b1a5

File tree

6 files changed

+53
-27
lines changed

6 files changed

+53
-27
lines changed

cdot/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
__version__ = "0.2.26"
2-
2+
# Data version is kept in generate_transcript_version.json_schema_version
33

44
def get_data_schema_int(version: str) -> int:
55
""" Return an int which increments upon breaking changes - ie anything other than patch """

generate_transcript_data/ensembl_transcripts_chm13v2.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ fi
1313

1414
merge_args=()
1515
for release in 2022_06 2022_07; do
16-
filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gff3.gz
16+
filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gtf.gz
1717
url=https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/${release}/${filename}
1818
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz
1919

2020
if [[ ! -e ${filename} ]]; then
2121
wget ${url}
2222
fi
2323
if [[ ! -e ${cdot_file} ]]; then
24-
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
24+
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
2525
fi
2626
merge_args+=(${cdot_file})
2727
done

generate_transcript_data/ensembl_transcripts_grch37.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@ fi
1919
merge_args=()
2020
for release in 82 85 87; do
2121
# Switched to using GTFs as they contain protein version
22-
filename=Homo_sapiens.GRCh37.${release}.gff3.gz
23-
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
22+
filename=Homo_sapiens.GRCh37.${release}.gtf.gz
23+
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gtf/homo_sapiens/${filename}
2424
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz
2525
if [[ ! -e ${filename} ]]; then
2626
wget ${url}
2727
fi
2828
if [[ ! -e ${cdot_file} ]]; then
29-
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
29+
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
3030
fi
3131
merge_args+=(${cdot_file})
3232
done

generate_transcript_data/ensembl_transcripts_grch38.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,16 @@ fi
2626
#81 is first GFF3 for GRCh38
2727
merge_args=()
2828
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112; do
29-
# Switched to using GTFs as they contain protein version
30-
filename=Homo_sapiens.GRCh38.${release}.gff3.gz
31-
url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
29+
# Switched to using GTFs as they contain protein version while Ensembl GFF3s do not (required for c_to_p)
30+
filename=Homo_sapiens.GRCh38.${release}.gtf.gz
31+
url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
3232
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz
3333

3434
if [[ ! -e ${filename} ]]; then
3535
wget ${url}
3636
fi
3737
if [[ ! -e ${cdot_file} ]]; then
38-
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
38+
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
3939
fi
4040
merge_args+=(${cdot_file})
4141
done

generate_transcript_data/gff_parser.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def __init__(self, filename, genome_build, url,
4040
self.transcript_proteins = {}
4141
# Store features in separate dict as we don't need to write all as JSON
4242
self.transcript_features_by_type = defaultdict(lambda: defaultdict(list))
43-
43+
self._warned_about_htseq_tag_attributes = False
4444

4545
name_ac_map = {}
4646
if not no_contig_conversion:
@@ -345,6 +345,40 @@ def _get_gene_accession(feature) -> Optional[str]:
345345
gene_accession = gene_id
346346
return gene_accession
347347

348+
def _handle_protein_version(self, transcript_accession, feature):
349+
# RefSeq GFF3: CDS protein_id = NP_001659.1
350+
# RefSeq GTF: CDS protein_id = NP_001659.1
351+
# Ensembl GTF: CDS protein_id = ENSP00000477624 protein_version = 1
352+
# Ensembl GFF3: CDS protein_id = ENSP00000477624 <----- can't use this one as no version
353+
if feature.type == "CDS":
354+
if protein_accession := feature.attr.get("protein_id"):
355+
if protein_version := feature.attr.get("protein_version"):
356+
protein_accession = f"{protein_accession}.{protein_version}"
357+
if not "." in protein_accession:
358+
raise ValueError(f"Protein '{protein_accession}' missing version")
359+
self.transcript_proteins[transcript_accession] = protein_accession
360+
361+
def _add_tags_to_transcript_data(self, transcript_data, feature):
362+
# Ideally we only want to get this once per transcript
363+
# So we want to pick something like mRNA or transcript not CDS or exon
364+
if feature.type in ('mRNA', 'transcript'):
365+
attr_tuples = getattr(feature, "attr_tuples", None)
366+
if attr_tuples is None:
367+
attr_tuples = feature.attr.items()
368+
if not self._warned_about_htseq_tag_attributes:
369+
self._warned_about_htseq_tag_attributes = True
370+
htseq_version = importlib.metadata.version('HTSeq')
371+
logging.warning("Your version of HTSeq (%s) can not handle duplicated tags. Some will be lost "
372+
"See https://github.com/htseq/htseq/issues/83", htseq_version)
373+
374+
attr_list_vals = defaultdict(list)
375+
for tag, value in attr_tuples:
376+
attr_list_vals[tag].append(value)
377+
378+
if tag_list := attr_list_vals.get("tag"):
379+
transcript_data["tag"] = ",".join(tag_list)
380+
381+
348382
def get_genes_and_transcripts(self):
349383
self._parse()
350384
self._finish()
@@ -356,6 +390,9 @@ class GTFParser(GFFParser):
356390
""" GTF (GFF2) - used by Ensembl, @see http://gmod.org/wiki/GFF2
357391
358392
GFF2 only has 2 levels of feature hierarchy, so we have to build our 3 levels of gene/transcript/exons ourselves
393+
394+
We *have* to use GTF as Ensembl GFF3s don't include the protein version (just the ID)
395+
359396
"""
360397
GTF_TRANSCRIPTS_DATA = GFFParser.CODING_FEATURES | {"exon"}
361398
FEATURE_ALLOW_LIST = GTF_TRANSCRIPTS_DATA | {"gene", "transcript"}
@@ -392,14 +429,8 @@ def handle_feature(self, feature):
392429
gene_data["biotype"].add(biotype)
393430
transcript["biotype"].add(biotype)
394431

395-
if feature.type == "CDS":
396-
if protein := feature.attr.get("protein_id"):
397-
if protein_version := feature.attr.get("protein_version"):
398-
protein = f"{protein}.{protein_version}"
399-
self.transcript_proteins[transcript_accession] = protein
400-
elif feature.type == "transcript":
401-
if tag := feature.attr.get("tag"):
402-
transcript["tag"] = tag
432+
self._handle_protein_version(transcript_accession, feature)
433+
self._add_tags_to_transcript_data(transcript, feature)
403434

404435

405436
class GFF3Parser(GFFParser):
@@ -461,15 +492,11 @@ def handle_feature(self, feature):
461492
# Some exons etc may be for miRNAs that have no transcript ID, so skip those (won't have parent)
462493
if parent_id:
463494
transcript_accession = self.transcript_accession_by_feature_id.get(parent_id)
495+
self._handle_protein_version(transcript_accession, feature)
464496
else:
465497
logging.warning("Transcript data has no parent: %s" % feature.get_gff_line())
466498
transcript_accession = None
467499

468-
if feature.type == "CDS":
469-
dbxref = self._get_dbxref(feature)
470-
if genbank := (dbxref.get("Genbank") or dbxref.get("GenBank")):
471-
self.transcript_proteins[transcript_accession] = genbank
472-
473500
if transcript_accession:
474501
transcript = self.transcript_data_by_accession.get(transcript_accession)
475502
if not transcript:
@@ -516,8 +543,7 @@ def _handle_transcript(self, gene_data, transcript_accession, feature):
516543
if feature.attr.get("partial"):
517544
transcript_data["partial"] = 1
518545

519-
if tag := feature.attr.get("tag"):
520-
transcript_data["tag"] = tag
546+
self._add_tags_to_transcript_data(transcript_data, feature)
521547

522548
self.transcript_data_by_accession[transcript_accession] = transcript_data
523549
self.transcript_accession_by_feature_id[feature.attr["ID"]] = transcript_accession
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# After 0.2.22 we split version into separate code (pip) and data schema versions
22
# The cdot client will use its own major/minor to determine whether it can read these data files
3-
JSON_SCHEMA_VERSION = "0.2.26"
3+
JSON_SCHEMA_VERSION = "0.2.27"

0 commit comments

Comments
 (0)