@@ -40,7 +40,7 @@ def __init__(self, filename, genome_build, url,
4040 self .transcript_proteins = {}
4141 # Store features in separate dict as we don't need to write all as JSON
4242 self .transcript_features_by_type = defaultdict (lambda : defaultdict (list ))
43-
43+ self . _warned_about_htseq_tag_attributes = False
4444
4545 name_ac_map = {}
4646 if not no_contig_conversion :
@@ -345,6 +345,40 @@ def _get_gene_accession(feature) -> Optional[str]:
345345 gene_accession = gene_id
346346 return gene_accession
347347
348+ def _handle_protein_version (self , transcript_accession , feature ):
349+ # RefSeq GFF3: CDS protein_id = NP_001659.1
350+ # RefSeq GTF: CDS protein_id = NP_001659.1
351+ # Ensembl GTF: CDS protein_id = ENSP00000477624 protein_version = 1
352+ # Ensembl GTF: CDS protein_id = ENSP00000477624 <----- can't use this one as no version
353+ if feature .type == "CDS" :
354+ if protein_accession := feature .attr .get ("protein_id" ):
355+ if protein_version := feature .attr .get ("protein_version" ):
356+ protein_accession = f"{ protein_accession } .{ protein_version } "
357+ if not "." in protein_accession :
358+ raise ValueError (f"Protein '{ protein_accession } ' missing version" )
359+ self .transcript_proteins [transcript_accession ] = protein_accession
360+
361+ def _add_tags_to_transcript_data (self , transcript_data , feature ):
362+ # Ideally we only want to get this once per transcript
363+ # So we want to pick something like mRNA or transcript not CDS or exon
364+ if feature .type in ('mRNA' , 'transcript' ):
365+ attr_tuples = getattr (feature , "attr_tuples" , None )
366+ if attr_tuples is None :
367+ attr_tuples = feature .attr .items ()
368+ if not self ._warned_about_htseq_tag_attributes :
369+ self ._warned_about_htseq_tag_attributes = True
370+ htseq_version = importlib .metadata .version ('HTSeq' )
371+ logging .warning ("Your version of HTSeq (%s) can not handle duplicated tags. Some will be lost "
372+ "See https://github.com/htseq/htseq/issues/83" , htseq_version )
373+
374+ attr_list_vals = defaultdict (list )
375+ for tag , value in attr_tuples :
376+ attr_list_vals [tag ].append (value )
377+
378+ if tag_list := attr_list_vals .get ("tag" ):
379+ transcript_data ["tag" ] = "," .join (tag_list )
380+
381+
348382 def get_genes_and_transcripts (self ):
349383 self ._parse ()
350384 self ._finish ()
@@ -356,6 +390,9 @@ class GTFParser(GFFParser):
356390 """ GTF (GFF2) - used by Ensembl, @see http://gmod.org/wiki/GFF2
357391
358392 GFF2 only has 2 levels of feature hierarchy, so we have to build our 3 levels of gene/transcript/exons ourselves
393+
394+ We *have* to use GTF as Ensembl GFF3s don't include the protein version (just the ID)
395+
359396 """
360397 GTF_TRANSCRIPTS_DATA = GFFParser .CODING_FEATURES | {"exon" }
361398 FEATURE_ALLOW_LIST = GTF_TRANSCRIPTS_DATA | {"gene" , "transcript" }
@@ -392,14 +429,8 @@ def handle_feature(self, feature):
392429 gene_data ["biotype" ].add (biotype )
393430 transcript ["biotype" ].add (biotype )
394431
395- if feature .type == "CDS" :
396- if protein := feature .attr .get ("protein_id" ):
397- if protein_version := feature .attr .get ("protein_version" ):
398- protein = f"{ protein } .{ protein_version } "
399- self .transcript_proteins [transcript_accession ] = protein
400- elif feature .type == "transcript" :
401- if tag := feature .attr .get ("tag" ):
402- transcript ["tag" ] = tag
432+ self ._handle_protein_version (transcript_accession , feature )
433+ self ._add_tags_to_transcript_data (transcript , feature )
403434
404435
405436class GFF3Parser (GFFParser ):
@@ -461,15 +492,11 @@ def handle_feature(self, feature):
461492 # Some exons etc may be for miRNAs that have no transcript ID, so skip those (won't have parent)
462493 if parent_id :
463494 transcript_accession = self .transcript_accession_by_feature_id .get (parent_id )
495+ self ._handle_protein_version (transcript_accession , feature )
464496 else :
465497 logging .warning ("Transcript data has no parent: %s" % feature .get_gff_line ())
466498 transcript_accession = None
467499
468- if feature .type == "CDS" :
469- dbxref = self ._get_dbxref (feature )
470- if genbank := (dbxref .get ("Genbank" ) or dbxref .get ("GenBank" )):
471- self .transcript_proteins [transcript_accession ] = genbank
472-
473500 if transcript_accession :
474501 transcript = self .transcript_data_by_accession .get (transcript_accession )
475502 if not transcript :
@@ -516,8 +543,7 @@ def _handle_transcript(self, gene_data, transcript_accession, feature):
516543 if feature .attr .get ("partial" ):
517544 transcript_data ["partial" ] = 1
518545
519- if tag := feature .attr .get ("tag" ):
520- transcript_data ["tag" ] = tag
546+ self ._add_tags_to_transcript_data (transcript_data , feature )
521547
522548 self .transcript_data_by_accession [transcript_accession ] = transcript_data
523549 self .transcript_accession_by_feature_id [feature .attr ["ID" ]] = transcript_accession
0 commit comments