Skip to content

Commit a929f94

Browse files
committed
Consider the possibility of strand=".". Resolves #2158
1 parent 1a20a09 commit a929f94

File tree

3 files changed

+28
-22
lines changed

3 files changed

+28
-22
lines changed

csq.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
965965
if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0;
966966

967967
#if XDBG
968-
fprintf(stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev");
968+
fprintf(stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":(tr->strand==STRAND_REV?"rev":"unk"));
969969
fprintf(stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt);
970970
#endif
971971

@@ -998,7 +998,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint
998998
while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
999999
if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced
10001000
}
1001-
else
1001+
else if ( tr->strand==STRAND_FWD )
10021002
{
10031003
// STRAND_FWD
10041004
int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion
@@ -1271,13 +1271,13 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
12711271
{
12721272
if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
12731273
if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
1274-
else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
1274+
else if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
12751275
}
12761276
if ( splice->ref_end > ex_end - 3 )
12771277
{
12781278
if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
12791279
if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
1280-
else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
1280+
else if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
12811281
}
12821282
if ( splice->set_refalt )
12831283
{
@@ -1338,17 +1338,17 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds,
13381338
if ( !(tr->trim & TRIM_5PRIME) )
13391339
{
13401340
if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
1341-
else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
1341+
else if ( tr->strand==STRAND_REV ) { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
13421342
}
13431343
if ( !(tr->trim & TRIM_3PRIME) )
13441344
{
13451345
if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
1346-
else { if ( child->icds==0 ) splice.check_stop = 1; }
1346+
else if ( tr->strand==STRAND_REV ) { if ( child->icds==0 ) splice.check_stop = 1; }
13471347
}
13481348
if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
13491349
{
13501350
if ( tr->strand==STRAND_FWD ) { if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
1351-
else { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
1351+
else if ( tr->strand==STRAND_REV ) { if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
13521352
}
13531353
if ( child->icds!=0 ) splice.check_region_beg = 1;
13541354
if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
@@ -1586,7 +1586,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
15861586
}
15871587
}
15881588
}
1589-
else // STRAND_REV
1589+
else if ( strand==STRAND_REV )
15901590
{
15911591
// right padding - number of bases to take from ref
15921592
npad = (seq.m - (sbeg + seq.l)) % 3;
@@ -1673,6 +1673,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,
16731673
}
16741674
}
16751675
}
1676+
else error("Should not happen: %d\n", strand);
16761677
kputc_(0,tseq); tseq->l--;
16771678
#if DBG
16781679
fprintf(stderr," tseq: %s\n", tseq->s);
@@ -1858,7 +1859,7 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
18581859
kputs(gf_type2gff_string(csq->biotype), str);
18591860

18601861
if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
1861-
kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
1862+
kputs(csq->strand==STRAND_FWD ? "|+" : (csq->strand==STRAND_REV ? "|-" : "|."), str);
18621863

18631864
if ( csq->vstr.l )
18641865
kputs(csq->vstr.s, str);
@@ -1882,6 +1883,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
18821883
{
18831884
int i;
18841885
gf_tscript_t *tr = hap->tr;
1886+
assert( tr->strand==STRAND_FWD || tr->strand==STRAND_REV );
18851887
int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
18861888
int icsq = node->ncsq_list++;
18871889
hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
@@ -2177,7 +2179,7 @@ void hap_finalize(args_t *args, hap_t *hap)
21772179
indel = 0;
21782180
}
21792181
}
2180-
else
2182+
else if ( tr->strand==STRAND_REV )
21812183
{
21822184
i = istack + 1, ibeg = -1;
21832185
while ( --i > 0 )

gff.c

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,9 @@ typedef struct
5353
uint32_t beg;
5454
uint32_t end;
5555
uint32_t trid;
56-
uint32_t strand:1; // STRAND_REV,STRAND_FWD
56+
uint32_t strand:2; // STRAND_{REV,FWD,UNK}
5757
uint32_t phase:2; // 0, 1, 2, or 3 for unknown
58-
uint32_t iseq:29;
58+
uint32_t iseq:28;
5959
}
6060
ftr_t;
6161

@@ -474,7 +474,7 @@ static void gff_parse_exon(gff_t *gff, const char *line, ftr_t *ftr)
474474
// associate with transcript id
475475
gff_id_register(&gff->tscript_ids, aux->parent, aux->parent_end, &ftr->trid);
476476

477-
if ( ftr->strand==-1 && gff->verbosity > 0 )
477+
if ( ftr->strand==STRAND_UNK && gff->verbosity > 0 )
478478
{
479479
if ( !gff->warned.unknown_strand || gff->verbosity > 1 )
480480
fprintf(stderr,"Warning: Ignoring GFF feature with unknown strand .. %s\n",line);
@@ -568,6 +568,7 @@ static int gff_parse_line(gff_t *gff, char *line, ftr_t *ftr)
568568
ftr->strand = -1;
569569
if ( *ss == '+' ) ftr->strand = STRAND_FWD;
570570
else if ( *ss == '-' ) ftr->strand = STRAND_REV;
571+
else ftr->strand = STRAND_UNK;
571572
ss += 2;
572573

573574
// 8th column: phase (codon offset)
@@ -757,7 +758,7 @@ static void tscript_init_cds(gff_t *gff)
757758
}
758759
if ( !tscript_ok ) continue; // skip this transcript
759760
}
760-
else
761+
else if ( tr->strand==STRAND_REV )
761762
{
762763
if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN )
763764
{
@@ -820,6 +821,8 @@ static void tscript_init_cds(gff_t *gff)
820821
}
821822
if ( !tscript_ok ) continue; // skip this transcript
822823
}
824+
else
825+
continue; // unknown strand
823826

824827
// set len. At the same check that CDS within a transcript do not overlap
825828
len = 0;
@@ -868,7 +871,7 @@ static void tscript_init_cds(gff_t *gff)
868871
i--;
869872
}
870873
}
871-
else
874+
else if ( tr->strand==STRAND_REV )
872875
{
873876
i = 0;
874877
while ( i<tr->ncds && len%3 )
@@ -910,7 +913,7 @@ static int gff_dump(gff_t *gff, const char *fname)
910913
gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
911914
char *gene_id = gff->init.gene_ids.str[gene->id];
912915
str.l = 0;
913-
ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':'-',gene_id,gene->name,gene->used);
916+
ksprintf(&str,"%s\t.\tgene\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Name=%s;used=%d\n",gff->init.seq[gene->iseq],gene->beg+1,gene->end+1,gene->strand==STRAND_FWD?'+':(gene->strand==STRAND_REV?'-':'.'),gene_id,gene->name,gene->used);
914917
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
915918
}
916919

@@ -921,7 +924,7 @@ static int gff_dump(gff_t *gff, const char *fname)
921924
char *gene_id = gff->init.gene_ids.str[tr->gene->id];
922925
const char *type = tr->type==GF_PROTEIN_CODING ? "mRNA" : gf_type2gff_string(tr->type);
923926
str.l = 0;
924-
ksprintf(&str,"%s\t.\t%s\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
927+
ksprintf(&str,"%s\t.\t%s\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tID=%s;Parent=%s;biotype=%s;used=%d\n",itr->seq,type,itr->beg+1,itr->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id],gene_id,gf_type2gff_string(tr->type),tr->used);
925928
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
926929
}
927930
regitr_destroy(itr);
@@ -932,7 +935,7 @@ static int gff_dump(gff_t *gff, const char *fname)
932935
gf_cds_t *cds = regitr_payload(itr,gf_cds_t*);
933936
gf_tscript_t *tr = cds->tr;
934937
str.l = 0;
935-
ksprintf(&str,"%s\t.\tCDS\t%"PRIu32"\t%"PRIu32"\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':'-',cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
938+
ksprintf(&str,"%s\t.\tCDS\t%"PRIu32"\t%"PRIu32"\t.\t%c\t%c\tParent=%s\n",itr->seq,cds->beg+1,cds->beg+cds->len,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),cds->phase==3?'.':cds->phase+(int)'0',gff->tscript_ids.str[tr->id]);
936939
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
937940
}
938941
regitr_destroy(itr);
@@ -943,7 +946,7 @@ static int gff_dump(gff_t *gff, const char *fname)
943946
gf_utr_t *utr = regitr_payload(itr,gf_utr_t*);
944947
gf_tscript_t *tr = utr->tr;
945948
str.l = 0;
946-
ksprintf(&str,"%s\t.\t%s_prime_UTR\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
949+
ksprintf(&str,"%s\t.\t%s_prime_UTR\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,utr->which==prime3?"three":"five",utr->beg+1,utr->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id]);
947950
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
948951
}
949952
regitr_destroy(itr);
@@ -954,7 +957,7 @@ static int gff_dump(gff_t *gff, const char *fname)
954957
gf_exon_t *exon = regitr_payload(itr,gf_exon_t*);
955958
gf_tscript_t *tr = exon->tr;
956959
str.l = 0;
957-
ksprintf(&str,"%s\t.\texon\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':'-',gff->tscript_ids.str[tr->id]);
960+
ksprintf(&str,"%s\t.\texon\t%"PRIu32"\t%"PRIu32"\t.\t%c\t.\tParent=%s\n",itr->seq,exon->beg+1,exon->end+1,tr->strand==STRAND_FWD?'+':(tr->strand==STRAND_REV?'-':'.'),gff->tscript_ids.str[tr->id]);
958961
if ( bgzf_write(out, str.s, str.l) != str.l ) error("Error writing %s: %s\n", fname, strerror(errno));
959962
}
960963
regitr_destroy(itr);

gff.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@
150150

151151
#define STRAND_REV 0
152152
#define STRAND_FWD 1
153+
#define STRAND_UNK 2
153154

154155
#define TRIM_NONE 0
155156
#define TRIM_5PRIME 1
@@ -273,9 +274,9 @@ struct gf_tscript_t_
273274
{
274275
uint32_t id; // transcript id
275276
uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
276-
uint32_t strand:1, // STRAND_REV or STRAND_FWD
277+
uint32_t strand:2, // STRAND_REV,FWD,UNK
277278
used:1, // does it have any exons, UTRs, CDS?
278-
ncds:30, // number of exons
279+
ncds:29, // number of exons
279280
mcds;
280281
gf_cds_t **cds; // ordered list of exons
281282
uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types

0 commit comments

Comments
 (0)