Skip to content

Commit cd3627a

Browse files
committed
Added Gotoh Alignment for Junction Genotyping.
1 parent 0374dae commit cd3627a

File tree

1 file changed

+139
-24
lines changed

1 file changed

+139
-24
lines changed

src/junction.h

Lines changed: 139 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Contact: Tobias Rausch (rausch@embl.de)
3636
#include <zlib.h>
3737
#include <stdio.h>
3838
#include "kseq.h"
39+
#include "fasta_reader.h"
3940
KSEQ_INIT(gzFile, gzread)
4041

4142
namespace torali {
@@ -117,14 +118,15 @@ inline void
117118
inline void
118119
_getUniqueKmers(TReference const& ref, TKmerSet const& kmerSet, TUniqueKmer& uniqueKmer, unsigned int kmerLength, unsigned int alphsize) {
119120
typedef typename TKmerSet::key_type TUSize;
120-
121+
121122
char currentk[MAXKMERLENGTH];
122123
for(unsigned int i = 0; i<MAXKMERLENGTH; ++i) currentk[i] = 0;
123124
unsigned int kmerlen=0;
124125
TUSize bucket = 0;
125126
std::string::const_iterator sIt= ref.begin();
126127
std::string::const_iterator sItEnd= ref.end();
127-
for(unsigned int seqIndex=0;sIt!=sItEnd;++sIt,++seqIndex) {
128+
unsigned int seqIndex=0;
129+
for(;sIt!=sItEnd;++sIt,++seqIndex) {
128130
if (*sIt!='N') {
129131
if (kmerlen==kmerLength) {
130132
if (kmerSet.find(bucket)==kmerSet.end()) uniqueKmer.insert(ref.substr(seqIndex-kmerlen, kmerlen));
@@ -142,10 +144,109 @@ inline void
142144
if ((kmerlen == kmerLength) && (kmerSet.find(bucket)==kmerSet.end())) uniqueKmer.insert(ref.substr(ref.size()-kmerlen, kmerlen));
143145
}
144146

147+
template<typename TSequence, typename TBreaks>
148+
inline void
149+
_getReferenceBreakpoint(TSequence const& ref, TSequence const& var, TSequence& outVar, TBreaks& bpcoord)
150+
{
151+
typedef std::vector<char> TAlignSeq;
152+
typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TAlignSeq, void> TFastaRecord;
153+
std::vector<TFastaRecord> alignFwd;
154+
std::vector<TFastaRecord> alignRev;
155+
std::vector<TFastaRecord> align;
156+
157+
// Convert to alignment data structure
158+
TAlignSeq r;
159+
for(typename TSequence::const_iterator itS = ref.begin(); itS!=ref.end(); ++itS) r.push_back(dna5_encode[(int) *itS]);
160+
TAlignSeq v;
161+
for(typename TSequence::const_iterator itS = var.begin(); itS!=var.end(); ++itS) v.push_back(dna5_encode[(int) *itS]);
162+
163+
// Decide forward or reverse alignment
164+
DnaScore<int> scr(1, -2, -5, 0);
165+
AlignConfig<true, false, false, true> alc;
166+
int fwdScore=globalGotohAlignment(alignFwd, r, v, scr, alc);
167+
reverseComplement(v);
168+
int revScore=globalGotohAlignment(alignRev, r, v, scr, alc);
169+
if (fwdScore > revScore) reverseComplement(v);
170+
171+
// Compute final alignment
172+
globalGotohAlignment(align, r, v, scr, alc);
173+
174+
// Find longest internal gap
175+
int refIndex=-1;
176+
int varIndex=-1;
177+
int gapStartRefIndex=0;
178+
int gapEndRefIndex=0;
179+
int gapStartVarIndex=0;
180+
int gapEndVarIndex=0;
181+
int longestGap=0;
182+
TAlignSeq::const_iterator alRef = align[0].seq.begin();
183+
TAlignSeq::const_iterator alVar = align[1].seq.begin();
184+
bool inGap=false;
185+
for (; (alRef != align[0].seq.end()) && (alVar != align[1].seq.end()); ++alRef, ++alVar) {
186+
if (*alRef != dna5gap_encode[(int) '-']) ++refIndex;
187+
if (*alVar != dna5gap_encode[(int) '-']) ++varIndex;
188+
// Internal gap?
189+
if (((*alVar == dna5gap_encode[(int) '-']) || (*alRef == dna5gap_encode[(int) '-'])) && (refIndex>0) && (varIndex>0)) {
190+
if (!inGap) {
191+
gapStartRefIndex=refIndex;
192+
gapEndRefIndex=refIndex;
193+
gapStartVarIndex=varIndex;
194+
gapEndVarIndex=varIndex;
195+
} else {
196+
gapEndRefIndex=refIndex;
197+
gapEndVarIndex=varIndex;
198+
}
199+
inGap=true;
200+
} else {
201+
if ((inGap) && ((gapEndRefIndex - gapStartRefIndex) > longestGap)) {
202+
longestGap=gapEndRefIndex - gapStartRefIndex;
203+
bpcoord[0]=gapStartRefIndex;
204+
bpcoord[1]=gapEndRefIndex;
205+
bpcoord[2]=gapStartVarIndex;
206+
bpcoord[3]=gapEndVarIndex;
207+
}
208+
inGap=false;
209+
}
210+
}
211+
212+
// Output sequence might be reverse complemented
213+
outVar.clear();
214+
for(TAlignSeq::const_iterator itA = v.begin(); itA!=v.end(); ++itA) outVar.push_back(dna5_decode[(int) *itA]);
215+
216+
// Debug code
217+
/*
218+
std::cout << "FwdScore: " << fwdScore << ", RevScore: " << revScore << std::endl;
219+
std::cout << "Alignment: " << fwd << std::endl;
220+
std::cout << "Ref: ";
221+
int ind=0;
222+
for(TAlignSeq::iterator alItTmp = r.begin(); alItTmp != r.end(); ++alItTmp, ++ind) {
223+
std::cout << dna5_decode[(int) *alItTmp];
224+
if ((ind==bpcoord[0]) || (ind==bpcoord[1])) std::cout << " | ";
225+
}
226+
std::cout << std::endl;
227+
std::cout << "Var: ";
228+
ind=0;
229+
for(TAlignSeq::iterator alItTmp = v.begin(); alItTmp != v.end(); ++alItTmp, ++ind) {
230+
std::cout << dna5_decode[(int) *alItTmp];
231+
if ((ind==bpcoord[2]) || (ind==bpcoord[3])) std::cout << " | ";
232+
}
233+
std::cout << std::endl;
234+
std::cout << ">Ref" << std::endl;
235+
for(TAlignSeq::iterator alItTmp = align[0].seq.begin(); alItTmp != align[0].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
236+
std::cout << std::endl;
237+
std::cout << ">Contig" << std::endl;
238+
for(TAlignSeq::iterator alItTmp = align[1].seq.begin(); alItTmp != align[1].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
239+
std::cout << std::endl;
240+
std::cout << std::endl;
241+
std::cout << std::endl;
242+
*/
243+
}
244+
145245

146-
template<typename TFiles, typename TGenome, typename TSampleLibrary, typename TSVs, typename TCountMap>
246+
247+
template<typename TFiles, typename TGenome, typename TSampleLibrary, typename TSVs, typename TCountMap, typename TTag>
147248
inline void
148-
annotateJunctionReads(TFiles const& files, TGenome const& genome, uint16_t const minMapQual, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& countMap)
249+
annotateJunctionReads(TFiles const& files, TGenome const& genome, uint16_t const minMapQual, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& countMap, SVType<TTag> svType)
149250
{
150251
typedef typename TCountMap::key_type TSampleSVPair;
151252
typedef typename TCountMap::mapped_type TCountPair;
@@ -190,55 +291,69 @@ inline void
190291
typename TSVs::const_iterator itSVEnd = svs.end();
191292
for(;itSV!=itSVEnd;++itSV) {
192293
if (itSV->consensus.empty()) continue;
193-
if (itSV->chr2 == refIndex) {
194-
int consLen = itSV->consensus.size();
195-
// For translocations temporarily store the first reference part in the probe strings
196-
refProbes[itSV->id]=boost::to_upper_copy(std::string(seq->seq.s + std::max(itSV->svEnd - consLen, 0), seq->seq.s + itSV->svEnd + consLen));
197-
}
294+
int consLen = itSV->consensus.size();
295+
296+
// Create a pseudo structural variant record
297+
StructuralVariantRecord svRec;
298+
svRec.chr = itSV->chr;
299+
svRec.chr2 = itSV->chr2;
300+
svRec.svStartBeg = std::max(itSV->svStart - consLen, 0);
301+
svRec.svStart = itSV->svStart;
302+
svRec.svStartEnd = std::min(itSV->svStart + consLen, references[itSV->chr].RefLength);
303+
svRec.svEndBeg = std::max(itSV->svEnd - consLen, 0);
304+
svRec.svEnd = itSV->svEnd;
305+
svRec.svEndEnd = std::min(itSV->svEnd + consLen, references[itSV->chr2].RefLength);
306+
if ((itSV->chr != itSV->chr2) && (itSV->chr2 == refIndex)) {
307+
refProbes[itSV->id] = _getSVRef(seq->seq.s, svRec, refIndex, svType);
308+
}
198309
if (itSV->chr == refIndex) {
199-
int consLen = itSV->consensus.size();
200-
std::string sourceRight=refProbes[itSV->id];
310+
// Get the reference string
311+
if (itSV->chr != itSV->chr2) svRec.consensus=refProbes[itSV->id];
201312
refProbes[itSV->id]="";
202-
std::string sourceLeft=boost::to_upper_copy(std::string(seq->seq.s + std::max(itSV->svStart - consLen, 0), seq->seq.s + itSV->svStart + consLen));
203-
std::string cons=boost::to_upper_copy(itSV->consensus);
313+
std::string svRefStr = _getSVRef(seq->seq.s, svRec, refIndex, svType);
314+
std::string consensus;
315+
std::vector<int> bpcoord(4);
316+
std::fill(bpcoord.begin(), bpcoord.end(), 0);
317+
_getReferenceBreakpoint(svRefStr, itSV->consensus, consensus, bpcoord);
318+
// Has a breakpoint been found
319+
if ((bpcoord[2]==0) || (bpcoord[3]==0)) continue;
320+
// Find tagging kmers
204321
for (unsigned int kmerLength=11; kmerLength<MAXKMERLENGTH; kmerLength+=2) {
205322
unsigned int minReqHammingDist = (unsigned int) ((std::log(kmerLength) * std::log(kmerLength)) / std::log(4));
206323
typedef std::set<boost::multiprecision::uint128_t> TKmerSet;
207324
TKmerSet refKmerSet;
208-
std::string refLeft=sourceLeft;
209-
std::string refRight=sourceRight;
210-
_getKmers(refLeft, refKmerSet, kmerLength, 4);
211-
_getKmers(refRight, refKmerSet, kmerLength, 4);
325+
_getKmers(svRefStr, refKmerSet, kmerLength, 4);
212326
typedef boost::unordered_set<std::string> TUniqueKmers;
213327
TUniqueKmers uniqueKmers;
214-
_getUniqueKmers(cons, refKmerSet, uniqueKmers, kmerLength, 4);
328+
std::string consPiece=consensus.substr(std::max(bpcoord[2] - (int) kmerLength, 0), 2 * kmerLength);
329+
_getUniqueKmers(consPiece, refKmerSet, uniqueKmers, kmerLength, 4);
215330
unsigned int maxHammingDistance=0;
216331
std::string maxHammingKmer="";
217332
if (uniqueKmers.size()) {
218333
TUniqueKmers::iterator itK = uniqueKmers.begin();
219334
for(;itK!=uniqueKmers.end();++itK) {
220-
unsigned int hammingDist = std::min(_getMinHammingDistance(refLeft, *itK, minReqHammingDist), _getMinHammingDistance(refRight, *itK, minReqHammingDist));
335+
unsigned int hammingDist = _getMinHammingDistance(svRefStr, *itK, minReqHammingDist);
221336
if ((hammingDist>minReqHammingDist) && (hammingDist>maxHammingDistance)) {
222337
maxHammingDistance=hammingDist;
223338
maxHammingKmer=*itK;
224339
}
225340
}
226341
}
227342
if (maxHammingDistance) {
228-
// Found suitable kmer
343+
// Found suitable kmer, find reference tagging kmer
229344
TKmerSet consKmerSet;
230-
_getKmers(cons, consKmerSet, kmerLength, 4);
345+
_getKmers(consensus, consKmerSet, kmerLength, 4);
231346
TUniqueKmers uniqueRefKmers;
232-
refLeft=refLeft.substr(std::max((int) refLeft.size()/2 - (int) kmerLength, 0), 2 * kmerLength);
233-
refRight=refRight.substr(std::max((int) refRight.size()/2 - (int) kmerLength, 0), 2 * kmerLength);
347+
std::string refLeft=svRefStr.substr(std::max(bpcoord[0] - (int) kmerLength, 0), 2 * kmerLength);
348+
std::string refRight=svRefStr.substr(std::max(bpcoord[1] - (int) kmerLength, 0), 2 * kmerLength);
234349
_getUniqueKmers(refLeft, consKmerSet, uniqueRefKmers, kmerLength, 4);
235350
_getUniqueKmers(refRight, consKmerSet, uniqueRefKmers, kmerLength, 4);
236351
unsigned int maxRefHammingDistance=0;
237352
std::string maxRefHammingKmer="";
238353
if (uniqueRefKmers.size()) {
239354
TUniqueKmers::iterator itK = uniqueRefKmers.begin();
240355
for(;itK!=uniqueRefKmers.end();++itK) {
241-
unsigned int hammingDist = _getMinHammingDistance(cons, *itK, minReqHammingDist);
356+
unsigned int hammingDist = _getMinHammingDistance(consensus, *itK, minReqHammingDist);
242357
if ((hammingDist>minReqHammingDist) && (hammingDist>maxRefHammingDistance)) {
243358
maxRefHammingDistance=hammingDist;
244359
maxRefHammingKmer=*itK;

0 commit comments

Comments
 (0)