@@ -36,6 +36,7 @@ Contact: Tobias Rausch (rausch@embl.de)
3636#include < zlib.h>
3737#include < stdio.h>
3838#include " kseq.h"
39+ #include " fasta_reader.h"
3940KSEQ_INIT (gzFile, gzread)
4041
4142namespace torali {
@@ -117,14 +118,15 @@ inline void
117118inline void
118119 _getUniqueKmers (TReference const & ref, TKmerSet const & kmerSet, TUniqueKmer& uniqueKmer, unsigned int kmerLength, unsigned int alphsize) {
119120 typedef typename TKmerSet::key_type TUSize;
120-
121+
121122 char currentk[MAXKMERLENGTH];
122123 for (unsigned int i = 0 ; i<MAXKMERLENGTH; ++i) currentk[i] = 0 ;
123124 unsigned int kmerlen=0 ;
124125 TUSize bucket = 0 ;
125126 std::string::const_iterator sIt = ref.begin ();
126127 std::string::const_iterator sItEnd = ref.end ();
127- for (unsigned int seqIndex=0 ;sIt !=sItEnd ;++sIt ,++seqIndex) {
128+ unsigned int seqIndex=0 ;
129+ for (;sIt !=sItEnd ;++sIt ,++seqIndex) {
128130 if (*sIt !=' N' ) {
129131 if (kmerlen==kmerLength) {
130132 if (kmerSet.find (bucket)==kmerSet.end ()) uniqueKmer.insert (ref.substr (seqIndex-kmerlen, kmerlen));
@@ -142,10 +144,109 @@ inline void
142144 if ((kmerlen == kmerLength) && (kmerSet.find (bucket)==kmerSet.end ())) uniqueKmer.insert (ref.substr (ref.size ()-kmerlen, kmerlen));
143145 }
144146
147+ template <typename TSequence, typename TBreaks>
148+ inline void
149+ _getReferenceBreakpoint (TSequence const & ref, TSequence const & var, TSequence& outVar, TBreaks& bpcoord)
150+ {
151+ typedef std::vector<char > TAlignSeq;
152+ typedef FastaRecord<std::string, unsigned int , Dna5GapAlphabet, TAlignSeq, void > TFastaRecord;
153+ std::vector<TFastaRecord> alignFwd;
154+ std::vector<TFastaRecord> alignRev;
155+ std::vector<TFastaRecord> align;
156+
157+ // Convert to alignment data structure
158+ TAlignSeq r;
159+ for (typename TSequence::const_iterator itS = ref.begin (); itS!=ref.end (); ++itS) r.push_back (dna5_encode[(int ) *itS]);
160+ TAlignSeq v;
161+ for (typename TSequence::const_iterator itS = var.begin (); itS!=var.end (); ++itS) v.push_back (dna5_encode[(int ) *itS]);
162+
163+ // Decide forward or reverse alignment
164+ DnaScore<int > scr (1 , -2 , -5 , 0 );
165+ AlignConfig<true , false , false , true > alc;
166+ int fwdScore=globalGotohAlignment (alignFwd, r, v, scr, alc);
167+ reverseComplement (v);
168+ int revScore=globalGotohAlignment (alignRev, r, v, scr, alc);
169+ if (fwdScore > revScore) reverseComplement (v);
170+
171+ // Compute final alignment
172+ globalGotohAlignment (align, r, v, scr, alc);
173+
174+ // Find longest internal gap
175+ int refIndex=-1 ;
176+ int varIndex=-1 ;
177+ int gapStartRefIndex=0 ;
178+ int gapEndRefIndex=0 ;
179+ int gapStartVarIndex=0 ;
180+ int gapEndVarIndex=0 ;
181+ int longestGap=0 ;
182+ TAlignSeq::const_iterator alRef = align[0 ].seq .begin ();
183+ TAlignSeq::const_iterator alVar = align[1 ].seq .begin ();
184+ bool inGap=false ;
185+ for (; (alRef != align[0 ].seq .end ()) && (alVar != align[1 ].seq .end ()); ++alRef, ++alVar) {
186+ if (*alRef != dna5gap_encode[(int ) ' -' ]) ++refIndex;
187+ if (*alVar != dna5gap_encode[(int ) ' -' ]) ++varIndex;
188+ // Internal gap?
189+ if (((*alVar == dna5gap_encode[(int ) ' -' ]) || (*alRef == dna5gap_encode[(int ) ' -' ])) && (refIndex>0 ) && (varIndex>0 )) {
190+ if (!inGap) {
191+ gapStartRefIndex=refIndex;
192+ gapEndRefIndex=refIndex;
193+ gapStartVarIndex=varIndex;
194+ gapEndVarIndex=varIndex;
195+ } else {
196+ gapEndRefIndex=refIndex;
197+ gapEndVarIndex=varIndex;
198+ }
199+ inGap=true ;
200+ } else {
201+ if ((inGap) && ((gapEndRefIndex - gapStartRefIndex) > longestGap)) {
202+ longestGap=gapEndRefIndex - gapStartRefIndex;
203+ bpcoord[0 ]=gapStartRefIndex;
204+ bpcoord[1 ]=gapEndRefIndex;
205+ bpcoord[2 ]=gapStartVarIndex;
206+ bpcoord[3 ]=gapEndVarIndex;
207+ }
208+ inGap=false ;
209+ }
210+ }
211+
212+ // Output sequence might be reverse complemented
213+ outVar.clear ();
214+ for (TAlignSeq::const_iterator itA = v.begin (); itA!=v.end (); ++itA) outVar.push_back (dna5_decode[(int ) *itA]);
215+
216+ // Debug code
217+ /*
218+ std::cout << "FwdScore: " << fwdScore << ", RevScore: " << revScore << std::endl;
219+ std::cout << "Alignment: " << fwd << std::endl;
220+ std::cout << "Ref: ";
221+ int ind=0;
222+ for(TAlignSeq::iterator alItTmp = r.begin(); alItTmp != r.end(); ++alItTmp, ++ind) {
223+ std::cout << dna5_decode[(int) *alItTmp];
224+ if ((ind==bpcoord[0]) || (ind==bpcoord[1])) std::cout << " | ";
225+ }
226+ std::cout << std::endl;
227+ std::cout << "Var: ";
228+ ind=0;
229+ for(TAlignSeq::iterator alItTmp = v.begin(); alItTmp != v.end(); ++alItTmp, ++ind) {
230+ std::cout << dna5_decode[(int) *alItTmp];
231+ if ((ind==bpcoord[2]) || (ind==bpcoord[3])) std::cout << " | ";
232+ }
233+ std::cout << std::endl;
234+ std::cout << ">Ref" << std::endl;
235+ for(TAlignSeq::iterator alItTmp = align[0].seq.begin(); alItTmp != align[0].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
236+ std::cout << std::endl;
237+ std::cout << ">Contig" << std::endl;
238+ for(TAlignSeq::iterator alItTmp = align[1].seq.begin(); alItTmp != align[1].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
239+ std::cout << std::endl;
240+ std::cout << std::endl;
241+ std::cout << std::endl;
242+ */
243+ }
244+
145245
146- template <typename TFiles, typename TGenome, typename TSampleLibrary, typename TSVs, typename TCountMap>
246+
247+ template <typename TFiles, typename TGenome, typename TSampleLibrary, typename TSVs, typename TCountMap, typename TTag>
147248inline void
148- annotateJunctionReads (TFiles const & files, TGenome const & genome, uint16_t const minMapQual, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& countMap)
249+ annotateJunctionReads (TFiles const & files, TGenome const & genome, uint16_t const minMapQual, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& countMap, SVType<TTag> svType )
149250{
150251 typedef typename TCountMap::key_type TSampleSVPair;
151252 typedef typename TCountMap::mapped_type TCountPair;
@@ -190,55 +291,69 @@ inline void
190291 typename TSVs::const_iterator itSVEnd = svs.end ();
191292 for (;itSV!=itSVEnd;++itSV) {
192293 if (itSV->consensus .empty ()) continue ;
193- if (itSV->chr2 == refIndex) {
194- int consLen = itSV->consensus .size ();
195- // For translocations temporarily store the first reference part in the probe strings
196- refProbes[itSV->id ]=boost::to_upper_copy (std::string (seq->seq .s + std::max (itSV->svEnd - consLen, 0 ), seq->seq .s + itSV->svEnd + consLen));
197- }
294+ int consLen = itSV->consensus .size ();
295+
296+ // Create a pseudo structural variant record
297+ StructuralVariantRecord svRec;
298+ svRec.chr = itSV->chr ;
299+ svRec.chr2 = itSV->chr2 ;
300+ svRec.svStartBeg = std::max (itSV->svStart - consLen, 0 );
301+ svRec.svStart = itSV->svStart ;
302+ svRec.svStartEnd = std::min (itSV->svStart + consLen, references[itSV->chr ].RefLength );
303+ svRec.svEndBeg = std::max (itSV->svEnd - consLen, 0 );
304+ svRec.svEnd = itSV->svEnd ;
305+ svRec.svEndEnd = std::min (itSV->svEnd + consLen, references[itSV->chr2 ].RefLength );
306+ if ((itSV->chr != itSV->chr2 ) && (itSV->chr2 == refIndex)) {
307+ refProbes[itSV->id ] = _getSVRef (seq->seq .s , svRec, refIndex, svType);
308+ }
198309 if (itSV->chr == refIndex) {
199- int consLen = itSV-> consensus . size ();
200- std::string sourceRight =refProbes[itSV->id ];
310+ // Get the reference string
311+ if (itSV-> chr != itSV-> chr2 ) svRec. consensus =refProbes[itSV->id ];
201312 refProbes[itSV->id ]=" " ;
202- std::string sourceLeft=boost::to_upper_copy (std::string (seq->seq .s + std::max (itSV->svStart - consLen, 0 ), seq->seq .s + itSV->svStart + consLen));
203- std::string cons=boost::to_upper_copy (itSV->consensus );
313+ std::string svRefStr = _getSVRef (seq->seq .s , svRec, refIndex, svType);
314+ std::string consensus;
315+ std::vector<int > bpcoord (4 );
316+ std::fill (bpcoord.begin (), bpcoord.end (), 0 );
317+ _getReferenceBreakpoint (svRefStr, itSV->consensus , consensus, bpcoord);
318+ // Has a breakpoint been found
319+ if ((bpcoord[2 ]==0 ) || (bpcoord[3 ]==0 )) continue ;
320+ // Find tagging kmers
204321 for (unsigned int kmerLength=11 ; kmerLength<MAXKMERLENGTH; kmerLength+=2 ) {
205322 unsigned int minReqHammingDist = (unsigned int ) ((std::log (kmerLength) * std::log (kmerLength)) / std::log (4 ));
206323 typedef std::set<boost::multiprecision::uint128_t > TKmerSet;
207324 TKmerSet refKmerSet;
208- std::string refLeft=sourceLeft;
209- std::string refRight=sourceRight;
210- _getKmers (refLeft, refKmerSet, kmerLength, 4 );
211- _getKmers (refRight, refKmerSet, kmerLength, 4 );
325+ _getKmers (svRefStr, refKmerSet, kmerLength, 4 );
212326 typedef boost::unordered_set<std::string> TUniqueKmers;
213327 TUniqueKmers uniqueKmers;
214- _getUniqueKmers (cons, refKmerSet, uniqueKmers, kmerLength, 4 );
328+ std::string consPiece=consensus.substr (std::max (bpcoord[2 ] - (int ) kmerLength, 0 ), 2 * kmerLength);
329+ _getUniqueKmers (consPiece, refKmerSet, uniqueKmers, kmerLength, 4 );
215330 unsigned int maxHammingDistance=0 ;
216331 std::string maxHammingKmer=" " ;
217332 if (uniqueKmers.size ()) {
218333 TUniqueKmers::iterator itK = uniqueKmers.begin ();
219334 for (;itK!=uniqueKmers.end ();++itK) {
220- unsigned int hammingDist = std::min ( _getMinHammingDistance (refLeft , *itK, minReqHammingDist), _getMinHammingDistance (refRight, *itK, minReqHammingDist) );
335+ unsigned int hammingDist = _getMinHammingDistance (svRefStr , *itK, minReqHammingDist);
221336 if ((hammingDist>minReqHammingDist) && (hammingDist>maxHammingDistance)) {
222337 maxHammingDistance=hammingDist;
223338 maxHammingKmer=*itK;
224339 }
225340 }
226341 }
227342 if (maxHammingDistance) {
228- // Found suitable kmer
343+ // Found suitable kmer, find reference tagging kmer
229344 TKmerSet consKmerSet;
230- _getKmers (cons , consKmerSet, kmerLength, 4 );
345+ _getKmers (consensus , consKmerSet, kmerLength, 4 );
231346 TUniqueKmers uniqueRefKmers;
232- refLeft=refLeft .substr (std::max (( int ) refLeft. size ()/ 2 - (int ) kmerLength, 0 ), 2 * kmerLength);
233- refRight=refRight .substr (std::max (( int ) refRight. size ()/ 2 - (int ) kmerLength, 0 ), 2 * kmerLength);
347+ std::string refLeft=svRefStr .substr (std::max (bpcoord[ 0 ] - (int ) kmerLength, 0 ), 2 * kmerLength);
348+ std::string refRight=svRefStr .substr (std::max (bpcoord[ 1 ] - (int ) kmerLength, 0 ), 2 * kmerLength);
234349 _getUniqueKmers (refLeft, consKmerSet, uniqueRefKmers, kmerLength, 4 );
235350 _getUniqueKmers (refRight, consKmerSet, uniqueRefKmers, kmerLength, 4 );
236351 unsigned int maxRefHammingDistance=0 ;
237352 std::string maxRefHammingKmer=" " ;
238353 if (uniqueRefKmers.size ()) {
239354 TUniqueKmers::iterator itK = uniqueRefKmers.begin ();
240355 for (;itK!=uniqueRefKmers.end ();++itK) {
241- unsigned int hammingDist = _getMinHammingDistance (cons , *itK, minReqHammingDist);
356+ unsigned int hammingDist = _getMinHammingDistance (consensus , *itK, minReqHammingDist);
242357 if ((hammingDist>minReqHammingDist) && (hammingDist>maxRefHammingDistance)) {
243358 maxRefHammingDistance=hammingDist;
244359 maxRefHammingKmer=*itK;
0 commit comments