@@ -63,6 +63,7 @@ Contact: Tobias Rausch (rausch@embl.de)
6363#include " spanning.h"
6464#include " coverage.h"
6565#include " junction.h"
66+ #include " fasta_reader.h"
6667
6768#include < sys/types.h>
6869#include < sys/stat.h>
@@ -88,6 +89,7 @@ struct Config {
8889 boost::filesystem::path vcffile;
8990 boost::filesystem::path genome;
9091 boost::filesystem::path exclude;
92+ boost::filesystem::path peDump;
9193 std::vector<boost::filesystem::path> files;
9294};
9395
@@ -300,12 +302,12 @@ _translateSVCoordinates(TUSize refSize, TUSize, TUSize refSizeLeft, TSize diagBe
300302 if (finalGapEnd < refSizeLeft) return true ;
301303 if (sv.ct %2 ==0 ) {
302304 finalGapStart += sv.svStartBeg ;
303- if (sv.ct > =2 ) finalGapEnd = sv.svEndBeg + (finalGapEnd - refSizeLeft) - 1 ;
305+ if (sv.ct = =2 ) finalGapEnd = sv.svEndBeg + (finalGapEnd - refSizeLeft) - 1 ;
304306 else finalGapEnd = sv.svEndBeg + (refSize - finalGapEnd) + 1 ;
305307 } else {
306- if (sv. ct >= 2 ) finalGapStart + = sv.svEndBeg ;
307- else finalGapStart = sv.svEndBeg + (refSizeLeft - finalGapStart) ;
308- finalGapEnd = sv.svStartBeg + (finalGapEnd - refSizeLeft) - 1 ;
308+ finalGapStart = sv.svEndBeg + (refSizeLeft - finalGapStart) ;
309+ if (sv. ct == 1 ) finalGapEnd = sv.svStartBeg + (finalGapEnd - refSizeLeft) - 1 ;
310+ else finalGapEnd = sv.svStartBeg + (refSize - finalGapEnd) + 1 ;
309311 }
310312 if (sv.ct %2 !=0 ) {
311313 TUSize tmpStart = finalGapStart;
@@ -378,8 +380,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
378380 typename TReadSet::const_iterator splitIter = splitReadSet.begin ();
379381 for (;splitIter!=splitReadSet.end ();++splitIter) {
380382 TSequence read;
381- std::string::const_iterator readIt=splitIter->begin ();
382- for (;readIt!=splitIter->end ();++readIt) read.push_back (dna5_encode[int (*readIt)]);
383+ for (std::string::const_iterator readIt=splitIter->begin (); readIt!=splitIter->end (); ++readIt) read.push_back (dna5_encode[int (*readIt)]);
383384 unsigned int readLen = read.size ();
384385
385386 // Count kmer's
@@ -421,7 +422,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
421422 sC .read = read;
422423 sC .forwardRead = alignDir;
423424 readVec.push_back (sC );
424- // std::cout << seqIt->chrName << ',' << sC.offset << ',' << sC.lastKmer << ',' << sC.diag << ',' << sC.forwardRead << std::endl;
425+ // std::cout << sC.offset << ',' << sC.lastKmer << ',' << sC.diag << ',' << sC.forwardRead << ',' << *splitIter << ',' << readLen << std::endl;
425426 }
426427 }
427428 }
@@ -445,6 +446,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
445446 unsigned int bestBoundE = 0 ;
446447 unsigned int bound = 1 ;
447448 unsigned int boundS = 0 ;
449+ int readLenInit = readVecIt->read .size ();
448450 ++readVecIt;
449451 for (;readVecIt!=readVecItEnd;++readVecIt, ++bound) {
450452 // Read pairs should support the same offset and the last kmer should be sufficiently close (allow one mismatch in the last kmer)
@@ -463,7 +465,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
463465 support = 1 ;
464466 }
465467 }
466- if (_betterSplit (refSize, refSizeRight, refSizeLeft, oldDiag, oldOffset, skippedLength, initialLength, c.epsilon , support, bestSupport, sv.ct , readVecIt-> read . size () , svType)) {
468+ if (_betterSplit (refSize, refSizeRight, refSizeLeft, oldDiag, oldOffset, skippedLength, initialLength, c.epsilon , support, bestSupport, sv.ct , readLenInit , svType)) {
467469 bestSupport = support;
468470 bestBoundS = boundS;
469471 bestBoundE = bound;
@@ -644,15 +646,12 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
644646 }
645647
646648 // Debug output
647- // typename TConsSeq::const_iterator csSeqIt = ref1.begin();
648- // typename TConsSeq::const_iterator csSeqItEnd = ref1.end();
649- // for(;csSeqIt != csSeqItEnd; ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
650- // std::cout << std::endl;
651- // csSeqIt = ref2.begin();
652- // csSeqItEnd = ref2.end();
653- // for(;csSeqIt != csSeqItEnd; ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
654- // std::cout << std::endl;
655-
649+ // if (sv.id==2) {
650+ // for(typename TConsSeq::const_iterator csSeqIt = ref1.begin();csSeqIt != ref1.end(); ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
651+ // std::cout << std::endl;
652+ // for(typename TConsSeq::const_iterator csSeqIt = ref2.begin();csSeqIt != ref2.end(); ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
653+ // std::cout << std::endl;
654+ // }
656655
657656 // Calculate forward and reverse dynamic programming matrix
658657 int matchScore = 2 ;
@@ -678,23 +677,16 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
678677 }
679678
680679 // Debug code
681- // typedef std::vector<char> TSequence;
682- // typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TSequence> TFastaRecord;
683- // std::vector<TFastaRecord> align;
684- // globalNwAlignment(align, ref1, consSeq, sc, alConf);
685- // TSequence::iterator alItTmp = align[0].seq.begin();
686- // TSequence::iterator alItTmpEnd = align[0].seq.end();
687- // std::cout << std::endl;
688- // for(unsigned int i = 0; i<100; ++i) std::cout << (i % 10);
689- // std::cout << std::endl;
690- // for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
691- // std::cout << std::endl;
692- // alItTmp = align[1].seq.begin();
693- // alItTmpEnd = align[1].seq.end();
694- // for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
695- // std::cout << std::endl;
696-
697-
680+ // if (sv.id==2) {
681+ // typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TSequence, void> TFastaRecord;
682+ // std::vector<TFastaRecord> align;
683+ // globalNwAlignment(align, ref1, consSeq, sc, alConf);
684+ // std::cout << std::endl;
685+ // for(TSequence::iterator alItTmp = align[0].seq.begin();alItTmp != align[0].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
686+ // std::cout << std::endl;
687+ // for(TSequence::iterator alItTmp = align[1].seq.begin();alItTmp != align[1].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
688+ // std::cout << std::endl;
689+ // }
698690
699691 reverseComplement (ref2);
700692 reverseComplement (consSeq);
@@ -715,18 +707,16 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
715707 }
716708
717709 // Debug code
718- // align.clear();
719- // globalNwAlignment(align, ref2, consSeq, sc, alConf);
720- // alItTmp = align[0].seq.begin();
721- // alItTmpEnd = align[0].seq.end();
722- // std::cout << std::endl;
723- // for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
724- // std::cout << std::endl;
725- // alItTmp = align[1].seq.begin();
726- // alItTmpEnd = align[1].seq.end();
727- // for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
728- // std::cout << std::endl;
729-
710+ // if (sv.id==2) {
711+ // typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TSequence, void> TFastaRecord;
712+ // std::vector<TFastaRecord> align;
713+ // globalNwAlignment(align, ref2, consSeq, sc, alConf);
714+ // std::cout << std::endl;
715+ // for(TSequence::iterator alItTmp = align[0].seq.begin();alItTmp != align[0].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
716+ // std::cout << std::endl;
717+ // for(TSequence::iterator alItTmp = align[1].seq.begin();alItTmp != align[1].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
718+ // std::cout << std::endl;
719+ // }
730720
731721 // Get back to the true orientation
732722 reverseComplement (consSeq);
@@ -1374,7 +1364,7 @@ findPutativeSplitReads(TConfig const& c, std::vector<TStructuralVariantRecord>&
13741364
13751365 // Minimum clip size length fulfilled?
13761366 int minClipSize = (int ) (log10 (al.QueryBases .size ()) * 10 );
1377- #pragma omp critical
1367+ #pragma omp critical
13781368 {
13791369 if (clipSizes[0 ]>=minClipSize) splitReadSet.insert (al.QueryBases );
13801370 }
@@ -1613,9 +1603,9 @@ _updateClique(TBamRecordIterator const& el, TSize& svStart, TSize& svEnd, TSize&
16131603
16141604template <typename TConfig, typename TSampleLibrary, typename TSVs, typename TCountMap, typename TTag>
16151605inline void
1616- _annotateJunctionReads (TConfig const & c, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& junctionCountMap, SVType<TTag>)
1606+ _annotateJunctionReads (TConfig const & c, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& junctionCountMap, SVType<TTag> svType )
16171607{
1618- annotateJunctionReads (c.files , c.genome , c.minGenoQual , sampleLib, svs, junctionCountMap);
1608+ annotateJunctionReads (c.files , c.genome , c.minGenoQual , sampleLib, svs, junctionCountMap, svType );
16191609}
16201610
16211611
@@ -1682,15 +1672,15 @@ inline int run(Config const& c, TSVType svType) {
16821672 }
16831673
16841674 // Get library parameters
1685- #pragma omp parallel for default(shared)
1675+ #pragma omp parallel for default(shared)
16861676 for (unsigned int file_c = 0 ; file_c < c.files .size (); ++file_c) {
16871677 // Get a sample name
16881678 std::string sampleName (c.files [file_c].stem ().string ());
16891679
16901680 // Get library parameters and overall maximum insert size
16911681 TLibraryMap libInfo;
16921682 getLibraryParams (c.files [file_c], libInfo, c.percentAbnormal , c.madCutoff );
1693- #pragma omp critical
1683+ #pragma omp critical
16941684 {
16951685 TLibraryMap::const_iterator libIter=libInfo.begin ();
16961686 for (;libIter!=libInfo.end ();++libIter)
@@ -1699,6 +1689,15 @@ inline int run(Config const& c, TSVType svType) {
16991689 }
17001690 }
17011691
1692+ // Dump PE
1693+ bool dumpPe=false ;
1694+ std::ofstream dumpPeFile;
1695+ if (c.peDump .string ().size ()) {
1696+ dumpPe=true ;
1697+ dumpPeFile.open (c.peDump .string ().c_str ());
1698+ dumpPeFile << " #id\t chr\t pos\t matechr\t matepos\t mapq" << std::endl;
1699+ }
1700+
17021701 // Parse exclude interval list
17031702 BamTools::RefVector::const_iterator itRef = references.begin ();
17041703 std::vector<bool > validChr;
@@ -1922,7 +1921,7 @@ inline int run(Config const& c, TSVType svType) {
19221921 // for(TCompSize::const_iterator compIt = compSize.begin(); compIt!=compSize.end(); ++compIt) std::cerr << *compIt << std::endl;
19231922
19241923 // Iterate each component
1925- #pragma omp parallel for default(shared)
1924+ #pragma omp parallel for default(shared)
19261925 for (int compIt = 0 ; compIt < numComp; ++compIt) {
19271926 if (compSize[compIt]<2 ) continue ;
19281927 typedef boost::graph_traits<Graph>::vertex_descriptor TVertexDescriptor;
@@ -1991,8 +1990,7 @@ inline int run(Config const& c, TSVType svType) {
19911990 svRec.peSupport = clique.size ();
19921991 svRec.wiggle = abs (wiggle);
19931992 std::vector<uint16_t > mapQV;
1994- typename TCliqueMembers::const_iterator itC = clique.begin ();
1995- for (;itC!=clique.end ();++itC) mapQV.push_back (g[*itC]->MapQuality );
1993+ for (typename TCliqueMembers::const_iterator itC = clique.begin (); itC!=clique.end (); ++itC) mapQV.push_back (g[*itC]->MapQuality );
19961994 std::sort (mapQV.begin (), mapQV.end ());
19971995 svRec.peMapQuality = mapQV[mapQV.size ()/2 ];
19981996 if ((refIndex==clusterMateRefID) && (svRec.svStartEnd > svRec.svEndBeg )) {
@@ -2004,15 +2002,28 @@ inline int run(Config const& c, TSVType svType) {
20042002 svRec.srAlignQuality =0 ;
20052003 svRec.precise =false ;
20062004 svRec.ct =connectionType;
2007- #pragma omp critical
2005+ #pragma omp critical
20082006 {
20092007 svRec.id = clique_count++;
20102008 svs.push_back (svRec);
20112009 }
2010+
2011+ // Dump PEs
2012+ #pragma omp critical
2013+ if (dumpPe) {
2014+ for (typename TCliqueMembers::const_iterator itC=clique.begin (); itC!=clique.end (); ++itC) {
2015+ std::stringstream id;
2016+ id << _addID (svType) << std::setw (8 ) << std::setfill (' 0' ) << svRec.id ;
2017+ dumpPeFile << id.str () << " \t " << references[g[*itC]->RefID ].RefName << " \t " << g[*itC]->Position << " \t " << references[g[*itC]->MateRefID ].RefName << " \t " << g[*itC]->MatePosition << " \t " << g[*itC]->MapQuality << std::endl;
2018+ }
2019+ }
20122020 }
20132021 }
20142022 }
20152023
2024+ // Close dump PE file
2025+ if (dumpPe) dumpPeFile.close ();
2026+
20162027 // Split-read search
20172028 if (peMapping) {
20182029 if (boost::filesystem::exists (c.genome ) && boost::filesystem::is_regular_file (c.genome ) && boost::filesystem::file_size (c.genome ))
@@ -2115,6 +2126,7 @@ int main(int argc, char **argv) {
21152126 boost::program_options::options_description hidden (" Hidden options" );
21162127 hidden.add_options ()
21172128 (" input-file" , boost::program_options::value< std::vector<boost::filesystem::path> >(&c.files ), " input file" )
2129+ (" pe-dump,p" , boost::program_options::value<boost::filesystem::path>(&c.peDump )->default_value (" " ), " outfile to dump PE info" )
21182130 (" epsilon,e" , boost::program_options::value<float >(&c.epsilon )->default_value (0.1 ), " allowed epsilon deviation of PE vs. SR deletion" )
21192131 (" pe-fraction,c" , boost::program_options::value<float >(&c.percentAbnormal )->default_value (0.0 ), " fixed fraction c of discordant PEs, for c=0 MAD cutoff is used" )
21202132 (" num-split,n" , boost::program_options::value<unsigned int >(&c.minimumSplitRead )->default_value (2 ), " minimum number of splitted reads" )
0 commit comments