Skip to content

Commit 0374dae

Browse files
committed
Added paired-end dump option.
1 parent 2fc7d1a commit 0374dae

File tree

1 file changed

+67
-55
lines changed

1 file changed

+67
-55
lines changed

src/delly.cpp

Lines changed: 67 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Contact: Tobias Rausch (rausch@embl.de)
6363
#include "spanning.h"
6464
#include "coverage.h"
6565
#include "junction.h"
66+
#include "fasta_reader.h"
6667

6768
#include <sys/types.h>
6869
#include <sys/stat.h>
@@ -88,6 +89,7 @@ struct Config {
8889
boost::filesystem::path vcffile;
8990
boost::filesystem::path genome;
9091
boost::filesystem::path exclude;
92+
boost::filesystem::path peDump;
9193
std::vector<boost::filesystem::path> files;
9294
};
9395

@@ -300,12 +302,12 @@ _translateSVCoordinates(TUSize refSize, TUSize, TUSize refSizeLeft, TSize diagBe
300302
if (finalGapEnd < refSizeLeft) return true;
301303
if (sv.ct%2==0) {
302304
finalGapStart += sv.svStartBeg;
303-
if (sv.ct>=2) finalGapEnd = sv.svEndBeg + (finalGapEnd - refSizeLeft) - 1;
305+
if (sv.ct==2) finalGapEnd = sv.svEndBeg + (finalGapEnd - refSizeLeft) - 1;
304306
else finalGapEnd = sv.svEndBeg + (refSize - finalGapEnd) + 1;
305307
} else {
306-
if (sv.ct>=2) finalGapStart += sv.svEndBeg;
307-
else finalGapStart = sv.svEndBeg + (refSizeLeft - finalGapStart);
308-
finalGapEnd = sv.svStartBeg + (finalGapEnd - refSizeLeft) - 1;
308+
finalGapStart = sv.svEndBeg + (refSizeLeft - finalGapStart);
309+
if (sv.ct==1) finalGapEnd = sv.svStartBeg + (finalGapEnd - refSizeLeft) - 1;
310+
else finalGapEnd = sv.svStartBeg + (refSize - finalGapEnd) + 1;
309311
}
310312
if (sv.ct%2!=0) {
311313
TUSize tmpStart = finalGapStart;
@@ -378,8 +380,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
378380
typename TReadSet::const_iterator splitIter = splitReadSet.begin();
379381
for(;splitIter!=splitReadSet.end();++splitIter) {
380382
TSequence read;
381-
std::string::const_iterator readIt=splitIter->begin();
382-
for(;readIt!=splitIter->end();++readIt) read.push_back(dna5_encode[int(*readIt)]);
383+
for(std::string::const_iterator readIt=splitIter->begin(); readIt!=splitIter->end(); ++readIt) read.push_back(dna5_encode[int(*readIt)]);
383384
unsigned int readLen = read.size();
384385

385386
// Count kmer's
@@ -421,7 +422,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
421422
sC.read = read;
422423
sC.forwardRead = alignDir;
423424
readVec.push_back(sC);
424-
//std::cout << seqIt->chrName << ',' << sC.offset << ',' << sC.lastKmer << ',' << sC.diag << ',' << sC.forwardRead << std::endl;
425+
//std::cout << sC.offset << ',' << sC.lastKmer << ',' << sC.diag << ',' << sC.forwardRead << ',' << *splitIter << ',' << readLen << std::endl;
425426
}
426427
}
427428
}
@@ -445,6 +446,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
445446
unsigned int bestBoundE = 0;
446447
unsigned int bound = 1;
447448
unsigned int boundS = 0;
449+
int readLenInit = readVecIt->read.size();
448450
++readVecIt;
449451
for(;readVecIt!=readVecItEnd;++readVecIt, ++bound) {
450452
// Read pairs should support the same offset and the last kmer should be sufficiently close (allow one mismatch in the last kmer)
@@ -463,7 +465,7 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
463465
support = 1;
464466
}
465467
}
466-
if (_betterSplit(refSize, refSizeRight, refSizeLeft, oldDiag, oldOffset, skippedLength, initialLength, c.epsilon, support, bestSupport, sv.ct, readVecIt->read.size(), svType)) {
468+
if (_betterSplit(refSize, refSizeRight, refSizeLeft, oldDiag, oldOffset, skippedLength, initialLength, c.epsilon, support, bestSupport, sv.ct, readLenInit, svType)) {
467469
bestSupport = support;
468470
bestBoundS = boundS;
469471
bestBoundE = bound;
@@ -644,15 +646,12 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
644646
}
645647

646648
// Debug output
647-
//typename TConsSeq::const_iterator csSeqIt = ref1.begin();
648-
//typename TConsSeq::const_iterator csSeqItEnd = ref1.end();
649-
//for(;csSeqIt != csSeqItEnd; ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
650-
//std::cout << std::endl;
651-
//csSeqIt = ref2.begin();
652-
//csSeqItEnd = ref2.end();
653-
//for(;csSeqIt != csSeqItEnd; ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
654-
//std::cout << std::endl;
655-
649+
//if (sv.id==2) {
650+
// for(typename TConsSeq::const_iterator csSeqIt = ref1.begin();csSeqIt != ref1.end(); ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
651+
// std::cout << std::endl;
652+
// for(typename TConsSeq::const_iterator csSeqIt = ref2.begin();csSeqIt != ref2.end(); ++csSeqIt) std::cout << dna5_decode[(unsigned int) *csSeqIt];
653+
// std::cout << std::endl;
654+
//}
656655

657656
// Calculate forward and reverse dynamic programming matrix
658657
int matchScore = 2;
@@ -678,23 +677,16 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
678677
}
679678

680679
// Debug code
681-
//typedef std::vector<char> TSequence;
682-
//typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TSequence> TFastaRecord;
683-
//std::vector<TFastaRecord> align;
684-
//globalNwAlignment(align, ref1, consSeq, sc, alConf);
685-
//TSequence::iterator alItTmp = align[0].seq.begin();
686-
//TSequence::iterator alItTmpEnd = align[0].seq.end();
687-
//std::cout << std::endl;
688-
//for(unsigned int i = 0; i<100; ++i) std::cout << (i % 10);
689-
//std::cout << std::endl;
690-
//for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
691-
//std::cout << std::endl;
692-
//alItTmp = align[1].seq.begin();
693-
//alItTmpEnd = align[1].seq.end();
694-
//for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
695-
//std::cout << std::endl;
696-
697-
680+
//if (sv.id==2) {
681+
// typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TSequence, void> TFastaRecord;
682+
// std::vector<TFastaRecord> align;
683+
// globalNwAlignment(align, ref1, consSeq, sc, alConf);
684+
// std::cout << std::endl;
685+
// for(TSequence::iterator alItTmp = align[0].seq.begin();alItTmp != align[0].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
686+
// std::cout << std::endl;
687+
// for(TSequence::iterator alItTmp = align[1].seq.begin();alItTmp != align[1].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
688+
// std::cout << std::endl;
689+
//}
698690

699691
reverseComplement(ref2);
700692
reverseComplement(consSeq);
@@ -715,18 +707,16 @@ void searchSplit(TConfig const& c, TStructuralVariantRecord& sv, std::string con
715707
}
716708

717709
// Debug code
718-
//align.clear();
719-
//globalNwAlignment(align, ref2, consSeq, sc, alConf);
720-
//alItTmp = align[0].seq.begin();
721-
//alItTmpEnd = align[0].seq.end();
722-
//std::cout << std::endl;
723-
//for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
724-
//std::cout << std::endl;
725-
//alItTmp = align[1].seq.begin();
726-
//alItTmpEnd = align[1].seq.end();
727-
//for(;alItTmp != alItTmpEnd; ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
728-
//std::cout << std::endl;
729-
710+
//if (sv.id==2) {
711+
// typedef FastaRecord<std::string, unsigned int, Dna5GapAlphabet, TSequence, void> TFastaRecord;
712+
// std::vector<TFastaRecord> align;
713+
// globalNwAlignment(align, ref2, consSeq, sc, alConf);
714+
// std::cout << std::endl;
715+
// for(TSequence::iterator alItTmp = align[0].seq.begin();alItTmp != align[0].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
716+
// std::cout << std::endl;
717+
// for(TSequence::iterator alItTmp = align[1].seq.begin();alItTmp != align[1].seq.end(); ++alItTmp) std::cout << dna5gap_decode[(int) *alItTmp];
718+
// std::cout << std::endl;
719+
//}
730720

731721
// Get back to the true orientation
732722
reverseComplement(consSeq);
@@ -1374,7 +1364,7 @@ findPutativeSplitReads(TConfig const& c, std::vector<TStructuralVariantRecord>&
13741364

13751365
// Minimum clip size length fulfilled?
13761366
int minClipSize = (int) (log10(al.QueryBases.size()) * 10);
1377-
#pragma omp critical
1367+
#pragma omp critical
13781368
{
13791369
if (clipSizes[0]>=minClipSize) splitReadSet.insert(al.QueryBases);
13801370
}
@@ -1613,9 +1603,9 @@ _updateClique(TBamRecordIterator const& el, TSize& svStart, TSize& svEnd, TSize&
16131603

16141604
template<typename TConfig, typename TSampleLibrary, typename TSVs, typename TCountMap, typename TTag>
16151605
inline void
1616-
_annotateJunctionReads(TConfig const& c, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& junctionCountMap, SVType<TTag>)
1606+
_annotateJunctionReads(TConfig const& c, TSampleLibrary& sampleLib, TSVs& svs, TCountMap& junctionCountMap, SVType<TTag> svType)
16171607
{
1618-
annotateJunctionReads(c.files, c.genome, c.minGenoQual, sampleLib, svs, junctionCountMap);
1608+
annotateJunctionReads(c.files, c.genome, c.minGenoQual, sampleLib, svs, junctionCountMap, svType);
16191609
}
16201610

16211611

@@ -1682,15 +1672,15 @@ inline int run(Config const& c, TSVType svType) {
16821672
}
16831673

16841674
// Get library parameters
1685-
#pragma omp parallel for default(shared)
1675+
#pragma omp parallel for default(shared)
16861676
for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
16871677
// Get a sample name
16881678
std::string sampleName(c.files[file_c].stem().string());
16891679

16901680
// Get library parameters and overall maximum insert size
16911681
TLibraryMap libInfo;
16921682
getLibraryParams(c.files[file_c], libInfo, c.percentAbnormal, c.madCutoff);
1693-
#pragma omp critical
1683+
#pragma omp critical
16941684
{
16951685
TLibraryMap::const_iterator libIter=libInfo.begin();
16961686
for(;libIter!=libInfo.end();++libIter)
@@ -1699,6 +1689,15 @@ inline int run(Config const& c, TSVType svType) {
16991689
}
17001690
}
17011691

1692+
// Dump PE
1693+
bool dumpPe=false;
1694+
std::ofstream dumpPeFile;
1695+
if (c.peDump.string().size()) {
1696+
dumpPe=true;
1697+
dumpPeFile.open(c.peDump.string().c_str());
1698+
dumpPeFile << "#id\tchr\tpos\tmatechr\tmatepos\tmapq" << std::endl;
1699+
}
1700+
17021701
// Parse exclude interval list
17031702
BamTools::RefVector::const_iterator itRef = references.begin();
17041703
std::vector<bool> validChr;
@@ -1922,7 +1921,7 @@ inline int run(Config const& c, TSVType svType) {
19221921
//for(TCompSize::const_iterator compIt = compSize.begin(); compIt!=compSize.end(); ++compIt) std::cerr << *compIt << std::endl;
19231922

19241923
// Iterate each component
1925-
#pragma omp parallel for default(shared)
1924+
#pragma omp parallel for default(shared)
19261925
for(int compIt = 0; compIt < numComp; ++compIt) {
19271926
if (compSize[compIt]<2) continue;
19281927
typedef boost::graph_traits<Graph>::vertex_descriptor TVertexDescriptor;
@@ -1991,8 +1990,7 @@ inline int run(Config const& c, TSVType svType) {
19911990
svRec.peSupport = clique.size();
19921991
svRec.wiggle = abs(wiggle);
19931992
std::vector<uint16_t> mapQV;
1994-
typename TCliqueMembers::const_iterator itC = clique.begin();
1995-
for(;itC!=clique.end();++itC) mapQV.push_back(g[*itC]->MapQuality);
1993+
for(typename TCliqueMembers::const_iterator itC = clique.begin(); itC!=clique.end(); ++itC) mapQV.push_back(g[*itC]->MapQuality);
19961994
std::sort(mapQV.begin(), mapQV.end());
19971995
svRec.peMapQuality = mapQV[mapQV.size()/2];
19981996
if ((refIndex==clusterMateRefID) && (svRec.svStartEnd > svRec.svEndBeg)) {
@@ -2004,15 +2002,28 @@ inline int run(Config const& c, TSVType svType) {
20042002
svRec.srAlignQuality=0;
20052003
svRec.precise=false;
20062004
svRec.ct=connectionType;
2007-
#pragma omp critical
2005+
#pragma omp critical
20082006
{
20092007
svRec.id = clique_count++;
20102008
svs.push_back(svRec);
20112009
}
2010+
2011+
// Dump PEs
2012+
#pragma omp critical
2013+
if (dumpPe) {
2014+
for(typename TCliqueMembers::const_iterator itC=clique.begin(); itC!=clique.end(); ++itC) {
2015+
std::stringstream id;
2016+
id << _addID(svType) << std::setw(8) << std::setfill('0') << svRec.id;
2017+
dumpPeFile << id.str() << "\t" << references[g[*itC]->RefID].RefName << "\t" << g[*itC]->Position << "\t" << references[g[*itC]->MateRefID].RefName << "\t" << g[*itC]->MatePosition << "\t" << g[*itC]->MapQuality << std::endl;
2018+
}
2019+
}
20122020
}
20132021
}
20142022
}
20152023

2024+
// Close dump PE file
2025+
if (dumpPe) dumpPeFile.close();
2026+
20162027
// Split-read search
20172028
if (peMapping) {
20182029
if (boost::filesystem::exists(c.genome) && boost::filesystem::is_regular_file(c.genome) && boost::filesystem::file_size(c.genome))
@@ -2115,6 +2126,7 @@ int main(int argc, char **argv) {
21152126
boost::program_options::options_description hidden("Hidden options");
21162127
hidden.add_options()
21172128
("input-file", boost::program_options::value< std::vector<boost::filesystem::path> >(&c.files), "input file")
2129+
("pe-dump,p", boost::program_options::value<boost::filesystem::path>(&c.peDump)->default_value(""), "outfile to dump PE info")
21182130
("epsilon,e", boost::program_options::value<float>(&c.epsilon)->default_value(0.1), "allowed epsilon deviation of PE vs. SR deletion")
21192131
("pe-fraction,c", boost::program_options::value<float>(&c.percentAbnormal)->default_value(0.0), "fixed fraction c of discordant PEs, for c=0 MAD cutoff is used")
21202132
("num-split,n", boost::program_options::value<unsigned int>(&c.minimumSplitRead)->default_value(2), "minimum number of splitted reads")

0 commit comments

Comments
 (0)