@@ -8,26 +8,31 @@ Cluster::Cluster(Options* opt){
88}
99
1010Cluster::~Cluster (){
11- for ( int i= 0 ; i< mPairs . size (); i++) {
12- delete mPairs [i];
13- mPairs [i] = NULL ;
11+ map<string, Pair*>::iterator iter;
12+ for (iter = mPairs . begin (); iter!= mPairs . end (); iter++) {
13+ delete iter-> second ;
1414 }
1515}
1616
1717void Cluster::addPair (Pair* p){
18- mPairs .push_back (p);
18+ string qname = p->getQName ();
19+ if (mPairs .count (qname)>0 )
20+ delete mPairs [qname];
21+ mPairs [qname] = p;
1922}
2023
2124void Cluster::dump (){
22- for (int i=0 ; i<mPairs .size (); i++) {
23- mPairs [i]->dump ();
25+ map<string, Pair*>::iterator iter;
26+ for (iter = mPairs .begin (); iter!=mPairs .end (); iter++) {
27+ iter->second ->dump ();
2428 }
2529}
2630
2731bool Cluster::matches (Pair* p){
28- for (int i=0 ; i<mPairs .size (); i++) {
29- if (mPairs [i]->isDupWith (p))
30- return true ;
32+ map<string, Pair*>::iterator iter;
33+ for (iter = mPairs .begin (); iter!=mPairs .end (); iter++) {
34+ if (iter->second ->isDupWith (p))
35+ return true ;
3136 }
3237 return false ;
3338}
@@ -93,8 +98,9 @@ int Cluster::umiDiff(const string& umi1, const string& umi2) {
9398vector<Pair*> Cluster::clusterByUMI (int umiDiffThreshold) {
9499 vector<Cluster*> subClusters;
95100 map<string, int > umiCount;
96- for (int i=0 ; i<mPairs .size (); i++) {
97- string umi = mPairs [i]->getUMI ();
101+ map<string, Pair*>::iterator iterOfPairs;
102+ for (iterOfPairs = mPairs .begin (); iterOfPairs!=mPairs .end (); iterOfPairs++) {
103+ string umi = iterOfPairs->second ->getUMI ();
98104 umiCount[umi]++;
99105 }
100106 while (mPairs .size ()>0 ) {
@@ -112,9 +118,9 @@ vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
112118 Cluster* c = new Cluster (mOptions );
113119
114120 // create the group by the top UMI
115- vector< Pair*>::iterator piter;
121+ map<string, Pair*>::iterator piter;
116122 for (piter = mPairs .begin (); piter!=mPairs .end ();){
117- Pair* p = * piter;
123+ Pair* p = piter-> second ;
118124 string umi = p->getUMI ();
119125 if (umiDiff (umi, topUMI) <= umiDiffThreshold) {
120126 c->addPair (p);
@@ -146,13 +152,6 @@ vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
146152}
147153
148154Pair* Cluster::consensusMerge () {
149- /* if(mPairs.size() == 1) {
150- Pair* p = mPairs[mPairs.size()-1];
151- p->mMergeReads = mPairs.size();
152- mPairs.pop_back();
153- return p;
154- }*/
155-
156155 int leftDiff = 0 ;
157156 int rightDiff = 0 ;
158157 bam1_t * left = consensusMergeBam (true , leftDiff);
@@ -181,45 +180,88 @@ Pair* Cluster::consensusMerge() {
181180}
182181
183182bam1_t * Cluster::consensusMergeBam (bool isLeft, int & diff) {
183+ if (mPairs .size () < mOptions ->clusterSizeReq ) {
184+ return NULL ;
185+ }
186+ vector<Pair*> allPairs;
187+ map<string, Pair*>::iterator iterOfPairs;
188+ for (iterOfPairs = mPairs .begin (); iterOfPairs!=mPairs .end (); iterOfPairs++) {
189+ allPairs.push_back (iterOfPairs->second );
190+ }
191+ if (mPairs .size () > mOptions ->skipLowComplexityClusterThreshold ) {
192+ map<string, int > cigars;
193+ bam1_t * firstRead = NULL ;
194+ for (iterOfPairs = mPairs .begin (); iterOfPairs!=mPairs .end (); iterOfPairs++) {
195+ Pair* p = iterOfPairs->second ;
196+ bam1_t * b = p->mLeft ;
197+ if (!isLeft)
198+ b = p->mRight ;
199+ if (b) {
200+ string qname = BamUtil::getQName (b);
201+ if (cigars.count (qname) == 0 )
202+ cigars[qname] = 1 ;
203+ else
204+ cigars[qname]++;
205+ if (!firstRead)
206+ firstRead = b;
207+ }
208+ }
209+ // this is abnormal, usually due to mapping result of low complexity reads
210+ if (cigars.size () > mPairs .size () * 0.5 && firstRead) {
211+ string seq = BamUtil::getSeq (firstRead);
212+ int diffNeighbor = 0 ;
213+ for (int i=0 ;i<seq.length ()-1 ;i++) {
214+ if (seq[i] != seq[i+1 ])
215+ diffNeighbor++;
216+ }
217+ if (diffNeighbor < seq.length ()*0.5 ) {
218+ if (mOptions ->debug ) {
219+ cerr << " Skipping " << mPairs .size () << " low complexity reads like: " << seq << endl;
220+ }
221+ return NULL ;
222+ }
223+ }
224+ }
225+
184226 bool leftReadMode = isLeft;
185227 // if processing right reads, check if this group is aligned by left
186228 if (!isLeft) {
187229 bool leftAligned = true ;
188230 int lastPos = -1 ;
189- for (int i=0 ; i<mPairs .size (); i++) {
190- if (mPairs [i]->mRight ) {
191- if (lastPos >= 0 && mPairs [i]->mRight ->core .pos != lastPos) {
231+ for (int i=0 ; i<allPairs .size (); i++) {
232+ if (allPairs [i]->mRight ) {
233+ if (lastPos >= 0 && allPairs [i]->mRight ->core .pos != lastPos) {
192234 leftAligned = false ;
193235 break ;
194236 }
195- lastPos = mPairs [i]->mRight ->core .pos ;
237+ lastPos = allPairs [i]->mRight ->core .pos ;
196238 }
197239 }
198240 // if it's left aligned, then process them as left reads
199241 if (leftAligned)
200242 leftReadMode = true ;
201243 }
202244 // first we get a read that is most contained by other reads
203- vector<int > containedByList (mPairs .size (), 0 );
204- for (int i=0 ; i<mPairs .size (); i++) {
245+ vector<int > containedByList (allPairs .size (), 0 );
246+ for (int i=0 ; i<allPairs .size (); i++) {
205247 bam1_t * part = NULL ;
206248 if (isLeft)
207- part = mPairs [i]->mLeft ;
249+ part = allPairs [i]->mLeft ;
208250 else
209- part = mPairs [i]->mRight ;
251+ part = allPairs [i]->mRight ;
210252 if (part == NULL )
211253 continue ;
212254
213255 int containedBy = 1 ;
214256
215- for (int j=0 ; j<mPairs .size (); j++) {
257+ for (int j=0 ; j<allPairs .size (); j++) {
216258 if (i == j)
217259 continue ;
218260 bam1_t * whole = NULL ;
219261 if (isLeft)
220- whole = mPairs [j]->mLeft ;
262+ whole = allPairs [j]->mLeft ;
221263 else
222- whole = mPairs [j]->mRight ;
264+ whole = allPairs [j]->mRight ;
223265 if (whole == NULL )
224266 continue ;
225267
@@ -248,15 +290,15 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
248290 int thisLen = 0 ;
249291 int curLen = 0 ;
250292 if (isLeft) {
251- if (mPairs [i]->mLeft )
252- thisLen = mPairs [i]->mLeft ->core .l_qseq ;
253- if (mPairs [mostContainedById]->mLeft )
254- curLen = mPairs [mostContainedById]->mLeft ->core .l_qseq ;
293+ if (allPairs [i]->mLeft )
294+ thisLen = allPairs [i]->mLeft ->core .l_qseq ;
295+ if (allPairs [mostContainedById]->mLeft )
296+ curLen = allPairs [mostContainedById]->mLeft ->core .l_qseq ;
255297 } else {
256- if (mPairs [i]->mRight )
257- thisLen = mPairs [i]->mRight ->core .l_qseq ;
258- if (mPairs [mostContainedById]->mRight )
259- curLen = mPairs [mostContainedById]->mRight ->core .l_qseq ;
298+ if (allPairs [i]->mRight )
299+ thisLen = allPairs [i]->mRight ->core .l_qseq ;
300+ if (allPairs [mostContainedById]->mRight )
301+ curLen = allPairs [mostContainedById]->mRight ->core .l_qseq ;
260302 }
261303 if (thisLen < curLen) {
262304 mostContainedByNum = containedByList[i];
@@ -266,22 +308,23 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
266308 }
267309
268310 // no marjority
269- if (mostContainedByNum < containedByList.size ()*0.4 && containedByList.size () != 1 )
311+ if (mostContainedByNum < containedByList.size ()*0.4 && containedByList.size () != 1 ) {
270312 return NULL ;
313+ }
271314
272315 bam1_t * out = NULL ;
273316 char * outScore = NULL ;
274317 if (isLeft) {
275- out = mPairs [mostContainedById]->mLeft ;
276- outScore = mPairs [mostContainedById]->getLeftScore ();
318+ out = allPairs [mostContainedById]->mLeft ;
319+ outScore = allPairs [mostContainedById]->getLeftScore ();
277320 // make it null so that it will not be deleted
278- mPairs [mostContainedById]->mLeft = NULL ;
321+ allPairs [mostContainedById]->mLeft = NULL ;
279322 }
280323 else {
281- out = mPairs [mostContainedById]->mRight ;
282- outScore = mPairs [mostContainedById]->getRightScore ();
324+ out = allPairs [mostContainedById]->mRight ;
325+ outScore = allPairs [mostContainedById]->getRightScore ();
283326 // make it null so that it will not be deleted
284- mPairs [mostContainedById]->mRight = NULL ;
327+ allPairs [mostContainedById]->mRight = NULL ;
285328 }
286329
287330 if (out == NULL ) {
@@ -294,18 +337,18 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
294337 reads.push_back (out);
295338 scores.push_back (outScore);
296339
297- for (int j=0 ; j<mPairs .size (); j++) {
340+ for (int j=0 ; j<allPairs .size (); j++) {
298341 if (mostContainedById == j)
299342 continue ;
300343 bam1_t * read = NULL ;
301344 char * score = NULL ;
302345 if (isLeft) {
303- read = mPairs [j]->mLeft ;
304- score = mPairs [j]->getLeftScore ();
346+ read = allPairs [j]->mLeft ;
347+ score = allPairs [j]->getLeftScore ();
305348 }
306349 else {
307- read = mPairs [j]->mRight ;
308- score = mPairs [j]->getRightScore ();
350+ read = allPairs [j]->mRight ;
351+ score = allPairs [j]->getRightScore ();
309352 }
310353 if (read == NULL || score == NULL )
311354 continue ;
@@ -322,32 +365,6 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
322365 return NULL ;
323366 }
324367
325- // if the sequences are right ones of pairs, we check whether it a completely a chaos
326- /* if(!isLeft) {
327- int bothEndNotAligned = 0;
328- for(int r=0; r<reads.size(); r++) {
329- // left aligned
330- if(reads[r]->core.pos == out->core.pos && BamUtil::isPartOf(out, reads[r], true))
331- continue;
332- // right aligned
333- if(reads[r]->core.pos + bam_cigar2rlen(reads[r]->core.n_cigar, (uint32_t *)bam_get_cigar(reads[r])) == out->core.pos + bam_cigar2rlen(out->core.n_cigar, (uint32_t *)bam_get_cigar(out)))
334- continue;
335-
336- // both not aligned
337- bothEndNotAligned++;
338- }
339-
340- if(bothEndNotAligned*2 >= reads.size()) {
341- cerr << "Chaos of " << reads.size() << " reads: " << BamUtil::getQName(out) << endl;
342- for(int r=0; r<reads.size(); r++) {
343- cerr << reads[r]->core.pos << "," << reads[r]->core.mpos << "," << reads[r]->core.isize << "," << reads[r]->core.l_qseq << "," << BamUtil::getCigar(reads[r]) << endl;
344- }
345- bam_destroy1(out);
346- out = NULL;
347- return NULL;
348- }
349- }*/
350-
351368 diff = makeConsensus (reads, out, scores, leftReadMode);
352369
353370 return out;
@@ -615,36 +632,20 @@ void Cluster::addRead(bam1_t* b) {
615632 if (b->core .isize > 0 ) {
616633 Pair* p = new Pair (mOptions );
617634 p->setLeft (b);
618- mPairs . push_back (p) ;
635+ mPairs [p-> getQName ()]=p ;
619636 return ;
620637 }
621- // right or unproper paired
622- bool found = false ;
623- string qname = BamUtil::getQName (b);
624- for (int i=0 ; i<mPairs .size (); i++) {
625- Pair* p = mPairs [i];
626- if (p->mLeft && !p->mRight ) {
627- if (p->getQName () == qname) {
628- p->setRight (b);
629- found = true ;
630- break ;
631- }
632- } else if (p->mRight && !p->mLeft ) {
633- if (p->getQName () == qname) {
634- p->setLeft (b);
635- found = true ;
636- break ;
637- }
638- }
639- }
640638
641- if (!found) {
639+ string qname = BamUtil::getQName (b);
640+ if (mPairs .count (qname) > 0 )
641+ mPairs [qname]->setRight (b);
642+ else {
642643 Pair* p = new Pair (mOptions );
643644 if (b->core .isize < 0 )
644645 p->setRight (b);
645646 else
646647 p->setLeft (b);
647- mPairs . push_back (p) ;
648+ mPairs [qname]=p ;
648649 }
649650}
650651
0 commit comments