Skip to content

Commit fea52d3

Browse files
committed
version 10.6
1 parent 5df72ec commit fea52d3

File tree

5 files changed

+280
-95
lines changed

5 files changed

+280
-95
lines changed

src/ChrCopyNumber.cpp

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ ChrCopyNumber::ChrCopyNumber(int windowSize, int chrLength, std::string const& c
6767
isMedianCalculated_ = false;
6868
isSmoothed_ = false;
6969
ploidy_=NA;
70+
float meanTargetRegionLength=0;
7071
if (targetBed == "") {
7172
if (windowSize ==0) {
7273
cerr << "Error: windowSize is set to Zero\n";
@@ -150,7 +151,7 @@ ChrCopyNumber::ChrCopyNumber(int windowSize, int chrLength, std::string const& c
150151
oss.flush();
151152
genes_names.push_back(oss.str());
152153
}
153-
154+
meanTargetRegionLength+=atoi(endstr.c_str())-atoi(start.c_str());
154155
++length_;
155156

156157
}
@@ -162,10 +163,14 @@ ChrCopyNumber::ChrCopyNumber(int windowSize, int chrLength, std::string const& c
162163
}
163164

164165
exons_Countchr_ = length_;
165-
166+
meanTargetRegionLength/=length_;
166167
readCount_ = vector<float>(exons_Countchr_,0);
167168

168169
cout << "Number of exons analysed in chromosome "<< chromosome_ << " : " << exons_Countchr_ << "\n";
170+
cout << "Average exon length in chromosome "<< chromosome_ << " : " << meanTargetRegionLength << "\n";
171+
if (meanTargetRegionLength <30) {
172+
cerr << "WARNING: check your file with targeted regions: the average length of targeted regions is unexpectedly short\n";
173+
}
169174

170175
}else {
171176
std::cerr << "Failed to open " << captureFile << "\n";
@@ -415,6 +420,23 @@ void ChrCopyNumber::removeLowReadCountWindows(const int RCThresh) {
415420
}
416421
}
417422

423+
void ChrCopyNumber::fillInRatio(bool islog) {
424+
if ((int)ratio_.size()!=length_)
425+
ratio_.resize(length_);
426+
427+
for (int i = 0; i<length_; i++) {
428+
if (readCount_[i]>=0 && !(mappabilityProfile_.size() > 0 && mappabilityProfile_[i] <= minMappabilityPerWindow)) {
429+
if (islog) {
430+
ratio_[i] = log(readCount_[i]+1)/log(2.0);
431+
}else {
432+
ratio_[i] = readCount_[i];
433+
}
434+
} else {
435+
ratio_[i] = NA;
436+
}
437+
}
438+
}
439+
418440
void ChrCopyNumber::calculateRatioLog(ChrCopyNumber control, const double * a, const int degree){
419441
if ((int)ratio_.size()!=length_)
420442
ratio_.resize(length_);
@@ -623,15 +645,36 @@ void ChrCopyNumber::recalculateRatio (float constant) {
623645
ratio_[i] /= constant;
624646
}
625647

626-
void ChrCopyNumber::recalculateRatioWithContam (float contamination, float normGenytype) { //normGenytype==1 if AB, normGenytype==0.5 if A
648+
void ChrCopyNumber::recalculateLogRatio (float constant) {
627649
for (int i = 0; i<length_; i++)
628-
if (ratio_[i] != NA) {
650+
if (ratio_[i] != NA)
651+
ratio_[i] -= constant;
652+
}
653+
654+
void ChrCopyNumber::recalculateRatioWithContam (float contamination, float normGenytype, bool isLogged) { //normGenytype==1 if AB, normGenytype==0.5 if A
655+
if (!isLogged) {
656+
for (int i = 0; i<length_; i++)
657+
if (ratio_[i] != NA) {
629658
//ratio_[i] = (ratio_[i]-contamination*normGenytype)/(1-contamination); //correct only for ploidy 2
630-
ratio_[i] = (ratio_[i]*(1-contamination+2*contamination/ploidy_) -contamination*normGenytype/ploidy_*2)/(1-contamination);
659+
ratio_[i] = (ratio_[i]*(1-contamination+2*contamination/ploidy_) -contamination*normGenytype/ploidy_*2)/(1-contamination);
631660

632661
if (ratio_[i]<0)
633662
ratio_[i] = 0;
634663
}
664+
} else {
665+
for (int i = 0; i<length_; i++)
666+
if (ratio_[i] != NA) {
667+
float realCopy = pow(2,ratio_[i]);
668+
ratio_[i] = (realCopy*(1-contamination+2*contamination/ploidy_) -contamination*normGenytype/ploidy_*2)/(1-contamination);
669+
if (ratio_[i]<0)
670+
ratio_[i] = NA;
671+
else {
672+
ratio_[i]=log2(ratio_[i]);
673+
}
674+
675+
}
676+
677+
}
635678
}
636679

637680

@@ -659,7 +702,7 @@ int ChrCopyNumber::getCoveredPart(int breakPointStart, int breakPointEnd) { //fo
659702
return lengthCovered;
660703
}
661704

662-
void ChrCopyNumber::calculateCopyNumberMedian(int ploidy, int minCNAlength, bool noisyData, bool CompleteGenomicsData){ //create median profiles using 'bpfinal_' and store them in medianProfile_, info about medians themselves is stored in medianValues_ and about SD in sd_, lengths of fragments in bpLengths_
705+
void ChrCopyNumber::calculateCopyNumberMedian(int ploidy, int minCNAlength, bool noisyData, bool CompleteGenomicsData, bool isLogged){ //create median profiles using 'bpfinal_' and store them in medianProfile_, info about medians themselves is stored in medianValues_ and about SD in sd_, lengths of fragments in bpLengths_
663706
if (ploidy!=ploidy_) {
664707
cerr << "..Warning: in calculateCopyNumberMedian() class's ploidy is different from "<<ploidy<<"\n";
665708
ploidy_=ploidy;
@@ -733,8 +776,11 @@ void ChrCopyNumber::calculateCopyNumberMedian(int ploidy, int minCNAlength, bool
733776

734777
bool ifHomoz = false;
735778
float locMedian=NA;
736-
if (int(data.size())>=minCNAlength && data.size()>0)
779+
if (int(data.size())>=minCNAlength && data.size()>0) {
737780
locMedian = get_median(data); //including the last point
781+
if (isLogged)
782+
locMedian=pow(2, locMedian);
783+
}
738784
if (isBAFpresent && notNA > 100 && locMedian < 1 && noisyData ) {
739785
vector<string>heteroValuesPerWindowStrings = split(BAFValuesInTheSegment, ';');
740786
int numberofBAFpoints =heteroValuesPerWindowStrings.size();
@@ -768,6 +814,8 @@ void ChrCopyNumber::calculateCopyNumberMedian(int ploidy, int minCNAlength, bool
768814
}
769815
} else {
770816
median = get_median(data); //including the last point
817+
if (isLogged)
818+
median=pow(2, median);
771819
if (isBAFpresent) {
772820
// if (dataBAF.size()>0)
773821
// medianBAF = get_median(dataBAF);
@@ -1209,6 +1257,18 @@ void ChrCopyNumber::deleteFlanks(int telo_centromeric_flanks) {
12091257

12101258
}
12111259

1260+
1261+
int ChrCopyNumber::removeLargeExons(float threshold) {
1262+
int howManyRemoved = 0;
1263+
for (int i =0; i< length_; i++) {
1264+
if (ends_[i]-coordinates_[i]>threshold) {
1265+
howManyRemoved++;
1266+
readCount_[i]=NA;
1267+
}
1268+
}
1269+
return howManyRemoved;
1270+
}
1271+
12121272
void ChrCopyNumber::recalcFlanks(int telo_centromeric_flanks, int minNumberOfWindows) {
12131273
int maxRegionLengthToDelete = int(telo_centromeric_flanks/step_);
12141274
for (int i = 0; i < (int)medianValues_.size(); i++) {

src/GenomeCopyNumber.cpp

Lines changed: 105 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -401,9 +401,9 @@ void GenomeCopyNumber::recalculateRatio (float contamination) {
401401
for ( it=chrCopyNumber_.begin() ; it != chrCopyNumber_.end(); it++ ) {
402402
if (sex_.compare("XY")==0 && (it->getChromosome().find("X")!=string::npos || it->getChromosome().find("Y")!=string::npos)) {
403403
//should take into account that normally one has only one copy of X and Y..
404-
it->recalculateRatioWithContam(contamination,0.5);
404+
it->recalculateRatioWithContam(contamination,0.5, isRatioLogged_);
405405
} else
406-
it->recalculateRatioWithContam(contamination,1);
406+
it->recalculateRatioWithContam(contamination,1, isRatioLogged_);
407407
}
408408

409409
}
@@ -1101,19 +1101,21 @@ long double GenomeCopyNumber::calculateRSS(int ploidy)
11011101
int index = findIndex(chrNumber);
11021102
int length = chrCopyNumber_[index].getLength();
11031103
for (int i = 0; i< length; i++) {
1104-
float observed = 0;
1105-
float expected = 0;
1106-
{
1107-
observed = chrCopyNumber_[index].getRatioAtBin(i);
1108-
expected = observed;
1104+
float observed = chrCopyNumber_[index].getRatioAtBin(i);
1105+
if (observed!=NA) {
1106+
if (isRatioLogged_) {
1107+
observed=pow(2,observed);
1108+
}
1109+
float expected = observed;
11091110
if (chrCopyNumber_[index].isMedianCalculated()) {
11101111
expected = chrCopyNumber_[index].getMedianProfileAtI(i);
11111112
if (chrCopyNumber_[index].isSmoothed())
11121113
expected = chrCopyNumber_[index].getSmoothedProfileAtI(i);
1113-
}
1114+
11141115
}
1115-
observedvalues.push_back(observed);
1116-
expectedvalues.push_back(expected);
1116+
observedvalues.push_back(observed);
1117+
expectedvalues.push_back(expected);
1118+
}
11171119
}
11181120
}
11191121

@@ -1126,7 +1128,12 @@ long double GenomeCopyNumber::calculateRSS(int ploidy)
11261128
RSS = RSS + (long double)pow(diff,2);
11271129
}
11281130
}
1129-
return RSS;
1131+
if (observedvalues.size()==0) {
1132+
return 0;
1133+
}
1134+
double normRSS = (RSS/observedvalues.size());
1135+
observedvalues.clear();expectedvalues.clear();
1136+
return normRSS;
11301137
}
11311138

11321139

@@ -1223,7 +1230,35 @@ void GenomeCopyNumber::calculateRatioUsingCG_Regression( GenomeCopyNumber & cont
12231230
//XXX
12241231
}
12251232

1226-
int GenomeCopyNumber::calculateRatio( GenomeCopyNumber & controlCopyNumber, int degree, bool intercept,bool logLogNorm) {
1233+
int GenomeCopyNumber::fillInRatio() {
1234+
vector <float> countValues;
1235+
vector<ChrCopyNumber>::iterator it;
1236+
1237+
for ( it=chrCopyNumber_.begin() ; it != chrCopyNumber_.end(); it++ ) {
1238+
if ((sex_.compare("XY")==0) && (it->getChromosome().find("X")!=string::npos || it->getChromosome().find("Y")!=string::npos)) {
1239+
//should take into account that normally one has only one copy of X and Y..
1240+
it->fillInRatio(isRatioLogged_);
1241+
} else {
1242+
it->fillInRatio(isRatioLogged_);
1243+
for (int i = 0; i< it->getLength(); i++) {
1244+
if (it->getValueAt(i)>0) {
1245+
countValues.push_back(it->getRatioAtBin(i));
1246+
}
1247+
}
1248+
}
1249+
}
1250+
float median=get_medianNotNA(countValues);
1251+
for ( it=chrCopyNumber_.begin() ; it != chrCopyNumber_.end(); it++ ) {
1252+
if (!isRatioLogged_) {
1253+
it->recalculateRatio(median);
1254+
} else {
1255+
it->recalculateLogRatio(median);
1256+
}
1257+
}
1258+
return 1;
1259+
}
1260+
1261+
int GenomeCopyNumber::calculateRatio( GenomeCopyNumber & controlCopyNumber, int degree, bool intercept) {
12271262

12281263

12291264
int maximalNumberOfIterations = 300;
@@ -1232,7 +1267,7 @@ int GenomeCopyNumber::calculateRatio( GenomeCopyNumber & controlCopyNumber, int
12321267

12331268
int successfulFit = 0;
12341269

1235-
if(logLogNorm) {
1270+
if(isRatioLogged_) {
12361271
intercept=1; degree=1;//because it is loglogscale
12371272
vector <float> y; //y ~ a0x+a1
12381273
vector <float> x;
@@ -2920,6 +2955,8 @@ void GenomeCopyNumber::printRatioBedGraph(std::string const& chr, std::ofstream
29202955

29212956
for (int i = 0; i< length; i++) {
29222957
value=chrCopyNumber_[index].getRatioAtBin(i);
2958+
if (isRatioLogged_ && value!=NA)
2959+
value=pow(2,value);
29232960
position=chrCopyNumber_[index].getCoordinateAtBin(i);
29242961
float valueToPrint;
29252962
if (chrCopyNumber_[index].isSmoothed())
@@ -2962,8 +2999,12 @@ void GenomeCopyNumber::printRatio(std::string const& chr, std::ofstream & file,
29622999
int length = chrCopyNumber_[index].getLength();
29633000
//cout <<length<<" == "<<chrCopyNumber_[index].getValues().size() <<"\n";
29643001
for (int i = 0; i< length; i++) {
2965-
if (printNA || chrCopyNumber_[index].getRatioAtBin(i)!=NA) {//process this this window
2966-
file << chrNumber <<"\t"<<chrCopyNumber_[index].getCoordinateAtBin(i)+1<<"\t"<<chrCopyNumber_[index].getRatioAtBin(i) ;
3002+
float ratioToPrint = chrCopyNumber_[index].getRatioAtBin(i);
3003+
if (isRatioLogged_ && ratioToPrint!= NA) {
3004+
ratioToPrint=pow(2,ratioToPrint);
3005+
}
3006+
if (printNA || ratioToPrint!=NA) {//process this this window
3007+
file << chrNumber <<"\t"<<chrCopyNumber_[index].getCoordinateAtBin(i)+1<<"\t"<<ratioToPrint ;
29673008
if (chrCopyNumber_[index].isMedianCalculated()) {
29683009
file << "\t"<<chrCopyNumber_[index].getMedianProfileAtI(i) ;
29693010
float valueToPrint;
@@ -3127,7 +3168,7 @@ void GenomeCopyNumber::calculateCopyNumberMedians (int minCNAlength, bool noisyD
31273168
vector<ChrCopyNumber>::iterator it;
31283169
for ( it=chrCopyNumber_.begin() ; it != chrCopyNumber_.end(); it++ ) {
31293170
cout << "..calculating medians for " << it->getChromosome()<< "\n";
3130-
it->calculateCopyNumberMedian(ploidy_, minCNAlength, noisyData, CompleteGenomicsData);
3171+
it->calculateCopyNumberMedian(ploidy_, minCNAlength, noisyData, CompleteGenomicsData, isRatioLogged_);
31313172
}
31323173
}
31333174

@@ -3186,6 +3227,8 @@ void GenomeCopyNumber::calculateSDAndMed(int ploidy, map <float,float> &sds,map
31863227
med = it->getMedianProfileAtI(i);
31873228
float level = round_by_ploidy(med, ploidy);
31883229
value = it->getRatioAtBin(i);
3230+
if (isRatioLogged_&& value!=NA)
3231+
value=pow(2,value);
31893232
if (value != NA) {
31903233
if (mymap.count(level) == 0) {
31913234
vector <float> a;
@@ -3223,6 +3266,8 @@ void GenomeCopyNumber::calculateSDs(int ploidy, map <float,float> &sds) {
32233266
med = it->getMedianProfileAtI(i);
32243267
float level = round_by_ploidy(med, ploidy);
32253268
value = it->getRatioAtBin(i);
3269+
if (isRatioLogged_&& value!=NA)
3270+
value=pow(2,value);
32263271
if (value != NA) {
32273272
if (mymap.count(level) == 0) {
32283273
vector <float> a;
@@ -3267,6 +3312,8 @@ float GenomeCopyNumber::calculateVarianceForNormalCopy(int ploidy) { //geting th
32673312
med = it->getMedianProfileAtI(i);
32683313
if ((med>lowBoundary)&&(med < highBoundary)) {
32693314
value = it->getRatioAtBin(i);
3315+
if (isRatioLogged_&& value!=NA)
3316+
value=pow(2,value);
32703317
if (value != NA) {
32713318
myfile << value-1 << "\n";
32723319
variance += (value-1)*(value-1);
@@ -3319,19 +3366,23 @@ float GenomeCopyNumber::evaluateContamination () {
33193366
int length = chrCopyNumber_[index].getLength();
33203367
for (int i = 0; i< length; i++) {
33213368
float observed = chrCopyNumber_[index].getRatioAtBin(i);
3322-
float expected = observed;
3323-
if (chrCopyNumber_[index].isMedianCalculated()) {
3324-
expected = chrCopyNumber_[index].getMedianProfileAtI(i) ;
3325-
if (chrCopyNumber_[index].isSmoothed() && WESanalysis == false)
3326-
expected = chrCopyNumber_[index].getSmoothedProfileAtI(i);
3327-
}
3328-
if (!(expected == 1 || expected <= 0 || expected >= 2 || observed > 3 || observed <= 0)
3329-
&& (((1>observed)&&(1>expected))||((1<observed)&&(1<expected)))) {// should it be something related to ploidy_ and not 2
3330-
float p = (observed-expected)/(observed-expected+2/ploidy_*(1-observed));
3331-
if (p>-0.5 && p<1.5) {
3332-
values.push_back(p);
3333-
weights.push_back(chrCopyNumber_[index].getFragmentLengths_notNA_At(i));
3334-
}
3369+
if (observed!=NA) {
3370+
if(isRatioLogged_)
3371+
observed=pow(2,observed);
3372+
float expected = observed;
3373+
if (chrCopyNumber_[index].isMedianCalculated()) {
3374+
expected = chrCopyNumber_[index].getMedianProfileAtI(i) ;
3375+
if (chrCopyNumber_[index].isSmoothed() && WESanalysis == false)
3376+
expected = chrCopyNumber_[index].getSmoothedProfileAtI(i);
3377+
}
3378+
if (!(expected == 1 || expected <= 0 || expected >= 2 || observed > 3 || observed <= 0)
3379+
&& (((1>observed)&&(1>expected))||((1<observed)&&(1<expected)))) {// should it be something related to ploidy_ and not 2
3380+
float p = (observed-expected)/(observed-expected+2/ploidy_*(1-observed));
3381+
if (p>-0.5 && p<1.5) {
3382+
values.push_back(p);
3383+
weights.push_back(chrCopyNumber_[index].getFragmentLengths_notNA_At(i));
3384+
}
3385+
}
33353386
}
33363387
}
33373388
}
@@ -3373,6 +3424,8 @@ float GenomeCopyNumber::evaluateContaminationwithLR () {
33733424
for (int i = 0; i< length; i++) {
33743425
float observed = chrCopyNumber_[index].getRatioAtBin(i);
33753426
if (observed!=NA) {
3427+
if (isRatioLogged_)
3428+
observed=pow(2,observed);
33763429
float expected = observed;
33773430
if (chrCopyNumber_[index].isMedianCalculated()) {
33783431
expected = round_by_ploidy(chrCopyNumber_[index].getMedianProfileAtI(i),ploidy_) ;
@@ -4075,7 +4128,26 @@ int GenomeCopyNumber::processReadWithBowtie(std::string const& inputFormat, std:
40754128
}
40764129
return 0;
40774130
}
4131+
float GenomeCopyNumber::removeLargeExons(float iqrToKeep) {
4132+
float maxLength = 0;
4133+
vector<ChrCopyNumber>::iterator it;
4134+
int totalNumberExons=0;
4135+
float numberOfRemovedExons = 0;
4136+
vector <float> exonLengths;
4137+
4138+
for ( it=chrCopyNumber_.begin() ; it != chrCopyNumber_.end(); it++ ) {
4139+
for (int i=0; i < it->getLength(); i++) {
4140+
exonLengths.push_back(it->getEndAtBin(i)-it->getCoordinateAtBin(i));
4141+
totalNumberExons++;
4142+
}
4143+
}
4144+
maxLength=get_iqr(exonLengths)/2*iqrToKeep+get_median(exonLengths);
40784145

4146+
for ( it=chrCopyNumber_.begin() ; it != chrCopyNumber_.end(); it++ ) {
4147+
numberOfRemovedExons+=it->removeLargeExons(maxLength);
4148+
}
4149+
return numberOfRemovedExons/totalNumberExons;
4150+
}
40794151
int GenomeCopyNumber::focusOnCapture (std::string const& captureFile) {
40804152
ifstream file (captureFile.c_str());
40814153
string line;
@@ -4191,6 +4263,10 @@ void GenomeCopyNumber::setWESanalysis(bool WESgiven)
41914263
WESanalysis = WESgiven;
41924264
}
41934265

4266+
void GenomeCopyNumber::setIfLogged(bool isRatioLogged) {
4267+
isRatioLogged_=isRatioLogged;
4268+
}
4269+
41944270
void GenomeCopyNumber::setmakingPileup(bool makingPileup_given)
41954271
{
41964272
makingPileup = makingPileup_given;

src/freec

66.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)