99import java .io .FileReader ;
1010import java .util .Vector ;
1111import java .util .HashSet ;
12- import java .util .HashMap ;
13- import java .util .TreeSet ;
1412
1513import cmp .compile .runMTCWMain ;
1614import cmp .database .Globals ;
2220import util .methods .Out ;
2321import util .methods .RunCmd ;
2422import util .methods .Static ;
25- import util .methods .Stats ;
2623import util .methods .TCWprops ;
2724
2825public class ScoreMulti {
2926 public boolean bTest = false ;
30- public boolean SoP_NORM = runMTCWMain .bNSoP ; // CAS313 default true
27+ public boolean SoP_NORM = runMTCWMain .bNSoP ; // CAS313; CAS401 makes average /nCols (was doing min-max normalization)
3128
3229 private final int bothGAP = 0 ;
3330 private final int hangGAP = 0 ;
@@ -44,17 +41,16 @@ public ScoreMulti() {
4441 }
4542
4643 /**************************************************************
47- * Avg of columns sum of Sum of Pairs
44+ * Sum of Sum of Pairs
4845 // www.info.univ-angers.fr/~gh/Idas/Wphylog/guidetree.pdf
4946 // It is averaged by the number of columns; otherwise, bigger clusters will likely have bigger scores
5047 *****************************************************************/
5148 private final char BORDER =Globalx .hangCh ; // leading or trailing gap
52- public double scoreSumOfPairs (String grpID , String [] alignedSeq , boolean isRun ) {
49+ public double scoreSumOfPairs (String grpID , String [] alignedSeq ) {
5350 try {
5451 int nRows = alignedSeq .length ;
5552 int nCols = alignedSeq [0 ].length ();
5653 dScores = new double [nCols ];
57- String [] comp = new String [nCols ];
5854 strScores = null ;
5955
6056 if (nRows >maxRow ) {
@@ -85,102 +81,31 @@ public double scoreSumOfPairs(String grpID, String [] alignedSeq, boolean isRun)
8581 }
8682
8783 for (int c =0 ; c <nCols ; c ++) {
88- int col_stat = 0 ;
84+ dScores [ c ] = 0 ;
8985
9086 for (int r =1 ; r <nRows -1 ; r ++) { // first is consensus
9187 char a = seqs [r ][c ];
9288
9389 for (int x =r +1 ; x <nRows ; x ++)
94- col_stat += scoreCh (a , seqs [x ][c ]);
95- }
96- dScores [c ] = col_stat ;
97-
98- if (!isRun ) { // duplicate of what is in MultiAlignPanel - write to text file
99- HashMap <Character , Integer > aaMap = new HashMap <Character , Integer > ();
100- TreeSet <String > prtSet = new TreeSet <String > ();
101-
102- for (int r =1 ; r <nRows ; r ++) {
103- char a = seqs [r ][c ];
104- if (aaMap .containsKey (a )) aaMap .put (a , aaMap .get (a )+1 );
105- else aaMap .put (a , 1 );
106- }
107-
108- for (char a : aaMap .keySet ())
109- prtSet .add (String .format ("%02d:%c" , aaMap .get (a ), a )); // leading zero makes it sort right
110-
111- comp [c ] = null ;
112- for (String info : prtSet ) {
113- if (info .startsWith ("0" )) info = info .substring (1 );
114- if (comp [c ]==null ) comp [c ] = info ;
115- else comp [c ] = info + ", " + comp [c ];
116- }
117- }
118- }
119- if (SoP_NORM ) {
120- double [] tScore = dScores .clone ();
121- scoreSoP_norm (grpID );
122-
123- if (!isRun ) {
124- strScores = new String [tScore .length +2 ];
125- for (int i =0 ; i <tScore .length ; i ++) {
126- String x =" " ;
127- if (tScore [i ]<q1x ) x ="<" ;
128- else if (tScore [i ]>q3x ) x =">" ;
129- strScores [i ] = String .format ("%3d. %.3f %3d%s %s" ,
130- i , dScores [i ], (int ) tScore [i ], x , comp [i ]);
131- }
132- strScores [tScore .length ] = "" ;
133- strScores [tScore .length +1 ] = String .format ("Q1 Box %.1f Q3 Box %.1f" , q1x , q3x );
90+ dScores [c ] += scoreCh (a , seqs [x ][c ]);
13491 }
13592 }
13693
137- // Though there are (nRows*(nRows-1)/2) * nCols comparisons
138- // The average is on the nCols since its the column sum that is relevant
139- double sum =0 ;
140- for (double d : dScores ) sum += d ;
141- double score = (sum !=0 ) ? (Math .abs (sum )/(double )nCols ) : 0 ; // CAS312
94+ // #cmp = (nRows*(nRows-1)/2) * nCols
95+ // CAS401 -- was a pseudo min-max normalization; now divides by nCols when SoP_NORM is set, otherwise by #cmp
96+ nRows --;
97+ double n = (SoP_NORM ) ? nCols : ((nRows *(nRows -1 )/2 ) * nCols );
98+ double sum = 0.0 ;
99+ for (double x : dScores ) sum += x ;
100+
101+ double score = (sum !=0 ) ? (Math .abs (sum )/n ) : 0 ; // CAS312
142102 if (sum <0 ) score = -score ;
143103
144104 return score ;
145105 }
146106 catch (Exception e ) {ErrorReport .reportError (e , "scoreAvgSumOfPairs " + grpID );}
147107 return Globalx .dNoScore ;
148108 }
149- // Min-max Normalization of column scores:
150- // Find the largest positive and smallest negative number.
151- // Add the absolute value of the smallest to each number
152- // Divide the result by max-min
153- // Z = (X - min(X)) / (max(X) - min(X))
154- // Check for outliers
155- private void scoreSoP_norm (String grpID ) {
156- try {
157- double [] qrt = Stats .setQuartiles (dScores );
158- double q1 = qrt [0 ];
159- double q3 = qrt [2 ];
160- double iqr = (q3 -q1 )*1.5 ;
161- double min = qrt [3 ];
162- double max = qrt [4 ];
163-
164- double diff = max -min ;
165-
166- q1x = q1 -iqr ;
167- q3x = q3 +iqr ;
168-
169- // get rid of outliers
170- int cntL =0 , cntH =0 ;
171- for (int i =0 ; i <dScores .length ; i ++) {
172- double d = dScores [i ];
173- if (d <q1x ) cntL ++;
174- if (d >q3x ) cntH ++;
175- dScores [i ] = (d -min )/diff ;
176- }
177- if (bTest && cntL +cntH >0 ) {
178- Out .PrtSpCntMsgNz (1 , cntL , "Low outlier for " + grpID + String .format (" (q1 %5.1f, Box %5.1f) Col %d" , q1 , q1x , dScores .length ));
179- Out .PrtSpCntMsgNz (1 , cntH , "High outlier for " + grpID + String .format (" (q3 %5.1f, Box %5.1f) Col %d" , q3 , q3x , dScores .length ));
180- }
181- }
182- catch (Exception e ) {ErrorReport .reportError (e , "normalize SoP " );}
183- }
184109
185110 private double scoreCh (char c1 , char c2 ) {
186111 if (c1 ==Share .gapCh && c2 ==Share .gapCh ) return bothGAP ;
@@ -403,5 +328,4 @@ public double scoreMstatX(String type, String alignedFile, String resultFile) {
403328
404329 private double [] dScores ;
405330 private String [] strScores ;
406- private double q1x =0.0 , q3x =0.0 ;
407331}
0 commit comments