10
10
import org .baderlab .csplugins .enrichmentmap .model .GeneSet ;
11
11
import org .baderlab .csplugins .enrichmentmap .model .GenericResult ;
12
12
import org .baderlab .csplugins .enrichmentmap .model .SetOfEnrichmentResults ;
13
- import org .baderlab .csplugins .enrichmentmap .util .NullTaskMonitor ;
13
+ import org .baderlab .csplugins .enrichmentmap .util .DiscreteTaskMonitor ;
14
14
import org .cytoscape .work .AbstractTask ;
15
15
import org .cytoscape .work .TaskMonitor ;
16
16
17
+ import com .google .common .base .Strings ;
17
18
import com .google .common .collect .ImmutableSet ;
18
19
19
20
public class ParseGenericEnrichmentResults extends AbstractTask {
@@ -26,154 +27,130 @@ public ParseGenericEnrichmentResults(EMDataSet dataset) {
26
27
27
28
@ Override
28
29
public void run (TaskMonitor taskMonitor ) throws IOException {
29
- if (taskMonitor == null )
30
- taskMonitor = new NullTaskMonitor ();
31
- taskMonitor .setTitle ("Parsing Generic Result file" );
32
-
33
30
List <String > lines = LineReader .readLines (dataset .getDataSetFiles ().getEnrichmentFileName1 ());
34
-
35
- //Get the current genesets so we can check that all the results are in the geneset list
36
- //and put the size of the genesets into the visual style
37
- Map <String , GeneSet > genesets = dataset .getSetOfGeneSets ().getGeneSets ();
38
-
39
- int currentProgress = 0 ;
40
- int maxValue = lines .size ();
41
- taskMonitor .setStatusMessage ("Parsing Generic Results file - " + maxValue + " rows" );
42
- boolean FDR = false ;
43
-
44
- //skip the first line which just has the field names (start i=1)
45
- //check to see how many columns the data has
46
- String line = lines .get (0 );
47
- String [] tokens = line .split ("\t " );
48
- int length = tokens .length ;
49
-
31
+ DiscreteTaskMonitor tm = new DiscreteTaskMonitor (taskMonitor , lines .size ());
32
+ tm .setStatusMessage ("Parsing Generic Results file - " + lines .size () + " rows" );
33
+ tm .setTitle ("Parsing Generic Result file" );
34
+ parse (tm , lines );
35
+ }
36
+
37
+
38
+ private void parse (DiscreteTaskMonitor tm , List <String > lines ) {
39
+ boolean FDR = false ; // false discovery rate
40
+ boolean hasNegOneNES = false , hasPosOneNES = false , hasOtherNES = false ;
41
+
50
42
EnrichmentMap map = dataset .getMap ();
51
43
SetOfEnrichmentResults enrichments = dataset .getEnrichments ();
52
44
Map <String , EnrichmentResult > results = enrichments .getEnrichments ();
53
45
String upPhenotype = enrichments .getPhenotype1 ();
54
46
String downPhenotype = enrichments .getPhenotype2 ();
55
47
48
+ //Get the current genesets so we can check that all the results are in the geneset list
49
+ //and put the size of the genesets into the visual style
50
+ Map <String ,GeneSet > genesets = dataset .getSetOfGeneSets ().getGeneSets ();
51
+
56
52
//check to see if there are genesets.
57
53
//if there are no genesets then populate the genesets from the generic file
58
54
//can only do this if the 6th column has a list of genes for that geneset.
59
55
boolean populate_gs = false ;
60
- if (genesets == null || genesets .isEmpty ())
56
+ if (genesets == null || genesets .isEmpty ()) {
61
57
populate_gs = true ;
62
- //as this is the default for gprofiler use the Description in the visual style instead of the formatted name
63
- //but only if there is a gmt supplied. If using just the generic output file there is not field for description
64
- else
58
+ } else {
59
+ //as this is the default for gprofiler use the Description in the visual style instead of the formatted name
60
+ //but only if there is a gmt supplied. If using just the generic output file there is no field for description
65
61
dataset .getMap ().getParams ().setEMgmt (true );
62
+ }
66
63
67
- //if (length < 3)
68
- //not enough data in the file!!
69
-
64
+ //skip the first line which just has the field names (start i=1), check to see how many columns the data has
70
65
for (int i = 1 ; i < lines .size (); i ++) {
71
- line = lines .get (i );
72
-
73
- tokens = line .split ("\t " );
74
-
75
- //update the length each time because some line might have missing values
76
- length = tokens .length ;
66
+ String line = lines .get (i );
67
+ String [] tokens = line .split ("\t " );
77
68
78
69
double pvalue = 1.0 ;
79
70
double FDRqvalue = 1.0 ;
80
- GenericResult result ;
81
71
int gs_size = 0 ;
82
72
double NES = 1.0 ;
73
+
74
+ GenericResult result ;
83
75
84
76
//The first column of the file is the name of the geneset
85
77
final String name = tokens [0 ].toUpperCase ().trim ();
86
78
final String description = tokens [1 ].toUpperCase ();
87
-
88
79
if (genesets .containsKey (name )) {
89
80
gs_size = genesets .get (name ).getGenes ().size ();
90
81
}
91
-
92
- //The third column is the nominal p-value
93
- if (tokens [2 ] == null || tokens [2 ].equalsIgnoreCase ("" )) {
94
- //do nothing
95
- } else {
82
+ if (!Strings .isNullOrEmpty (tokens [2 ])) {
96
83
pvalue = Double .parseDouble (tokens [2 ]);
97
84
}
98
-
99
- if (length > 3 ) {
100
- //the fourth column is the FDR q-value
101
- if (tokens [3 ] == null || tokens [3 ].equalsIgnoreCase ("" )) {
102
- //do nothing
103
- } else {
85
+
86
+ // if (length < 3) not enough data in the file!! The fourth column is the FDR q-value.
87
+ if (tokens .length > 3 ) {
88
+ if (!Strings .isNullOrEmpty (tokens [3 ])) {
104
89
FDRqvalue = Double .parseDouble (tokens [3 ]);
105
90
FDR = true ;
106
91
}
107
- //the fifth column is the phenotype.
108
- //it can either be a signed number or it can be text specifying the phenotype
109
- //in order for it to be parseable the text has to match the user specified phenotypes
92
+
93
+ // the fifth column is the phenotype.
94
+ // it can either be a signed number or it can be text specifying the phenotype
95
+ // in order for it to be parseable the text has to match the user specified phenotypes
110
96
// and if it is a number the only important part is the sign
111
- if (length > 4 ) {
112
-
113
- if (tokens [4 ] == null || tokens [4 ].equalsIgnoreCase ("" )) {
114
-
115
- } else {
97
+ if (tokens .length > 4 ) {
98
+ if (!Strings .isNullOrEmpty (tokens [4 ])) {
116
99
//check to see if the string matches the specified phenotypes
117
- if (tokens [4 ].equalsIgnoreCase (upPhenotype ))
100
+ if (tokens [4 ].equalsIgnoreCase (upPhenotype )) {
118
101
NES = 1.0 ;
119
- else if (tokens [4 ].equalsIgnoreCase (downPhenotype ))
102
+ } else if (tokens [4 ].equalsIgnoreCase (downPhenotype )) {
120
103
NES = -1.0 ;
121
- //try and see if the user has specified the phenotype as a number
122
- else {
104
+ } else {
123
105
try {
106
+ //try and see if the user has specified the phenotype as a number
124
107
NES = Double .parseDouble (tokens [4 ]);
125
108
} catch (NumberFormatException nfe ) {
126
- throw new IllegalThreadStateException (tokens [4 ]
127
- + " is not a valid phenotype. Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number." );
109
+ throw new IllegalArgumentException (tokens [4 ] + " is not a valid phenotype. Phenotype specified in generic enrichment results file must have the same phenotype as specified in advanced options or must be a positive or negative number." );
128
110
}
129
111
}
130
112
}
131
-
132
- //ticket#57 - adding additional column to generic format, similiar to Bingo and David
133
- // that outlines the genes from the query that are found in the geneset and results in
134
- //its enrichment
135
- if (length > 5 && populate_gs ) {
136
-
137
- //get all the genes in the field
113
+
114
+ if (NES == 1.0 )
115
+ hasPosOneNES = true ;
116
+ else if (NES == -1.0 )
117
+ hasNegOneNES = true ;
118
+ else
119
+ hasOtherNES = true ;
120
+
121
+ // ticket#57 - adding additional column to generic format, similar to Bingo and David
122
+ // that outlines the genes from the query that are found in the geneset and results in its enrichment
123
+ if (tokens .length > 5 && populate_gs ) {
138
124
String [] gene_tokens = tokens [5 ].split ("," );
139
125
140
126
ImmutableSet .Builder <Integer > builder = ImmutableSet .builder ();
141
127
142
- //All subsequent fields in the list are the geneset associated with this geneset.
128
+ //All subsequent fields in the list are the genes associated with this geneset.
143
129
for (String token : gene_tokens ) {
144
130
String gene = token .trim ().toUpperCase ();
145
131
146
- //Check to see if the gene is already in the hashmap of genes
147
- //if it is already in the hash then get its associated key and put it into the set of genes
148
132
if (map .containsGene (gene )) {
149
133
builder .add (map .getHashFromGene (gene ));
150
- }
151
- else if (!gene .isEmpty ()) {
134
+ } else if (!gene .isEmpty ()) {
152
135
Integer hash = map .addGene (gene ).get ();
153
136
builder .add (hash );
154
137
}
155
138
}
156
139
157
140
GeneSet gs = new GeneSet (name , description , builder .build ());
158
141
gs_size = gs .getGenes ().size ();
159
- //put the new or filtered geneset back into the set.
160
142
genesets .put (name , gs );
161
143
162
144
} //end of tokens>5
163
145
result = new GenericResult (name , description , pvalue , gs_size , FDRqvalue , NES );
164
- } //end of tokens>4
165
-
166
- else
146
+ } else { //end of tokens>4
167
147
result = new GenericResult (name , description , pvalue , gs_size , FDRqvalue );
168
-
148
+ }
169
149
} else {
170
150
result = new GenericResult (name , description , pvalue , gs_size );
171
151
}
172
152
173
- // Calculate Percentage. This must be a value between 0..100.
174
- int percentComplete = (int ) (((double ) currentProgress / maxValue ) * 100 );
175
- taskMonitor .setProgress (percentComplete );
176
- currentProgress ++;
153
+ tm .inc ();
177
154
178
155
//check to see if the gene set has already been entered in the results
179
156
//it is possible that one geneset will be in both phenotypes.
@@ -183,14 +160,14 @@ else if(!gene.isEmpty()) {
183
160
GenericResult temp = (GenericResult ) results .get (name );
184
161
if (temp == null )
185
162
results .put (name , result );
186
- else {
187
- if (result .getPvalue () < temp .getPvalue ())
188
- results .put (name , result );
189
- }
190
-
163
+ else if (result .getPvalue () < temp .getPvalue ())
164
+ results .put (name , result );
191
165
}
166
+
192
167
if (FDR )
193
168
dataset .getMap ().getParams ().setFDR (FDR );
169
+ if (hasPosOneNES && hasNegOneNES && !hasOtherNES )
170
+ dataset .setIsTwoPhenotypeGeneric (true );
194
171
}
195
172
196
173
}
0 commit comments