|
1 | 1 | package org.baderlab.csplugins.enrichmentmap.resolver;
|
2 | 2 |
|
| 3 | +import java.io.BufferedReader; |
| 4 | +import java.io.FileReader; |
3 | 5 | import java.io.IOException;
|
4 |
| -import java.io.UncheckedIOException; |
5 |
| -import java.nio.charset.Charset; |
6 | 6 | import java.nio.file.Files;
|
7 | 7 | import java.nio.file.Path;
|
8 | 8 | import java.nio.file.Paths;
|
@@ -55,6 +55,16 @@ public boolean isEnrichmentFile() {
|
55 | 55 | }
|
56 | 56 |
|
57 | 57 |
|
| 58 | + private static class DataLines { |
| 59 | + final String firstLine; |
| 60 | + final String firstDataLine; |
| 61 | + public DataLines(String firstLine, String firstDataLine) { |
| 62 | + this.firstLine = firstLine; |
| 63 | + this.firstDataLine = firstDataLine; |
| 64 | + } |
| 65 | + } |
| 66 | + |
| 67 | + |
58 | 68 | public static List<DataSetParameters> guessDataSets(Path rootFolder, CancelStatus cancelStatus) {
|
59 | 69 | // First test if rootFolder is itself a GSEA results folder
|
60 | 70 | Optional<DataSetParameters> dataset = GSEAResolver.resolveGSEAResultsFolder(rootFolder);
|
@@ -225,45 +235,45 @@ private static Type guess(Path path) {
|
225 | 235 | Map<Type,Integer> scores = new EnumMap<>(Type.class);
|
226 | 236 |
|
227 | 237 | String fileName = path.getFileName().toString();
|
228 |
| - Optional<String> firstLine = getFirstDataLine(path); |
| 238 | + DataLines lines = getFirstDataLines(path); |
229 | 239 |
|
230 |
| - if(firstLine.isPresent() && isTabSeparated(firstLine.get())) { |
231 |
| - // Guess based on extension and/or first line of file |
232 |
| - if(hasExtension(path, "gct")) { |
233 |
| - addScore(scores, Type.RANKS, 1); |
234 |
| - } |
235 |
| - if(hasExtension(path, "gmt")) { |
236 |
| - addScore(scores, Type.GENE_SETS, 1); |
237 |
| - } |
238 |
| - if(hasExtension(path, "rnk")) { |
239 |
| - addScore(scores, Type.RANKS, 1); |
240 |
| - addScore(scores, Type.EXPRESSION, 1); |
241 |
| - } |
242 |
| - if(hasExtension(path, "xls", "bgo", "tsv", "txt")) { |
243 |
| - Type type = guessEnrichmentType(path); |
244 |
| - if(type == Type.IGNORE) { |
245 |
| - addScore(scores, Type.ENRICHMENT_GENERIC, 1); |
| 240 | + if(lines != null) { |
| 241 | + if(isTabSeparated(lines.firstDataLine)) { |
| 242 | + // Guess based on extension and/or first line of file |
| 243 | + if(hasExtension(path, "gct")) { |
| 244 | + addScore(scores, Type.RANKS, 1); |
| 245 | + } |
| 246 | + if(hasExtension(path, "gmt")) { |
| 247 | + addScore(scores, Type.GENE_SETS, 1); |
| 248 | + } |
| 249 | + if(hasExtension(path, "rnk")) { |
| 250 | + addScore(scores, Type.RANKS, 1); |
246 | 251 | addScore(scores, Type.EXPRESSION, 1);
|
247 |
| - } else { |
| 252 | + } |
| 253 | + if(hasExtension(path, "xls", "tsv", "txt")) { |
| 254 | + Type type = guessEnrichmentType(lines.firstLine); |
248 | 255 | addScore(scores, type, 2); // this is a lot of evidence
|
249 | 256 | }
|
250 |
| - } |
251 |
| - |
252 |
| - // Test first line |
253 |
| - if(!isRankLine(firstLine.get())) { |
254 |
| - addScore(scores, Type.RANKS, -1); |
255 |
| - } |
256 |
| - if(!isExpressionLine(firstLine.get())) { |
257 |
| - addScore(scores, Type.EXPRESSION, -1); |
258 |
| - } |
259 |
| - |
260 |
| - // Guess based on file name |
| 257 | + |
| 258 | + // Test first line |
| 259 | + if(!isRankLine(lines.firstDataLine)) { |
| 260 | + addScore(scores, Type.RANKS, -1); |
| 261 | + } |
| 262 | + if(!isExpressionLine(lines.firstDataLine)) { |
| 263 | + addScore(scores, Type.EXPRESSION, -1); |
| 264 | + } |
| 265 | + |
| 266 | + // Guess based on file name |
| 267 | + if(matches(fileName, ".*expr(ession)?.*")) { |
| 268 | + addScore(scores, Type.EXPRESSION, 3); |
| 269 | + } |
| 270 | + if(matches(fileName, ".*rank.*")) { |
| 271 | + addScore(scores, Type.RANKS, 3); |
| 272 | + } |
| 273 | + } |
261 | 274 |
|
262 |
| - if(matches(fileName, ".*expr(ession)?.*")) { |
263 |
| - addScore(scores, Type.EXPRESSION, 3); |
264 |
| - } |
265 |
| - if(matches(fileName, ".*rank.*")) { |
266 |
| - addScore(scores, Type.RANKS, 3); |
| 275 | + if(hasExtension(path, "bgo") || isBingoHeader(lines.firstLine)) { |
| 276 | + addScore(scores, Type.ENRICHMENT_BINGO, 2); |
267 | 277 | }
|
268 | 278 | }
|
269 | 279 |
|
@@ -346,15 +356,29 @@ private static boolean hasExtension(Path path, String... extensions) {
|
346 | 356 | }
|
347 | 357 |
|
348 | 358 |
|
349 |
| - private static Optional<String> getFirstDataLine(Path path) { |
350 |
| - try(Stream<String> lines = Files.lines(path)) { |
351 |
| - return lines |
352 |
| - .filter(l -> !l.startsWith("#")) // filter out comment lines |
353 |
| - .skip(1) // skip header line |
354 |
| - .findFirst(); |
355 |
| - } catch(IOException | UncheckedIOException e) { |
356 |
| - return Optional.empty(); |
| 359 | + private static DataLines getFirstDataLines(Path path) { |
| 360 | + try(FileReader fileReader = new FileReader(path.toFile()); |
| 361 | + BufferedReader reader = new BufferedReader(fileReader)) |
| 362 | + { |
| 363 | + String firstLine = null; |
| 364 | + String firstDataLine = null; |
| 365 | + |
| 366 | + String line = null; |
| 367 | + while((line = reader.readLine()) != null) { |
| 368 | + if(firstLine == null) { |
| 369 | + firstLine = line; |
| 370 | + } else if(!line.startsWith("#")) { |
| 371 | + firstDataLine = line; |
| 372 | + break; |
| 373 | + } |
| 374 | + } |
| 375 | + |
| 376 | + if(firstLine != null && firstDataLine != null) { |
| 377 | + return new DataLines(firstLine, firstDataLine); |
| 378 | + } |
| 379 | + } catch(IOException e) { |
357 | 380 | }
|
| 381 | + return null; |
358 | 382 | }
|
359 | 383 |
|
360 | 384 | private static boolean isExpressionLine(String line) {
|
@@ -384,54 +408,51 @@ private static boolean isTabSeparated(String line) {
|
384 | 408 | }
|
385 | 409 |
|
386 | 410 |
|
387 |
| - public static Type guessEnrichmentType(String path) { |
388 |
| - return guessEnrichmentType(Paths.get(path)); |
| 411 | + public static Type guessEnrichmentTypeFromPath(String path) { |
| 412 | + Path p = Paths.get(path); |
| 413 | + DataLines lines = getFirstDataLines(p); |
| 414 | + return guessEnrichmentType(lines.firstLine); |
389 | 415 | }
|
390 | 416 |
|
391 | 417 | /*
|
392 | 418 | * This logic was moved here from {@link DetermineEnrichmentResultFileReader}
|
393 | 419 | */
|
394 |
| - public static Type guessEnrichmentType(Path path) { |
395 |
| - try { |
396 |
| - String firstLine = com.google.common.io.Files.readFirstLine(path.toFile(), Charset.defaultCharset()); |
397 |
| - |
398 |
| - String[] tokens = firstLine.split("\t"); |
399 |
| - |
400 |
| - //check to see if there are exactly 11 columns - = GSEA results |
401 |
| - if(tokens.length == 11) { |
402 |
| - //check to see if the ES is the 5th column and that NES is the 6th column |
403 |
| - if((tokens[4].equalsIgnoreCase("ES")) && (tokens[5].equalsIgnoreCase("NES"))) |
404 |
| - return Type.ENRICHMENT_GSEA; |
405 |
| - //it is possible that the file can have 11 columns but that it is still a generic file |
406 |
| - //if it doesn't specify ES and NES in the 5 and 6th columns |
407 |
| - else |
408 |
| - return Type.ENRICHMENT_GENERIC; |
409 |
| - } |
410 |
| - //check to see if there are exactly 13 columns - = DAVID results |
411 |
| - else if(tokens.length == 13) { |
412 |
| - //check to see that the 6th column is called Genes and that the 12th column is called "Benjamini" |
413 |
| - if((tokens[5].equalsIgnoreCase("Genes")) && tokens[11].equalsIgnoreCase("Benjamini")) |
414 |
| - return Type.ENRICHMENT_DAVID; |
415 |
| - else |
416 |
| - return Type.ENRICHMENT_GENERIC; |
| 420 | + public static Type guessEnrichmentType(String firstLine) { |
| 421 | + String[] tokens = firstLine.split("\t"); |
417 | 422 |
|
418 |
| - } |
419 |
| - //fix bug with new version of bingo plugin change the case of the header file. |
420 |
| - else if(firstLine.toLowerCase().contains("File created with BiNGO".toLowerCase())) { |
421 |
| - return Type.ENRICHMENT_BINGO; |
422 |
| - } else if(firstLine.contains("GREAT version")) { |
423 |
| - return Type.ENRICHMENT_GREAT; |
424 |
| - } else if(tokens.length == 9 && firstLine.contains("Term") && firstLine.contains("Old P-value")) { |
425 |
| - return Type.ENRICHMENT_ENRICHR; |
426 |
| - } else { |
| 423 | + //check to see if there are exactly 11 columns - = GSEA results |
| 424 | + if(tokens.length == 11) { |
| 425 | + //check to see if the ES is the 5th column and that NES is the 6th column |
| 426 | + if((tokens[4].equalsIgnoreCase("ES")) && (tokens[5].equalsIgnoreCase("NES"))) |
| 427 | + return Type.ENRICHMENT_GSEA; |
| 428 | + //it is possible that the file can have 11 columns but that it is still a generic file |
| 429 | + //if it doesn't specify ES and NES in the 5 and 6th columns |
| 430 | + else |
427 | 431 | return Type.ENRICHMENT_GENERIC;
|
428 |
| - } |
429 | 432 | }
|
430 |
| - catch(IOException e) { |
431 |
| - // MKTODO log the exception |
| 433 | + //check to see if there are exactly 13 columns - = DAVID results |
| 434 | + else if(tokens.length == 13) { |
| 435 | + //check to see that the 6th column is called Genes and that the 12th column is called "Benjamini" |
| 436 | + if((tokens[5].equalsIgnoreCase("Genes")) && tokens[11].equalsIgnoreCase("Benjamini")) |
| 437 | + return Type.ENRICHMENT_DAVID; |
| 438 | + else |
| 439 | + return Type.ENRICHMENT_GENERIC; |
| 440 | + |
432 | 441 | }
|
433 |
| - |
434 |
| - return Type.IGNORE; |
| 442 | + else if(isBingoHeader(firstLine)) { |
| 443 | + return Type.ENRICHMENT_BINGO; |
| 444 | + } else if(firstLine.contains("GREAT version")) { |
| 445 | + return Type.ENRICHMENT_GREAT; |
| 446 | + } else if(tokens.length == 9 && firstLine.contains("Term") && firstLine.contains("Old P-value")) { |
| 447 | + return Type.ENRICHMENT_ENRICHR; |
| 448 | + } else { |
| 449 | + return Type.ENRICHMENT_GENERIC; |
| 450 | + } |
| 451 | + } |
| 452 | + |
| 453 | + |
| 454 | + private static boolean isBingoHeader(String firstLine) { |
| 455 | + return firstLine.toLowerCase().contains("File created with BiNGO".toLowerCase()); |
435 | 456 | }
|
436 | 457 |
|
437 | 458 |
|
|
0 commit comments