11#include " IndexCreator.h"
2+ #include " editNames.h"
23#include < iostream>
34#include < istream>
45#include < string>
@@ -43,6 +44,8 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
4344 string databaseAssemblyList = assemblyList + " .databaseAssembly" ;
4445 string totalExludedAssemblyList = assemblyList + " .totalExcludedAssembly" ;
4546
47+ // editNames(taxonomyPath + "/names.dmp", par.assacc2taxid);
48+
4649 // Load taxonomy
4750 TaxonomyWrapper taxonomy (taxonomyPath + " /names.dmp" ,
4851 taxonomyPath + " /nodes.dmp" ,
@@ -52,18 +55,21 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
5255
5356 cout << " Making name2taxid map..." ;
5457 std::unordered_map<std::string, TaxID> name2InternalTaxId;
58+ std::unordered_map<std::string, TaxID> tempMap;
5559 taxonomy.getName2InternalTaxid (name2InternalTaxId);
5660 for (auto &it : name2InternalTaxId) {
5761 if (it.first .find (" ." ) == std::string::npos) {
5862 continue ;
5963 }
6064 string accessionNoVersion = it.first .substr (0 , it.first .find (" ." ));
61- name2InternalTaxId[accessionNoVersion] = it.second ;
65+ tempMap[accessionNoVersion] = it.second ;
66+ }
67+ for (auto &it : tempMap) {
68+ name2InternalTaxId[it.first ] = it.second ;
6269 }
6370 cout << " done." << endl;
6471
65- cout << " Making observedAcc2taxid map..." ;
66- std::unordered_map<std::string, TaxID> observedAcc2taxid;
72+
6773 cout << " Loading assembly list..." ;
6874 vector<string> totalAssemblyAccessions;
6975 vector<Assembly> assemblies;
@@ -72,8 +78,11 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
7278 cerr << " Error: could not open assembly list file " << assemblyList << endl;
7379 return 1 ;
7480 }
81+
7582 string assemblyAccession;
7683 TaxID taxid;
84+ cout << " Making observedAcc2taxid map..." ;
85+ std::unordered_map<std::string, TaxID> observedAcc2taxid;
7786 while (getline (assemblyListFile, assemblyAccession)) {
7887 string assAccNoVersion = assemblyAccession.substr (0 , assemblyAccession.find (" ." ));
7988
@@ -88,7 +97,7 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
8897 } else if (name2InternalTaxId.find (assAccNoVersion) != name2InternalTaxId.end ()) {
8998 taxid = name2InternalTaxId[assAccNoVersion];
9099 } else {
91- cerr << " Error: accession " << assemblyAccession << " not found in the taxonomy" << endl;
100+ cerr << " Error: accession " << assemblyAccession << " " << assAccNoVersion << " not found in the taxonomy" << endl;
92101 return 1 ;
93102 }
94103 observedAcc2taxid[assemblyAccession] = taxid;
@@ -257,7 +266,7 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
257266 }
258267
259268 // Select n genera with multiple species
260- n = int (genusWithMultipleSpecies.size () / 4 );
269+ n = int (genusWithMultipleSpecies.size () / 3 );
261270 cout << " Excluding " << n << " species" << endl;
262271 vector<TaxID> selectedGenera;
263272 for (int i = 0 ; i < n; i++) {
@@ -314,7 +323,7 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
314323 }
315324
316325 // 2. Select n species with multiple assemblies
317- n = int (speciesWithMultipleAssemblies.size ()/2 );
326+ n = int (speciesWithMultipleAssemblies.size ()/3 );
318327 cout << " Excluding " << n << " assemblies" << endl;
319328 vector<TaxID> selectedSpecies;
320329 for (int i = 0 ; i < n; i++) {
@@ -409,17 +418,23 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
409418 const string & rank = taxonomy.getString (lcaTaxon->rankIdx );
410419 if (rank == " order" ) {
411420 orderCount++;
412- } else if (taxonomy.findRankIndex (rank) == -1 ) {
413- cout << " Error: LCA rank index is -1." << endl;
414- return 1 ;
415- } else if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" order" )) {
416- cout << " Error: " << excludedGenusAssemblies[i] << " is not a valid family exclusion. LCA is below order rank." << endl;
417- cout << excludedGenusAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
421+ continue ;
422+ }
423+
424+ if (strcmp (taxonomy.getString (lcaTaxon->nameIdx ), " root" ) == 0 ) {
425+ continue ;
426+ }
427+
428+ if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" order" )) {
429+ cout << " Error: " << excludedFamilyAssemblies[i] << " is not a valid family exclusion. LCA is below order rank." << endl;
430+ cout << " Name: " << taxonomy.getString (lcaTaxon->nameIdx ) << " TaxID: " << lcaTaxon->taxId << endl;
431+ cout << " Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
432+ cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
418433 return 1 ;
419434 }
420435 }
421436 if (orderCount == 0 ) {
422- cout << " Error: " << excludedGenusAssemblies [i] << " is not a valid exclusion. No order rank LCA." << endl;
437+ cout << " Error: " << excludedFamilyAssemblies [i] << " is not a valid exclusion. No order rank LCA." << endl;
423438 return 1 ;
424439 }
425440 }
@@ -438,14 +453,28 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
438453 const string & rank = taxonomy.getString (lcaTaxon->rankIdx );
439454 if (rank == " family" ) {
440455 familyCount++;
441- } else if (taxonomy.findRankIndex (rank) == -1 ) {
442- cout << " Error: LCA rank index is -1." << endl;
443- return 1 ;
444- } else if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" family" )) {
445- cout << " Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. LCA is below Family rank." << endl;
446- cout << excludedGenusAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
447- return 1 ;
456+ continue ;
457+ }
458+
459+ if (strcmp (taxonomy.getString (lcaTaxon->nameIdx ), " root" ) == 0 ) {
460+ continue ;
448461 }
462+
463+ if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" family" )) {
464+ cout << " Error: " << excludedFamilyAssemblies[i] << " is not a valid genus exclusion. LCA is below family rank." << endl;
465+ cout << " Name: " << taxonomy.getString (lcaTaxon->nameIdx ) << " TaxID: " << lcaTaxon->taxId << endl;
466+ cout << " Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
467+ cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
468+ return 1 ;
469+ }
470+ // else if (taxonomy.findRankIndex(rank) == -1) {
471+ // cout << "Error: LCA rank index is -1." << endl;
472+ // return 1;
473+ // } else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("family")) {
474+ // cout << "Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. LCA is below Family rank." << endl;
475+ // cout << excludedGenusAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
476+ // return 1;
477+ // }
449478 }
450479 if (familyCount == 0 ) {
451480 cout << " Error: " << excludedGenusAssemblies[i] << " is not a valid exclusion. No Family rank LCA." << endl;
@@ -465,16 +494,34 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
465494 TaxID databaseTaxid = observedAcc2taxid[databaseAssemblies[j]];
466495 const TaxonNode * lcaTaxon = taxonomy.taxonNode (taxonomy.LCA (excludedTaxid, databaseTaxid));
467496 const string & rank = taxonomy.getString (lcaTaxon->rankIdx );
497+
468498 if (rank == " genus" ) {
469499 genusCount++;
470- } else if (taxonomy.findRankIndex (rank) == -1 ) {
471- cout << " Error: LCA rank index is -1." << endl;
472- return 1 ;
473- } else if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" genus" )) {
474- cout << " Error: " << excludedSpeciesAssemblies[i] << " is not a valid exclusion. LCA is below Genus rank." << endl;
475- cout << excludedSpeciesAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
476- return 1 ;
500+ continue ;
501+ }
502+
503+ if (strcmp (taxonomy.getString (lcaTaxon->nameIdx ), " root" ) == 0 ) {
504+ continue ;
477505 }
506+
507+ if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" genus" )) {
508+ cout << " Error: " << excludedFamilyAssemblies[i] << " is not a valid species exclusion. LCA is below genus rank." << endl;
509+ cout << " Name: " << taxonomy.getString (lcaTaxon->nameIdx ) << " TaxID: " << lcaTaxon->taxId << endl;
510+ cout << " Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
511+ cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
512+ return 1 ;
513+ }
514+
515+ // if (rank == "genus") {
516+ // genusCount++;
517+ // } else if (taxonomy.findRankIndex(rank) == -1) {
518+ // cout << "Error: LCA rank index is -1." << endl;
519+ // return 1;
520+ // } else if (taxonomy.findRankIndex(rank) < taxonomy.findRankIndex("genus")) {
521+ // cout << "Error: " << excludedSpeciesAssemblies[i] << " is not a valid exclusion. LCA is below Genus rank." << endl;
522+ // cout << excludedSpeciesAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
523+ // return 1;
524+ // }
478525 }
479526 if (genusCount == 0 ) {
480527 cout << " Error: " << excludedSpeciesAssemblies[i] << " is not a valid exclusion. No Genus rank LCA." << endl;
@@ -496,11 +543,20 @@ int makeGtdbBenchmarkSet(const LocalParameters & par) {
496543 const string & rank = taxonomy.getString (lcaTaxon->rankIdx );
497544 if (rank == " species" ) {
498545 speciesCount++;
499- } else if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" species" )) {
500- cout << " Error: " << excludedSubspeciesAssemblies[i] << " is not a valid exclusion. LCA is below Species rank." << endl;
501- cout << excludedSubspeciesAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
502- return 1 ;
546+ continue ;
547+ }
548+
549+ if (strcmp (taxonomy.getString (lcaTaxon->nameIdx ), " root" ) == 0 ) {
550+ continue ;
503551 }
552+
553+ if (taxonomy.findRankIndex (rank) < taxonomy.findRankIndex (" species" )) {
554+ cout << " Error: " << excludedFamilyAssemblies[i] << " is not a valid exclusion. LCA is below species rank." << endl;
555+ cout << " Name: " << taxonomy.getString (lcaTaxon->nameIdx ) << " TaxID: " << lcaTaxon->taxId << endl;
556+ cout << " Rank: " << rank << " RankIdx: " << lcaTaxon->rankIdx << endl;
557+ cout << excludedFamilyAssemblies[i] << " " << databaseAssemblies[j] << " " << rank << endl;
558+ return 1 ;
559+ }
504560 }
505561 if (speciesCount == 0 ) {
506562 cout << " Error: " << excludedSubspeciesAssemblies[i] << " is not a valid exclusion. No Species rank LCA." << endl;
0 commit comments