diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 3e4b9bd5c..04191ac34 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -305,6 +305,7 @@ Parameters::Parameters(): PARAM_ID_MODE(PARAM_ID_MODE_ID, "--id-mode", "Database ID mode", "Select DB entries based on 0: database keys, 1: FASTA identifiers (.lookup)", typeid(int), (void *) &dbIdMode, "^[0-1]{1}$"), PARAM_TAR_INCLUDE(PARAM_TAR_INCLUDE_ID, "--tar-include", "Tar Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &tarInclude, "^.*$"), PARAM_TAR_EXCLUDE(PARAM_TAR_EXCLUDE_ID, "--tar-exclude", "Tar Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &tarExclude, "^.*$"), + PARAM_INPUT_MODE(PARAM_INPUT_MODE_ID, "--input-mode", "Input list mode", "0: only index, 1: index and range", typeid(int), (void *) &inputmode, "^[0-1]{1}$"), // unpackdb PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files.\nAdd .gz suffix to write compressed files.", typeid(std::string), (void *) &unpackSuffix, "^.*$"), PARAM_UNPACK_NAME_MODE(PARAM_UNPACK_NAME_MODE_ID, "--unpack-name-mode", "Unpack name mode", "Name unpacked files by 0: DB key, 1: accession (through .lookup)", typeid(int), (void *) &unpackNameMode, "^[0-1]{1}$"), @@ -1169,6 +1170,7 @@ Parameters::Parameters(): createsubdb.push_back(&PARAM_SUBDB_MODE); createsubdb.push_back(&PARAM_ID_MODE); createsubdb.push_back(&PARAM_V); + createsubdb.push_back(&PARAM_INPUT_MODE); // renamedbkeys renamedbkeys.push_back(&PARAM_SUBDB_MODE); @@ -2623,6 +2625,7 @@ void Parameters::setDefaults() { // createsubdb subDbMode = Parameters::SUBDB_MODE_HARD; dbIdMode = Parameters::ID_MODE_KEYS; + inputmode = 0; // tar2db tarInclude = ".*"; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 174145011..3004dd90e 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -708,6 +708,7 @@ class Parameters { // createsubdb int subDbMode; int dbIdMode; + int inputmode; // tar2db std::string tarInclude; @@ -1065,6 +1066,7 @@ class Parameters { // createsubdb PARAMETER(PARAM_SUBDB_MODE) PARAMETER(PARAM_ID_MODE) + PARAMETER(PARAM_INPUT_MODE) // tar2db PARAMETER(PARAM_TAR_INCLUDE) diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp index dd20ccc30..13baa4787 100644 --- a/src/util/createsubdb.cpp +++ b/src/util/createsubdb.cpp @@ -11,9 +11,11 @@ int createsubdb(int argc, const char **argv, const Command& command) { Parameters& par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); + bool isIndex = false; FILE *orderFile = NULL; if (FileUtil::fileExists(par.db1Index.c_str())) { orderFile = fopen(par.db1Index.c_str(), "r"); + isIndex = true; } else { if(FileUtil::fileExists(par.db1.c_str())){ orderFile = fopen(par.db1.c_str(), "r"); @@ -22,7 +24,8 @@ int createsubdb(int argc, const char **argv, const Command& command) { EXIT(EXIT_FAILURE); } } - + //no multithreading + unsigned int thread_idx = 0; const bool lookupMode = par.dbIdMode == Parameters::ID_MODE_LOOKUP; int dbMode = DBReader::USE_INDEX|DBReader::USE_DATA; if (lookupMode) { @@ -32,7 +35,7 @@ int createsubdb(int argc, const char **argv, const Command& command) { reader.open(DBReader::NOSORT); const bool isCompressed = reader.isCompressed(); - DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE); + DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, isCompressed, Parameters::DBTYPE_OMIT_FILE); writer.open(); // getline reallocs automatic char *line = NULL; @@ -40,8 +43,13 @@ int createsubdb(int argc, const char **argv, const Command& command) { char dbKey[256]; unsigned int prevKey = 0; bool isOrdered = true; + char* result; + char newLine = '\n'; + char nullByte = '\0'; + std::vector arr; while (getline(&line, &len, orderFile) != -1) { Util::parseKey(line, dbKey); + arr = Util::split(line, "\t"); unsigned int key; if (lookupMode) { size_t lookupId = reader.getLookupIdByAccession(dbKey); @@ -62,21 +70,51 @@ int createsubdb(int argc, const char **argv, const Command& command) { continue; } if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) { - writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), 0); - } else { + writer.writeIndexEntry(key, reader.getOffset(id), reader.getEntryLen(id), thread_idx); + } else if (isIndex == true || arr.size() == 1 || par.inputmode == 0) { char* data = reader.getDataUncompressed(id); size_t originalLength = reader.getEntryLen(id); size_t entryLength = std::max(originalLength, static_cast(1)) - 1; - if (isCompressed) { // copy also the null byte since it contains the information if compressed or not entryLength = *(reinterpret_cast(data)) + sizeof(unsigned int) + 1; - writer.writeData(data, entryLength, key, 0, false, false); + writer.writeData(data, entryLength, key, thread_idx, false, false); } else { - writer.writeData(data, entryLength, key, 0, true, false); + writer.writeData(data, entryLength, key, thread_idx, true, false); } // do not write null byte since - writer.writeIndexEntry(key, writer.getStart(0), originalLength, 0); + writer.writeIndexEntry(key, writer.getStart(0), originalLength, thread_idx); + } else { + if (arr.size()%2 == 0) { + Debug(Debug::ERROR) << "Input list not in format\n"; + } else { + char* data; + if (isCompressed) { + data = reader.getDataCompressed(id, thread_idx); + } else { + data = reader.getDataUncompressed(id); + } + size_t entryLength = std::max(reader.getEntryLen(id), static_cast(1)); + int totalLength = 0; + result = new char[entryLength]; + for (int ord = 0 ; ord < int((arr.size()-1)/2); ord ++) { + int currLength = std::stoi(arr[ord * 2 + 2]) - std::stoi(arr[ord * 2 + 1]) + 1; + strncpy(result + totalLength, data + std::stoi(arr[ord * 2 + 1]), currLength); + totalLength += currLength; + } + result[totalLength] = newLine; + if (isCompressed) { + writer.writeData(result, totalLength + 1, key, thread_idx, true, false); + } else { + writer.writeData(result, totalLength, key, thread_idx, false, false); + writer.writeAdd(&newLine, sizeof(char), thread_idx); + writer.writeAdd(&nullByte, sizeof(char), thread_idx); + } + delete [] result; + result = nullptr; + + writer.writeIndexEntry(key, writer.getStart(0), totalLength + 2, thread_idx); + } } } // merge any kind of sequence database @@ -89,7 +127,6 @@ int createsubdb(int argc, const char **argv, const Command& command) { } DBWriter::writeDbtypeFile(par.db3.c_str(), reader.getDbtype(), isCompressed); DBReader::softlinkDb(par.db2, par.db3, DBFiles::SEQUENCE_ANCILLARY); - free(line); reader.close(); if (fclose(orderFile) != 0) {