Skip to content

Commit cc0493c

Browse files
authored
Merge pull request #85 from jaebeom-kim/windows
DB search is now 30% faster on Windows (under Cygwin)
2 parents 7d8f497 + 946c6a6 commit cc0493c

19 files changed

+1399
-374
lines changed

src/LocalCommandDeclarations.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,5 +15,7 @@ extern int binning2report(int argc, const char **argv, const Command& command);
1515
extern int filterByGenus(int argc, const char **argv, const Command& command);
1616
extern int databaseReport(int argc, const char **argv, const Command& command);
1717
extern int mapping2taxon(int argc, const char **argv, const Command& command);
18+
extern int expand_diffidx(int argc, const char **argv, const Command& command);
19+
extern int makeAAoffset(int argc, const char **argv, const Command& command);
1820

1921
#endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H

src/commons/FileMerger.cpp

Lines changed: 0 additions & 158 deletions
Original file line number | Diff line number | Diff line change
@@ -20,164 +20,6 @@ FileMerger::~FileMerger() {
2020
delete taxonomy;
2121
}
2222

23-
//void FileMerger::mergeTargetFiles(std::vector<char*> diffIdxFileNames, std::vector<char*> infoFileNames, vector<int> & taxIdListAtRank, vector<int> & taxIdList) {
24-
// size_t writtenKmerCnt = 0;
25-
//
26-
// ///Files to write on & buffers to fill them
27-
// FILE * mergedDiffFile = fopen(mergedDiffFileName, "wb");
28-
// FILE * mergedInfoFile = fopen(mergedInfoFileName, "wb");
29-
// FILE * diffIdxSplitFile = fopen(diffIdxSplitFileName, "wb");
30-
// uint16_t * diffBuffer = (uint16_t *)malloc(sizeof(uint16_t) * kmerBufSize);
31-
// size_t diffBufferIdx = 0;
32-
// size_t totalBufferIdx = 0;
33-
// TargetKmerInfo * infoBuffer = (TargetKmerInfo *)malloc(sizeof(TargetKmerInfo) * kmerBufSize);
34-
// size_t infoBufferIdx = 0;
35-
// size_t totalInfoIdx = 0;
36-
//
37-
// ///Prepare files to merge
38-
// size_t numOfSplitFiles = diffIdxFileNames.size();
39-
// size_t numOfincompletedFiles = numOfSplitFiles;
40-
// size_t numOfKmerBeforeMerge = 0;
41-
// uint64_t * lookingKmers = new uint64_t[numOfSplitFiles];
42-
//// uint64_t lookingKmers[numOfSplitFiles];
43-
//// TargetKmerInfo lookingInfos[numOfSplitFiles];
44-
// auto * lookingInfos = new TargetKmerInfo[numOfSplitFiles];
45-
// //size_t diffFileIdx[numOfSplitFiles];
46-
// auto * diffFileIdx = new size_t[numOfSplitFiles];
47-
// memset(diffFileIdx, 0, numOfSplitFiles * sizeof(size_t));
48-
// auto * infoFileIdx = new size_t[numOfSplitFiles];
49-
//// size_t infoFileIdx[numOfSplitFiles];
50-
// memset(infoFileIdx, 0, numOfSplitFiles * sizeof(size_t));
51-
// size_t maxIdxOfEachFiles[numOfSplitFiles];
52-
// struct MmapedData<uint16_t> *diffFileList = new struct MmapedData<uint16_t>[numOfSplitFiles];
53-
// struct MmapedData<TargetKmerInfo> *infoFileList = new struct MmapedData<TargetKmerInfo>[numOfSplitFiles];
54-
// for (size_t file = 0; file < numOfSplitFiles; file++) {
55-
// diffFileList[file] = mmapData<uint16_t>(diffIdxFileNames[file]);
56-
// infoFileList[file] = mmapData<TargetKmerInfo>(infoFileNames[file]);
57-
// maxIdxOfEachFiles[file] = diffFileList[file].fileSize / sizeof(uint16_t);
58-
// numOfKmerBeforeMerge += infoFileList[file].fileSize / sizeof(TargetKmerInfo);
59-
// }
60-
//
61-
// ///To make differential index splits
62-
// uint64_t AAofTempSplitOffset = UINT64_MAX;
63-
// size_t sizeOfSplit = numOfKmerBeforeMerge / (SplitNum - 1);
64-
// size_t offsetList[SplitNum + 1];
65-
// int offsetListIdx = 1;
66-
// for(size_t os = 0; os < SplitNum; os++){
67-
// offsetList[os] = os * sizeOfSplit;
68-
// }
69-
// offsetList[SplitNum] = UINT64_MAX;
70-
//
71-
// DiffIdxSplit splitList[SplitNum];
72-
// memset(splitList, 0, sizeof(DiffIdxSplit) * SplitNum);
73-
// int splitListIdx = 1;
74-
//
75-
// /// get the first k-mer to write
76-
// for(size_t file = 0; file < numOfSplitFiles; file++){
77-
// lookingKmers[file] = getNextKmer(0, diffFileList[file], diffFileIdx[file]);
78-
// lookingInfos[file] = infoFileList[file].data[0];
79-
// infoFileIdx[file] ++;
80-
// }
81-
//
82-
// size_t idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
83-
// uint64_t lastWrittenKmer = 0;
84-
// uint64_t entryKmer = lookingKmers[idxOfMin];
85-
// TargetKmerInfo entryInfo = lookingInfos[idxOfMin];
86-
//
87-
// // write first k-mer
88-
// getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx);
89-
// lastWrittenKmer = entryKmer;
90-
// writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx);
91-
// writtenKmerCnt++;
92-
// int splitCheck = 0;
93-
// int endFlag = 0;
94-
//
95-
// while(true){
96-
// // update entry k-mer
97-
// entryKmer = lookingKmers[idxOfMin];
98-
// entryInfo = lookingInfos[idxOfMin];
99-
//
100-
// ///update looking k-mers
101-
// lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]);
102-
// lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]];
103-
// infoFileIdx[idxOfMin] ++;
104-
// if( diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){
105-
// lookingKmers[idxOfMin] = UINT64_MAX;
106-
// numOfincompletedFiles--;
107-
// if(numOfincompletedFiles == 0) break;
108-
// }
109-
// idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
110-
//
111-
// int hasSeenOtherStrains = 0;
112-
// while(taxIdListAtRank[entryInfo.sequenceID] == taxIdListAtRank[lookingInfos[idxOfMin].sequenceID]){
113-
// if(entryKmer != lookingKmers[idxOfMin]) break;
114-
//
115-
// hasSeenOtherStrains += (taxIdList[entryInfo.sequenceID] != taxIdList[lookingInfos[idxOfMin].sequenceID]);
116-
//
117-
// lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]);
118-
// lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]];
119-
// infoFileIdx[idxOfMin] ++;
120-
//
121-
// if(diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){
122-
// lookingKmers[idxOfMin] = UINT64_MAX;
123-
// numOfincompletedFiles--;
124-
// if(numOfincompletedFiles == 0){
125-
// endFlag = 1;
126-
// break;
127-
// }
128-
// }
129-
// idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
130-
// }
131-
//
132-
// entryInfo.redundancy = (hasSeenOtherStrains > 0 || entryInfo.redundancy);
133-
// getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx);
134-
// lastWrittenKmer = entryKmer;
135-
// writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx);
136-
// writtenKmerCnt++;
137-
//
138-
// if(AminoAcid(lastWrittenKmer) != AAofTempSplitOffset && splitCheck == 1){
139-
// splitList[splitListIdx++] = {lastWrittenKmer, totalBufferIdx, totalInfoIdx};
140-
// splitCheck = 0;
141-
// }
142-
//
143-
// if(writtenKmerCnt == offsetList[offsetListIdx]){
144-
// AAofTempSplitOffset = AminoAcid(lastWrittenKmer);
145-
// splitCheck = 1;
146-
// offsetListIdx++;
147-
// }
148-
//
149-
// if(endFlag == 1) break;
150-
// }
151-
//
152-
// cre->flushInfoBuf(infoBuffer, mergedInfoFile, infoBufferIdx);
153-
// cre->flushKmerBuf(diffBuffer, mergedDiffFile, diffBufferIdx);
154-
// fwrite(splitList, sizeof(DiffIdxSplit), SplitNum, diffIdxSplitFile);
155-
// for(int i = 0; i < SplitNum; i++){
156-
// cout<<splitList[i].ADkmer<< " "<<splitList[i].diffIdxOffset<< " "<<splitList[i].infoIdxOffset<<endl;
157-
// }
158-
// free(diffBuffer);
159-
// free(infoBuffer);
160-
// fclose(mergedDiffFile);
161-
// fclose(mergedInfoFile);
162-
// fclose(diffIdxSplitFile);
163-
//
164-
// for(size_t file = 0; file < numOfSplitFiles; file++){
165-
// munmap(diffFileList[file].data, diffFileList[file].fileSize + 1);
166-
// munmap(infoFileList[file].data, infoFileList[file].fileSize + 1);
167-
// }
168-
// cout<<"Creating target DB is done"<<endl;
169-
// cout<<"Total k-mer count : " << numOfKmerBeforeMerge <<endl;
170-
// cout<<"Written k-mer count : " << writtenKmerCnt << endl;
171-
//
172-
// delete[] diffFileList;
173-
// delete[] infoFileList;
174-
// delete[] lookingInfos;
175-
// delete[] lookingKmers;
176-
// delete[] diffFileIdx;
177-
// delete[] infoFileIdx;
178-
//}
179-
180-
18123
// Merge differential index and k-mer information files, reducing redundancy
18224
void FileMerger::mergeTargetFiles(const LocalParameters & par, int numOfSplits) {
18325
size_t writtenKmerCnt = 0;

src/commons/IndexCreator.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1076,7 +1076,6 @@ void IndexCreator::editTaxonomyDumpFiles(const vector<pair<string, pair<TaxID, T
10761076
}
10771077

10781078
std::string line;
1079-
size_t count = 0;
10801079
unordered_map<int, int> mergedMap;
10811080
while (std::getline(ss, line)) {
10821081
std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 2);
@@ -1171,4 +1170,4 @@ TaxID IndexCreator::getMaxTaxID() {
11711170
ss.close();
11721171

11731172
return maxTaxID;
1174-
}
1173+
}

src/commons/IndexCreator.h

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -185,23 +185,32 @@ class IndexCreator{
185185
unordered_map<string, TaxID> & foundAcc2taxid);
186186

187187
static void getSeqSegmentsWithHead(vector<SequenceBlock> & seqSegments, const char * seqFileName);
188+
188189
IndexCreator(const LocalParameters & par);
190+
189191
IndexCreator() {taxonomy = nullptr;}
192+
190193
~IndexCreator();
194+
191195
int getNumOfFlush();
192-
void startIndexCreatingParallel(const LocalParameters & par);
193196

194197
void createIndex(const LocalParameters & par);
195198

196199
void updateIndex(const LocalParameters & par);
197200

198201
void getDiffIdx(const uint64_t & lastKmer, const uint64_t & entryToWrite, FILE* handleKmerTable,
199202
uint16_t *kmerBuf, size_t & localBufIdx);
203+
200204
void getDiffIdx(const uint64_t & lastKmer, const uint64_t & entryToWrite, FILE* handleKmerTable,
201205
uint16_t *kmerBuf, size_t & localBufIdx, size_t & totalBufferIdx);
206+
202207
void writeInfo(TargetKmerInfo * entryToWrite, FILE * infoFile, TargetKmerInfo * infoBuffer, size_t & infoBufferIdx);
208+
203209
static void flushKmerBuf(uint16_t *buffer, FILE *handleKmerTable, size_t & localBufIdx);
210+
204211
static void flushInfoBuf(TargetKmerInfo * buffer, FILE * infoFile, size_t & localBufIdx );
205212

213+
void makeAAoffsets(const LocalParameters & par);
214+
206215
};
207216
#endif //ADKMER4_INDEXCREATOR_H

src/commons/Kmer.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ struct DiffIdxSplit{
4141
DiffIdxSplit(uint64_t ADkmer, size_t diffIdxOffset, size_t infoIdxOffset) : ADkmer(ADkmer), diffIdxOffset(diffIdxOffset), infoIdxOffset(infoIdxOffset) { }
4242
DiffIdxSplit(const DiffIdxSplit & copy) {ADkmer = copy.ADkmer; diffIdxOffset = copy.diffIdxOffset; infoIdxOffset=copy.infoIdxOffset;}
4343
DiffIdxSplit() {};
44+
DiffIdxSplit& operator=(const DiffIdxSplit&) = default;
4445
uint64_t ADkmer;
4546
size_t diffIdxOffset;
4647
size_t infoIdxOffset;

0 commit comments

Comments
 (0)