@@ -20,164 +20,6 @@ FileMerger::~FileMerger() {
2020 delete taxonomy;
2121}
2222
23- // void FileMerger::mergeTargetFiles(std::vector<char*> diffIdxFileNames, std::vector<char*> infoFileNames, vector<int> & taxIdListAtRank, vector<int> & taxIdList) {
24- // size_t writtenKmerCnt = 0;
25- //
26- // /// Output files to write to, and the buffers used to fill them
27- // FILE * mergedDiffFile = fopen(mergedDiffFileName, "wb");
28- // FILE * mergedInfoFile = fopen(mergedInfoFileName, "wb");
29- // FILE * diffIdxSplitFile = fopen(diffIdxSplitFileName, "wb");
30- // uint16_t * diffBuffer = (uint16_t *)malloc(sizeof(uint16_t) * kmerBufSize);
31- // size_t diffBufferIdx = 0;
32- // size_t totalBufferIdx = 0;
33- // TargetKmerInfo * infoBuffer = (TargetKmerInfo *)malloc(sizeof(TargetKmerInfo) * kmerBufSize);
34- // size_t infoBufferIdx = 0;
35- // size_t totalInfoIdx = 0;
36- //
37- // ///Prepare files to merge
38- // size_t numOfSplitFiles = diffIdxFileNames.size();
39- // size_t numOfincompletedFiles = numOfSplitFiles;
40- // size_t numOfKmerBeforeMerge = 0;
41- // uint64_t * lookingKmers = new uint64_t[numOfSplitFiles];
42- // // uint64_t lookingKmers[numOfSplitFiles];
43- // // TargetKmerInfo lookingInfos[numOfSplitFiles];
44- // auto * lookingInfos = new TargetKmerInfo[numOfSplitFiles];
45- // //size_t diffFileIdx[numOfSplitFiles];
46- // auto * diffFileIdx = new size_t[numOfSplitFiles];
47- // memset(diffFileIdx, 0, numOfSplitFiles * sizeof(size_t));
48- // auto * infoFileIdx = new size_t[numOfSplitFiles];
49- // // size_t infoFileIdx[numOfSplitFiles];
50- // memset(infoFileIdx, 0, numOfSplitFiles * sizeof(size_t));
51- // size_t maxIdxOfEachFiles[numOfSplitFiles];
52- // struct MmapedData<uint16_t> *diffFileList = new struct MmapedData<uint16_t>[numOfSplitFiles];
53- // struct MmapedData<TargetKmerInfo> *infoFileList = new struct MmapedData<TargetKmerInfo>[numOfSplitFiles];
54- // for (size_t file = 0; file < numOfSplitFiles; file++) {
55- // diffFileList[file] = mmapData<uint16_t>(diffIdxFileNames[file]);
56- // infoFileList[file] = mmapData<TargetKmerInfo>(infoFileNames[file]);
57- // maxIdxOfEachFiles[file] = diffFileList[file].fileSize / sizeof(uint16_t);
58- // numOfKmerBeforeMerge += infoFileList[file].fileSize / sizeof(TargetKmerInfo);
59- // }
60- //
61- // /// Set up the offset list and split list used to split the differential index
62- // uint64_t AAofTempSplitOffset = UINT64_MAX;
63- // size_t sizeOfSplit = numOfKmerBeforeMerge / (SplitNum - 1);
64- // size_t offsetList[SplitNum + 1];
65- // int offsetListIdx = 1;
66- // for(size_t os = 0; os < SplitNum; os++){
67- // offsetList[os] = os * sizeOfSplit;
68- // }
69- // offsetList[SplitNum] = UINT64_MAX;
70- //
71- // DiffIdxSplit splitList[SplitNum];
72- // memset(splitList, 0, sizeof(DiffIdxSplit) * SplitNum);
73- // int splitListIdx = 1;
74- //
75- // /// get the first k-mer to write
76- // for(size_t file = 0; file < numOfSplitFiles; file++){
77- // lookingKmers[file] = getNextKmer(0, diffFileList[file], diffFileIdx[file]);
78- // lookingInfos[file] = infoFileList[file].data[0];
79- // infoFileIdx[file] ++;
80- // }
81- //
82- // size_t idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
83- // uint64_t lastWrittenKmer = 0;
84- // uint64_t entryKmer = lookingKmers[idxOfMin];
85- // TargetKmerInfo entryInfo = lookingInfos[idxOfMin];
86- //
87- // // write first k-mer
88- // getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx);
89- // lastWrittenKmer = entryKmer;
90- // writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx);
91- // writtenKmerCnt++;
92- // int splitCheck = 0;
93- // int endFlag = 0;
94- //
95- // while(true){
96- // // update entry k-mer
97- // entryKmer = lookingKmers[idxOfMin];
98- // entryInfo = lookingInfos[idxOfMin];
99- //
100- // ///update looking k-mers
101- // lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]);
102- // lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]];
103- // infoFileIdx[idxOfMin] ++;
104- // if( diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){
105- // lookingKmers[idxOfMin] = UINT64_MAX;
106- // numOfincompletedFiles--;
107- // if(numOfincompletedFiles == 0) break;
108- // }
109- // idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
110- //
111- // int hasSeenOtherStrains = 0;
112- // while(taxIdListAtRank[entryInfo.sequenceID] == taxIdListAtRank[lookingInfos[idxOfMin].sequenceID]){
113- // if(entryKmer != lookingKmers[idxOfMin]) break;
114- //
115- // hasSeenOtherStrains += (taxIdList[entryInfo.sequenceID] != taxIdList[lookingInfos[idxOfMin].sequenceID]);
116- //
117- // lookingKmers[idxOfMin] = getNextKmer(entryKmer, diffFileList[idxOfMin], diffFileIdx[idxOfMin]);
118- // lookingInfos[idxOfMin] = infoFileList[idxOfMin].data[infoFileIdx[idxOfMin]];
119- // infoFileIdx[idxOfMin] ++;
120- //
121- // if(diffFileIdx[idxOfMin] > maxIdxOfEachFiles[idxOfMin] ){
122- // lookingKmers[idxOfMin] = UINT64_MAX;
123- // numOfincompletedFiles--;
124- // if(numOfincompletedFiles == 0){
125- // endFlag = 1;
126- // break;
127- // }
128- // }
129- // idxOfMin = smallest(lookingKmers, lookingInfos, taxIdListAtRank, numOfSplitFiles);
130- // }
131- //
132- // entryInfo.redundancy = (hasSeenOtherStrains > 0 || entryInfo.redundancy);
133- // getDiffIdx(lastWrittenKmer, entryKmer, mergedDiffFile, diffBuffer, diffBufferIdx, totalBufferIdx);
134- // lastWrittenKmer = entryKmer;
135- // writeInfo(&entryInfo, mergedInfoFile, infoBuffer, infoBufferIdx, totalInfoIdx);
136- // writtenKmerCnt++;
137- //
138- // if(AminoAcid(lastWrittenKmer) != AAofTempSplitOffset && splitCheck == 1){
139- // splitList[splitListIdx++] = {lastWrittenKmer, totalBufferIdx, totalInfoIdx};
140- // splitCheck = 0;
141- // }
142- //
143- // if(writtenKmerCnt == offsetList[offsetListIdx]){
144- // AAofTempSplitOffset = AminoAcid(lastWrittenKmer);
145- // splitCheck = 1;
146- // offsetListIdx++;
147- // }
148- //
149- // if(endFlag == 1) break;
150- // }
151- //
152- // cre->flushInfoBuf(infoBuffer, mergedInfoFile, infoBufferIdx);
153- // cre->flushKmerBuf(diffBuffer, mergedDiffFile, diffBufferIdx);
154- // fwrite(splitList, sizeof(DiffIdxSplit), SplitNum, diffIdxSplitFile);
155- // for(int i = 0; i < SplitNum; i++){
156- // cout<<splitList[i].ADkmer<< " "<<splitList[i].diffIdxOffset<< " "<<splitList[i].infoIdxOffset<<endl;
157- // }
158- // free(diffBuffer);
159- // free(infoBuffer);
160- // fclose(mergedDiffFile);
161- // fclose(mergedInfoFile);
162- // fclose(diffIdxSplitFile);
163- //
164- // for(size_t file = 0; file < numOfSplitFiles; file++){
165- // munmap(diffFileList[file].data, diffFileList[file].fileSize + 1);
166- // munmap(infoFileList[file].data, infoFileList[file].fileSize + 1);
167- // }
168- // cout<<"Creating target DB is done"<<endl;
169- // cout<<"Total k-mer count : " << numOfKmerBeforeMerge <<endl;
170- // cout<<"Written k-mer count : " << writtenKmerCnt << endl;
171- //
172- // delete[] diffFileList;
173- // delete[] infoFileList;
174- // delete[] lookingInfos;
175- // delete[] lookingKmers;
176- // delete[] diffFileIdx;
177- // delete[] infoFileIdx;
178- // }
179-
180-
18123// Merge differential index and k-mer information files, reducing redundancy
18224void FileMerger::mergeTargetFiles (const LocalParameters & par, int numOfSplits) {
18325 size_t writtenKmerCnt = 0 ;
0 commit comments