|
12 | 12 | * |
13 | 13 | * Likelihood function for GPU |
14 | 14 | **********************************************************************/ |
| 15 | +inline void computeBoundsGPU(int threads, int packets, size_t elements, vector<size_t> &limits) { |
| 16 | + int parallel_threads = 1; // to replace the VectorClass::size() |
| 17 | + |
| 18 | + //It is assumed that threads divides packets evenly |
| 19 | + limits.reserve(packets+1); |
| 20 | + elements = roundUpToMultiple(elements, parallel_threads); |
| 21 | + size_t block_start = 0; |
| 22 | + |
| 23 | + for (int wave = packets/threads; wave>=1; --wave) { |
| 24 | + size_t elementsThisWave = (elements-block_start); |
| 25 | + if (1<wave) { |
| 26 | + elementsThisWave = (elementsThisWave * 3) / 4; |
| 27 | + } |
| 28 | + elementsThisWave = roundUpToMultiple(elementsThisWave, parallel_threads); |
| 29 | + size_t stopElementThisWave = block_start + elementsThisWave; |
| 30 | + for (int threads_to_go=threads; 1<=threads_to_go; --threads_to_go) { |
| 31 | + limits.push_back(block_start); |
| 32 | + size_t block_size = (stopElementThisWave - block_start)/threads_to_go; |
| 33 | + block_size = roundUpToMultiple(block_size, parallel_threads); |
| 34 | + block_start += block_size; |
| 35 | + } |
| 36 | + } |
| 37 | + limits.push_back(elements); |
| 38 | + |
| 39 | + if (limits.size() != packets+1) { |
| 40 | + if (Params::getInstance().num_threads == 0) |
| 41 | + outError("Too many threads may slow down analysis [-nt option]. Reduce threads"); |
| 42 | + else |
| 43 | + outError("Too many threads may slow down analysis [-nt option]. Reduce threads or use -nt AUTO to automatically determine it"); |
| 44 | + } |
| 45 | +} |
15 | 46 |
|
16 | 47 | //void PhyloTree::computePartialLikelihoodGPU(TraversalInfo &info |
17 | 48 | // , size_t ptn_lower, size_t ptn_upper, int packet_id) |
|
87 | 118 | // partial_lh_leaves = aligned_alloc<double>(get_safe_upper_limit((aln->STATE_UNKNOWN+1)*block*num_leaves)); |
88 | 119 | // double *buffer_tmp = aligned_alloc<double>(nstates); |
89 | 120 | // |
90 | | -// computePartialInfo<VectorClass>(info, (VectorClass*)buffer_tmp, echildren, partial_lh_leaves); |
91 | | -// |
| 121 | +// computePartialInfoGPU(info, buffer_tmp, echildren, partial_lh_leaves); |
92 | 122 | // aligned_free(buffer_tmp); |
93 | 123 | // } else { |
94 | 124 | // echildren = info.echildren; |
|
123 | 153 | // double *vec_left = buffer_partial_lh_ptr + thread_buf_size * packet_id; |
124 | 154 | // |
125 | 155 | // double *vec_right = &vec_left[block*parallel_threads]; // HK: tmp remove SITE_MODEL |
126 | | -// VectorClass *partial_lh_tmp = (VectorClass*)vec_right+block; // HK: tmp remove SITE_MODEL |
| 156 | +// double *partial_lh_tmp = vec_right+block; // HK: tmp remove SITE_MODEL |
127 | 157 | // |
128 | 158 | // auto leftStateRow = this->getConvertedSequenceByNumber(left->node->id); |
129 | 159 | // auto rightStateRow = this->getConvertedSequenceByNumber(right->node->id); |
130 | 160 | // auto unknown = aln->STATE_UNKNOWN; |
131 | 161 | // |
132 | 162 | // for (size_t ptn = ptn_lower; ptn < ptn_upper; ptn+=parallel_threads) { |
133 | | -// VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block); |
| 163 | +// double *partial_lh = dad_branch->partial_lh + ptn*block; |
134 | 164 | // |
135 | 165 | // // HK: tmp remove SITE_MODEL |
136 | | -// VectorClass *vleft = (VectorClass*)vec_left; |
137 | | -// VectorClass *vright = (VectorClass*)vec_right; |
| 166 | +// double *vleft = vec_left; |
| 167 | +// double *vright = vec_right; |
138 | 168 | // // load data for tip |
139 | 169 | // for (size_t x = 0; x < parallel_threads; x++) { |
140 | 170 | // int leftState; |
|
211 | 241 | // |
212 | 242 | // |
213 | 243 | // double *vec_left = buffer_partial_lh_ptr + thread_buf_size * packet_id; |
214 | | -// VectorClass *partial_lh_tmp = (VectorClass*)vec_left+block; // HK: tmp remove SITE_MODEL |
| 244 | +// double *partial_lh_tmp = vec_left+block; // HK: tmp remove SITE_MODEL |
215 | 245 | // |
216 | 246 | // auto leftStateRow = this->getConvertedSequenceByNumber(left->node->id); |
217 | 247 | // auto unknown = aln->STATE_UNKNOWN; |
218 | 248 | // |
219 | 249 | // for (size_t ptn = ptn_lower; ptn < ptn_upper; ptn+=parallel_threads) { |
220 | | -// VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block); |
221 | | -// VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block); |
222 | | -// VectorClass lh_max = 0.0; |
| 250 | +// double *partial_lh = dad_branch->partial_lh + ptn*block; |
| 251 | +// double *partial_lh_right = right->partial_lh + ptn*block; |
| 252 | +// double lh_max = 0.0; |
223 | 253 | // |
224 | 254 | // // HK: tmp remove SITE_MODEL |
225 | | -// VectorClass *vleft = (VectorClass*)vec_left; |
| 255 | +// double *vleft = vec_left; |
226 | 256 | // // load data for tip |
227 | 257 | // for (size_t x = 0; x < parallel_threads; x++) { |
228 | 258 | // int state; |
|
253 | 283 | // double *inv_evec_ptr = inv_evec + mix_addr_malign[c]; |
254 | 284 | // // compute real partial likelihood vector |
255 | 285 | // for (size_t x = 0; x < nstates; x++) { |
256 | | -// VectorClass vright; |
| 286 | +// double vright; |
257 | 287 | // |
258 | 288 | // dotProductVec<VectorClass, double, FMA>(eright_ptr, partial_lh_right, vright, nstates); |
259 | 289 | // |
|
295 | 325 | // |
296 | 326 | // /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/ |
297 | 327 | // |
298 | | -// VectorClass *partial_lh_tmp |
299 | | -// = (VectorClass*)(buffer_partial_lh_ptr + thread_buf_size * packet_id); |
| 328 | +// double *partial_lh_tmp |
| 329 | +// = buffer_partial_lh_ptr + thread_buf_size * packet_id; |
300 | 330 | // for (size_t ptn = ptn_lower; ptn < ptn_upper; ptn+=parallel_threads) { |
301 | | -// VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block); |
302 | | -// VectorClass *partial_lh_left = (VectorClass*)(left->partial_lh + ptn*block); |
303 | | -// VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block); |
304 | | -// VectorClass lh_max = 0.0; |
| 331 | +// double *partial_lh = (dad_branch->partial_lh + ptn*block); |
| 332 | +// double *partial_lh_left = (left->partial_lh + ptn*block); |
| 333 | +// double *partial_lh_right = (right->partial_lh + ptn*block); |
| 334 | +// double lh_max = 0.0; |
305 | 335 | // UBYTE *scale_dad, *scale_left, *scale_right; |
306 | 336 | // |
307 | 337 | // // HK: tmp remove SAFE_NUMERIC |
|
315 | 345 | // |
316 | 346 | // double *eleft_ptr = eleft; |
317 | 347 | // double *eright_ptr = eright; |
318 | | -// VectorClass *expleft, *expright, *eval_ptr, *evec_ptr, *inv_evec_ptr; |
| 348 | +// double *expleft, *expright, *eval_ptr, *evec_ptr, *inv_evec_ptr; |
319 | 349 | // |
320 | 350 | // // HK: tmp remove SITE_MODEL |
321 | 351 | // |
@@ -461,5 +491,114 @@ void PhyloTree::computePartialInfoGPU(TraversalInfo &info, double* buffer, doubl |
461 | 491 | } |
462 | 492 |
|
463 | 493 |
|
| 494 | +void PhyloTree::computeTraversalInfoGPU(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh) { |
| 495 | + |
| 496 | + if ((tip_partial_lh_computed & 1) == 0) { |
| 497 | + computeTipPartialLikelihoodGPU(); |
| 498 | + } |
| 499 | + |
| 500 | + traversal_info.clear(); |
| 501 | + size_t nstates = aln->num_states; |
| 502 | + int parallel_threads = 1; |
| 503 | + |
| 504 | + // reserve beginning of buffer_partial_lh for other purpose |
| 505 | + size_t ncat_mix = 1; |
| 506 | + size_t block = aln->num_states; |
| 507 | + double *buffer = buffer_partial_lh + block*parallel_threads*num_packets + get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+2); |
| 508 | + |
| 509 | + // HK: tmp remove non-reversible models |
| 510 | +/* |
| 511 | + // more buffer for non-reversible models |
| 512 | + if (!model->useRevKernel()) { |
| 513 | + buffer += get_safe_upper_limit(3*block*nstates); |
| 514 | + buffer += get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+1)*2; |
| 515 | + buffer += block*2*parallel_threads*num_packets; |
| 516 | + } |
| 517 | +*/ |
| 518 | + |
| 519 | + // HK: tmp remove mem save |
| 520 | + |
| 521 | + PhyloNeighbor *dad_branch = (PhyloNeighbor*)dad->findNeighbor(node); |
| 522 | + PhyloNeighbor *node_branch = (PhyloNeighbor*)node->findNeighbor(dad); |
| 523 | + bool dad_locked = computeTraversalInfo(dad_branch, dad, buffer); |
| 524 | + bool node_locked = computeTraversalInfo(node_branch, node, buffer); |
| 525 | + |
| 526 | + // HK: tmp remove mem save |
| 527 | + |
| 528 | +/* |
| 529 | + if (verbose_mode >= VB_DEBUG && traversal_info.size() > 0) { |
| 530 | + Node *saved = root; |
| 531 | + root = dad; |
| 532 | + drawTree(cout); |
| 533 | + root = saved; |
| 534 | + } |
| 535 | +*/ |
| 536 | + |
| 537 | + if (traversal_info.empty()) |
| 538 | + return; |
| 539 | + |
| 540 | + if (!model->isSiteSpecificModel()) { |
| 541 | + |
| 542 | + int num_info = traversal_info.size(); |
| 543 | + |
| 544 | + // HK: tmp debugging verbose mode |
| 545 | + /* if (verbose_mode >= VB_DEBUG) { |
| 546 | + cout << "traversal order:"; |
| 547 | + for (auto it = traversal_info.begin(); it != traversal_info.end(); it++) { |
| 548 | + cout << " "; |
| 549 | + if (it->dad->isLeaf()) |
| 550 | + cout << it->dad->name; |
| 551 | + else |
| 552 | + cout << it->dad->id; |
| 553 | + cout << "->"; |
| 554 | + if (it->dad_branch->node->isLeaf()) |
| 555 | + cout << it->dad_branch->node->name; |
| 556 | + else |
| 557 | + cout << it->dad_branch->node->id; |
| 558 | + if (params->lh_mem_save == LM_MEM_SAVE) { |
| 559 | + if (it->dad_branch->partial_lh_computed) |
| 560 | + cout << " ["; |
| 561 | + else |
| 562 | + cout << " ("; |
| 563 | + cout << mem_slots.findNei(it->dad_branch) - mem_slots.begin(); |
| 564 | + if (it->dad_branch->partial_lh_computed) |
| 565 | + cout << "]"; |
| 566 | + else |
| 567 | + cout << ")"; |
| 568 | + } |
| 569 | + } |
| 570 | + cout << endl; |
| 571 | + }*/ |
| 572 | + |
| 573 | + |
| 574 | + if (!Params::getInstance().buffer_mem_save) { |
| 575 | + |
| 576 | + double *buffer_tmp = (double*)buffer; |
| 577 | + |
| 578 | + for (int i = 0; i < num_info; i++) { |
| 579 | + computePartialInfoGPU(traversal_info[i], buffer_tmp); |
| 580 | + } |
| 581 | + |
| 582 | + } |
| 583 | + } |
| 584 | + |
| 585 | + if (compute_partial_lh) { |
| 586 | + vector<size_t> limits; |
| 587 | + size_t orig_nptn = roundUpToMultiple(aln->size(), parallel_threads); |
| 588 | + size_t nptn = roundUpToMultiple(orig_nptn+model_factory->unobserved_ptns.size(),parallel_threads); |
| 589 | + computeBoundsGPU(num_threads, num_packets, nptn, limits); |
| 590 | + |
| 591 | +#ifdef _OPENMP |
| 592 | +#pragma omp parallel for schedule(dynamic,1) num_threads(num_threads) |
| 593 | +#endif |
| 594 | + for (int packet_id = 0; packet_id < num_packets; ++packet_id) { |
| 595 | + for (auto it = traversal_info.begin(); it != traversal_info.end(); it++) { |
| 596 | + computePartialLikelihood(*it, limits[packet_id], limits[packet_id+1], packet_id); |
| 597 | + } |
| 598 | + } |
| 599 | + traversal_info.clear(); |
| 600 | + } |
| 601 | + return; |
| 602 | +} |
464 | 603 |
|
465 | 604 | #endif //IQTREE_PHYLOKERNELGPU_H |
0 commit comments