Skip to content

Commit 6a88403

Browse files
committed
Add computeBoundsGPU, computeTraversalInfoGPU and computeTipPartialLikelihoodGPU methods for simple models, removing VectorClass usage
1 parent 4494d80 commit 6a88403

File tree

4 files changed

+175
-194
lines changed

4 files changed

+175
-194
lines changed

tree/phylokernelgpu.h

Lines changed: 158 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,37 @@
1212
*
1313
* Likelihood function for GPU
1414
**********************************************************************/
15+
inline void computeBoundsGPU(int threads, int packets, size_t elements, vector<size_t> &limits) {
16+
int parallel_threads = 1; // to replace the VectorClass::size()
17+
18+
//It is assumed that threads divides packets evenly
19+
limits.reserve(packets+1);
20+
elements = roundUpToMultiple(elements, parallel_threads);
21+
size_t block_start = 0;
22+
23+
for (int wave = packets/threads; wave>=1; --wave) {
24+
size_t elementsThisWave = (elements-block_start);
25+
if (1<wave) {
26+
elementsThisWave = (elementsThisWave * 3) / 4;
27+
}
28+
elementsThisWave = roundUpToMultiple(elementsThisWave, parallel_threads);
29+
size_t stopElementThisWave = block_start + elementsThisWave;
30+
for (int threads_to_go=threads; 1<=threads_to_go; --threads_to_go) {
31+
limits.push_back(block_start);
32+
size_t block_size = (stopElementThisWave - block_start)/threads_to_go;
33+
block_size = roundUpToMultiple(block_size, parallel_threads);
34+
block_start += block_size;
35+
}
36+
}
37+
limits.push_back(elements);
38+
39+
if (limits.size() != packets+1) {
40+
if (Params::getInstance().num_threads == 0)
41+
outError("Too many threads may slow down analysis [-nt option]. Reduce threads");
42+
else
43+
outError("Too many threads may slow down analysis [-nt option]. Reduce threads or use -nt AUTO to automatically determine it");
44+
}
45+
}
1546

1647
//void PhyloTree::computePartialLikelihoodGPU(TraversalInfo &info
1748
// , size_t ptn_lower, size_t ptn_upper, int packet_id)
@@ -87,8 +118,7 @@
87118
// partial_lh_leaves = aligned_alloc<double>(get_safe_upper_limit((aln->STATE_UNKNOWN+1)*block*num_leaves));
88119
// double *buffer_tmp = aligned_alloc<double>(nstates);
89120
//
90-
// computePartialInfo<VectorClass>(info, (VectorClass*)buffer_tmp, echildren, partial_lh_leaves);
91-
//
121+
// computePartialInfoGPU(info, buffer_tmp, echildren, partial_lh_leaves);
92122
// aligned_free(buffer_tmp);
93123
// } else {
94124
// echildren = info.echildren;
@@ -123,18 +153,18 @@
123153
// double *vec_left = buffer_partial_lh_ptr + thread_buf_size * packet_id;
124154
//
125155
// double *vec_right = &vec_left[block*parallel_threads]; // HK: tmp remove SITE_MODEL
126-
// VectorClass *partial_lh_tmp = (VectorClass*)vec_right+block; // HK: tmp remove SITE_MODEL
156+
// double *partial_lh_tmp = vec_right+block; // HK: tmp remove SITE_MODEL
127157
//
128158
// auto leftStateRow = this->getConvertedSequenceByNumber(left->node->id);
129159
// auto rightStateRow = this->getConvertedSequenceByNumber(right->node->id);
130160
// auto unknown = aln->STATE_UNKNOWN;
131161
//
132162
// for (size_t ptn = ptn_lower; ptn < ptn_upper; ptn+=parallel_threads) {
133-
// VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
163+
// double *partial_lh = dad_branch->partial_lh + ptn*block;
134164
//
135165
// // HK: tmp remove SITE_MODEL
136-
// VectorClass *vleft = (VectorClass*)vec_left;
137-
// VectorClass *vright = (VectorClass*)vec_right;
166+
// double *vleft = vec_left;
167+
// double *vright = vec_right;
138168
// // load data for tip
139169
// for (size_t x = 0; x < parallel_threads; x++) {
140170
// int leftState;
@@ -211,18 +241,18 @@
211241
//
212242
//
213243
// double *vec_left = buffer_partial_lh_ptr + thread_buf_size * packet_id;
214-
// VectorClass *partial_lh_tmp = (VectorClass*)vec_left+block; // HK: tmp remove SITE_MODEL
244+
// double *partial_lh_tmp = vec_left+block; // HK: tmp remove SITE_MODEL
215245
//
216246
// auto leftStateRow = this->getConvertedSequenceByNumber(left->node->id);
217247
// auto unknown = aln->STATE_UNKNOWN;
218248
//
219249
// for (size_t ptn = ptn_lower; ptn < ptn_upper; ptn+=parallel_threads) {
220-
// VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
221-
// VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
222-
// VectorClass lh_max = 0.0;
250+
// double *partial_lh = dad_branch->partial_lh + ptn*block;
251+
// double *partial_lh_right = right->partial_lh + ptn*block;
252+
// double lh_max = 0.0;
223253
//
224254
// // HK: tmp remove SITE_MODEL
225-
// VectorClass *vleft = (VectorClass*)vec_left;
255+
// double *vleft = vec_left;
226256
// // load data for tip
227257
// for (size_t x = 0; x < parallel_threads; x++) {
228258
// int state;
@@ -253,7 +283,7 @@
253283
// double *inv_evec_ptr = inv_evec + mix_addr_malign[c];
254284
// // compute real partial likelihood vector
255285
// for (size_t x = 0; x < nstates; x++) {
256-
// VectorClass vright;
286+
// double vright;
257287
//
258288
// dotProductVec<VectorClass, double, FMA>(eright_ptr, partial_lh_right, vright, nstates);
259289
//
@@ -295,13 +325,13 @@
295325
//
296326
// /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
297327
//
298-
// VectorClass *partial_lh_tmp
299-
// = (VectorClass*)(buffer_partial_lh_ptr + thread_buf_size * packet_id);
328+
// double *partial_lh_tmp
329+
// = buffer_partial_lh_ptr + thread_buf_size * packet_id;
300330
// for (size_t ptn = ptn_lower; ptn < ptn_upper; ptn+=parallel_threads) {
301-
// VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
302-
// VectorClass *partial_lh_left = (VectorClass*)(left->partial_lh + ptn*block);
303-
// VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
304-
// VectorClass lh_max = 0.0;
331+
// double *partial_lh = (dad_branch->partial_lh + ptn*block);
332+
// double *partial_lh_left = (left->partial_lh + ptn*block);
333+
// double *partial_lh_right = (right->partial_lh + ptn*block);
334+
// double lh_max = 0.0;
305335
// UBYTE *scale_dad, *scale_left, *scale_right;
306336
//
307337
// // HK: tmp remove SAFE_NUMERIC
@@ -315,7 +345,7 @@
315345
//
316346
// double *eleft_ptr = eleft;
317347
// double *eright_ptr = eright;
318-
// VectorClass *expleft, *expright, *eval_ptr, *evec_ptr, *inv_evec_ptr;
348+
// double *expleft, *expright, *eval_ptr, *evec_ptr, *inv_evec_ptr;
319349
//
320350
// // HK: tmp remove SITE_MODEL
321351
//
@@ -461,5 +491,114 @@ void PhyloTree::computePartialInfoGPU(TraversalInfo &info, double* buffer, doubl
461491
}
462492

463493

494+
/**
 * GPU-path, non-templated variant of computeTraversalInfo() for simple
 * models: it uses plain double buffers and a lane width of 1 instead of a
 * VectorClass.
 *
 * Steps:
 *  1. compute the tip partial likelihoods if bit 0 of
 *     tip_partial_lh_computed is not set;
 *  2. rebuild traversal_info for both directions of the branch (node, dad);
 *  3. unless the model is site-specific or Params::buffer_mem_save is set,
 *     call computePartialInfoGPU() for every traversal entry;
 *  4. if compute_partial_lh is true, split the patterns into num_packets
 *     ranges, run computePartialLikelihood() for every entry on every
 *     packet, and clear traversal_info.
 *
 * Returns early, right after step 2, when traversal_info is empty.
 *
 * @param node                one end of the branch
 * @param dad                 the other end of the branch
 * @param compute_partial_lh  true to also evaluate the partial likelihoods
 *                            (step 4) instead of only collecting the info
 */
void PhyloTree::computeTraversalInfoGPU(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh) {

    if ((tip_partial_lh_computed & 1) == 0) {
        computeTipPartialLikelihoodGPU();
    }

    traversal_info.clear();
    int parallel_threads = 1; // scalar stand-in for the vector lane count

    // reserve beginning of buffer_partial_lh for other purpose
    size_t block = aln->num_states;
    double *buffer = buffer_partial_lh + block*parallel_threads*num_packets + get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+2);

    // HK: tmp remove non-reversible models
    /*
    // more buffer for non-reversible models
    if (!model->useRevKernel()) {
        buffer += get_safe_upper_limit(3*block*nstates);
        buffer += get_safe_upper_limit(block)*(aln->STATE_UNKNOWN+1)*2;
        buffer += block*2*parallel_threads*num_packets;
    }
    */

    // HK: tmp remove mem save

    PhyloNeighbor *dad_branch = (PhyloNeighbor*)dad->findNeighbor(node);
    PhyloNeighbor *node_branch = (PhyloNeighbor*)node->findNeighbor(dad);
    // The bool results (previously stored as dad_locked / node_locked) are
    // deliberately ignored while the mem-save handling is removed.
    computeTraversalInfo(dad_branch, dad, buffer);
    computeTraversalInfo(node_branch, node, buffer);

    // HK: tmp remove mem save
    // HK: the VB_DEBUG tree drawing and "traversal order" printing were
    // temporarily removed from this variant as well.

    if (traversal_info.empty())
        return;

    if (!model->isSiteSpecificModel() && !Params::getInstance().buffer_mem_save) {
        // One shared scratch buffer: this loop is sequential.
        for (auto &info : traversal_info) {
            computePartialInfoGPU(info, buffer);
        }
    }

    if (compute_partial_lh) {
        vector<size_t> limits;
        size_t orig_nptn = roundUpToMultiple(aln->size(), parallel_threads);
        size_t nptn = roundUpToMultiple(orig_nptn+model_factory->unobserved_ptns.size(),parallel_threads);
        computeBoundsGPU(num_threads, num_packets, nptn, limits);

        // Packet i covers patterns [limits[i], limits[i+1]); every packet
        // walks the whole traversal order.
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic,1) num_threads(num_threads)
#endif
        for (int packet_id = 0; packet_id < num_packets; ++packet_id) {
            for (auto &info : traversal_info) {
                computePartialLikelihood(info, limits[packet_id], limits[packet_id+1], packet_id);
            }
        }
        traversal_info.clear();
    }
}
464603

465604
#endif //IQTREE_PHYLOKERNELGPU_H

tree/phylokernelnew.h

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,13 +1156,8 @@ template<class VectorClass>
11561156
#endif
11571157
void PhyloTree::computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh) {
11581158

1159-
if ((tip_partial_lh_computed & 1) == 0) {
1160-
#ifdef OPENMP_GPU
1161-
computeTipPartialLikelihoodGPU();
1162-
#else
1159+
if ((tip_partial_lh_computed & 1) == 0)
11631160
computeTipPartialLikelihood();
1164-
#endif
1165-
}
11661161

11671162
traversal_info.clear();
11681163
#ifndef KERNEL_FIX_STATES
@@ -1252,9 +1247,7 @@ void PhyloTree::computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compu
12521247

12531248

12541249
if (!Params::getInstance().buffer_mem_save) {
1255-
#ifdef OPENMP_GPU
1256-
double *buffer_tmp = (double*)buffer;
1257-
#elif _OPENMP
1250+
#ifdef _OPENMP
12581251
#pragma omp parallel if (num_info >= 3) num_threads(num_threads)
12591252
{
12601253
VectorClass *buffer_tmp = (VectorClass*)buffer + aln->num_states*omp_get_thread_num();
@@ -1263,16 +1256,13 @@ void PhyloTree::computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compu
12631256
VectorClass *buffer_tmp = (VectorClass*)buffer;
12641257
#endif
12651258
for (int i = 0; i < num_info; i++) {
1266-
1267-
#ifdef OPENMP_GPU
1268-
computePartialInfoGPU(traversal_info[i], buffer_tmp);
1269-
#elif defined(KERNEL_FIX_STATES)
1259+
#ifdef KERNEL_FIX_STATES
12701260
computePartialInfo<VectorClass, nstates>(traversal_info[i], buffer_tmp);
12711261
#else
12721262
computePartialInfo<VectorClass>(traversal_info[i], buffer_tmp);
12731263
#endif
12741264
}
1275-
#if defined(_OPENMP) && !defined(OPENMP_GPU)
1265+
#ifdef _OPENMP
12761266
}
12771267
#endif
12781268
}
@@ -2271,7 +2261,9 @@ void PhyloTree::computeLikelihoodDervGenericSIMD(PhyloNeighbor *dad_branch, Phyl
22712261
}
22722262
int branch_id = node_branch->id;
22732263

2274-
#ifdef KERNEL_FIX_STATES
2264+
#ifdef OPENMP_GPU
2265+
computeTraversalInfoGPU(node, dad, false);
2266+
#elif defined(KERNEL_FIX_STATES)
22752267
computeTraversalInfo<VectorClass, nstates>(node, dad, false);
22762268
#else
22772269
computeTraversalInfo<VectorClass>(node, dad, false);
@@ -2692,7 +2684,9 @@ double PhyloTree::computeLikelihoodBranchGenericSIMD(PhyloNeighbor *dad_branch,
26922684
node_branch = tmp_nei;
26932685
}
26942686

2695-
#ifdef KERNEL_FIX_STATES
2687+
#ifdef OPENMP_GPU
2688+
computeTraversalInfoGPU(node, dad, false);
2689+
#elif defined(KERNEL_FIX_STATES)
26962690
computeTraversalInfo<VectorClass, nstates>(node, dad, false);
26972691
#else
26982692
computeTraversalInfo<VectorClass>(node, dad, false);
@@ -3532,7 +3526,12 @@ void PhyloTree::computeLikelihoodDervMixlenGenericSIMD(PhyloNeighbor *dad_branch
35323526
node_branch = tmp_nei;
35333527
}
35343528

3535-
#ifdef KERNEL_FIX_STATES
3529+
#ifdef OPENMP_GPU
3530+
computeTraversalInfoGPU(node, dad, false);
3531+
#ifndef KERNEL_FIX_STATES
3532+
size_t nstates = aln->num_states;
3533+
#endif
3534+
#elif defined(KERNEL_FIX_STATES)
35363535
computeTraversalInfo<VectorClass, nstates>(node, dad, false);
35373536
#else
35383537
computeTraversalInfo<VectorClass>(node, dad, false);

tree/phylotree.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,7 @@ class PhyloTree : public MTree, public Optimization, public CheckpointFactory {
870870
template<class VectorClass>
871871
void computeTraversalInfo(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh);
872872

873+
void computeTraversalInfoGPU(PhyloNode *node, PhyloNode *dad, bool compute_partial_lh);
873874
/**
874875
precompute info for models
875876
*/

0 commit comments

Comments
 (0)