Skip to content

Commit 26f6f1a

Browse files
Refactor MPI execution branching selection and allow unused branches to be optimized away
(cherry picked from commit 14709d7)
1 parent 3f94f3a commit 26f6f1a

File tree

4 files changed

+40
-15
lines changed

4 files changed

+40
-15
lines changed

Compiler/src/exastencils/app/ir/IR_LayerHandler.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
212212
CUDA_AdaptKernelDimensionality,
213213
CUDA_HandleFragmentLoops,
214214
CUDA_HandleReductions,
215-
CUDA_ReplaceStdFunctionCallsWrapper))
215+
CUDA_ReplaceStdFunctionCallsWrapper,
216+
CUDA_SetExecutionBranching))
216217

217218
scheduler.register(IR_LayoutTansformation)
218219

Compiler/src/exastencils/parallelization/api/cuda/CUDA_AnnotateLoop.scala

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,6 @@ object CUDA_AnnotateLoop extends DefaultStrategy("Calculate the annotations for
171171
}
172172
}, false)
173173

174-
this += new Transformation("Set final condition for host/device selection", {
175-
case c : IR_IfCondition if c.hasAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION) =>
176-
c.condition = c.removeAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION).get.asInstanceOf[NoDuplicateWrapper[IR_Expression]].value
177-
c
178-
}, false)
179-
180174
/// CUDA_GatherLoopIteratorUsage
181175
object CUDA_GatherLoopIteratorUsage extends QuietDefaultStrategy("Gather surrounding loop iterator accesses") {
182176
var loopIterators : Set[String] = Set[String]()

Compiler/src/exastencils/parallelization/api/cuda/CUDA_ExecutionBranching.scala

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,38 @@ import exastencils.base.ir._
66
import exastencils.base.ir.IR_ImplicitConversion._
77
import exastencils.config.Knowledge
88
import exastencils.config.Platform
9+
import exastencils.datastructures.DefaultStrategy
10+
import exastencils.datastructures.Transformation
911
import exastencils.util.NoDuplicateWrapper
1012

1113
// compile switch for cpu/gpu exec
1214
trait CUDA_ExecutionBranching {
13-
def getHostDeviceBranchingMPI(hostStmts : ListBuffer[IR_Statement], deviceStmts : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
14-
val defaultChoice : IR_Expression = Knowledge.cuda_preferredExecution match {
15-
case _ if !Platform.hw_gpu_gpuDirectAvailable => 1 // if GPUDirect is not available default to CPU
16-
case "Host" => 1 // CPU by default
17-
case "Device" => 0 // GPU by default
18-
case "Performance" => 1 // FIXME: Knowledge flag
15+
16+
private def getDefaultChoiceMPI() : IR_Expression = {
17+
Knowledge.cuda_preferredExecution match {
18+
case _ if !Platform.hw_gpu_gpuDirectAvailable => true // if GPUDirect is not available default to CPU
19+
case "Host" => true // CPU by default
20+
case "Device" => false // GPU by default
21+
case "Performance" => true // FIXME: Knowledge flag
1922
case "Condition" => Knowledge.cuda_executionCondition
2023
}
24+
}
25+
26+
def getHostDeviceBranchingMPICondWrapper(condWrapper : NoDuplicateWrapper[IR_Expression],
27+
hostStmts : ListBuffer[IR_Statement], deviceStmts : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
28+
29+
// get execution choice
30+
condWrapper.value = getDefaultChoiceMPI()
31+
32+
// set dummy first to prevent IR_GeneralSimplify from removing the branch statement until the condition is final
33+
val branch = IR_IfCondition(IR_VariableAccess("replaceIn_CUDA_SetExecutionBranching", IR_BooleanDatatype), hostStmts, deviceStmts)
34+
branch.annotate(CUDA_Util.CUDA_BRANCH_CONDITION, condWrapper)
35+
ListBuffer[IR_Statement](branch)
36+
}
37+
38+
def getHostDeviceBranchingMPI(hostStmts : ListBuffer[IR_Statement], deviceStmts : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
39+
// get execution choice
40+
val defaultChoice = getDefaultChoiceMPI()
2141

2242
ListBuffer[IR_Statement](IR_IfCondition(defaultChoice, hostStmts, deviceStmts))
2343
}
@@ -45,8 +65,16 @@ trait CUDA_ExecutionBranching {
4565
condWrapper.value = getDefaultChoice(estimatedFasterHostExec)
4666

4767
// set dummy first to prevent IR_GeneralSimplify from removing the branch statement until the condition is final
48-
val branch = IR_IfCondition(IR_VariableAccess("replaceIn_CUDA_AnnotateLoops", IR_BooleanDatatype), hostStmts, deviceStmts)
68+
val branch = IR_IfCondition(IR_VariableAccess("replaceIn_CUDA_SetExecutionBranching", IR_BooleanDatatype), hostStmts, deviceStmts)
4969
branch.annotate(CUDA_Util.CUDA_BRANCH_CONDITION, condWrapper)
5070
ListBuffer[IR_Statement](branch)
5171
}
5272
}
73+
74+
object CUDA_SetExecutionBranching extends DefaultStrategy("Set final condition for host/device selection") {
75+
this += new Transformation("..", {
76+
case c : IR_IfCondition if c.hasAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION) =>
77+
c.condition = c.removeAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION).get.asInstanceOf[NoDuplicateWrapper[IR_Expression]].value
78+
c
79+
}, false)
80+
}

Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import exastencils.logger.Logger
3333
import exastencils.parallelization.api.mpi._
3434
import exastencils.parallelization.ir.IR_HasParallelizationInfo
3535
import exastencils.timing.ir.IR_TimerFunctions
36+
import exastencils.util.NoDuplicateWrapper
3637
import exastencils.util.ir._
3738

3839
/// CUDA_PrepareMPICode
@@ -251,7 +252,8 @@ object CUDA_PrepareMPICode extends DefaultStrategy("Prepare CUDA relevant code b
251252
deviceStmts ++= afterDevice
252253

253254
/// compile final switch
254-
getHostDeviceBranchingMPI(hostStmts, deviceStmts)
255+
val condWrapper = NoDuplicateWrapper[IR_Expression](null)
256+
getHostDeviceBranchingMPICondWrapper(condWrapper, hostStmts, deviceStmts)
255257
}
256258
}, false)
257259
}

0 commit comments

Comments (0)