Skip to content

Commit 26f6f1a

Browse files
Refactor MPI execution branching selection and allow unused branches to be optimized away
(cherry picked from commit 14709d7)
1 parent 3f94f3a commit 26f6f1a

File tree

4 files changed

+40
-15
lines changed

4 files changed

+40
-15
lines changed

Compiler/src/exastencils/app/ir/IR_LayerHandler.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
212212
CUDA_AdaptKernelDimensionality,
213213
CUDA_HandleFragmentLoops,
214214
CUDA_HandleReductions,
215-
CUDA_ReplaceStdFunctionCallsWrapper))
215+
CUDA_ReplaceStdFunctionCallsWrapper,
216+
CUDA_SetExecutionBranching))
216217

217218
scheduler.register(IR_LayoutTansformation)
218219

Compiler/src/exastencils/parallelization/api/cuda/CUDA_AnnotateLoop.scala

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,6 @@ object CUDA_AnnotateLoop extends DefaultStrategy("Calculate the annotations for
171171
}
172172
}, false)
173173

174-
this += new Transformation("Set final condition for host/device selection", {
175-
case c : IR_IfCondition if c.hasAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION) =>
176-
c.condition = c.removeAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION).get.asInstanceOf[NoDuplicateWrapper[IR_Expression]].value
177-
c
178-
}, false)
179-
180174
/// CUDA_GatherLoopIteratorUsage
181175
object CUDA_GatherLoopIteratorUsage extends QuietDefaultStrategy("Gather surrounding loop iterator accesses") {
182176
var loopIterators : Set[String] = Set[String]()

Compiler/src/exastencils/parallelization/api/cuda/CUDA_ExecutionBranching.scala

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,38 @@ import exastencils.base.ir._
66
import exastencils.base.ir.IR_ImplicitConversion._
77
import exastencils.config.Knowledge
88
import exastencils.config.Platform
9+
import exastencils.datastructures.DefaultStrategy
10+
import exastencils.datastructures.Transformation
911
import exastencils.util.NoDuplicateWrapper
1012

1113
// compile switch for cpu/gpu exec
1214
trait CUDA_ExecutionBranching {
13-
def getHostDeviceBranchingMPI(hostStmts : ListBuffer[IR_Statement], deviceStmts : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
14-
val defaultChoice : IR_Expression = Knowledge.cuda_preferredExecution match {
15-
case _ if !Platform.hw_gpu_gpuDirectAvailable => 1 // if GPUDirect is not available default to CPU
16-
case "Host" => 1 // CPU by default
17-
case "Device" => 0 // GPU by default
18-
case "Performance" => 1 // FIXME: Knowledge flag
15+
16+
private def getDefaultChoiceMPI() : IR_Expression = {
17+
Knowledge.cuda_preferredExecution match {
18+
case _ if !Platform.hw_gpu_gpuDirectAvailable => true // if GPUDirect is not available default to CPU
19+
case "Host" => true // CPU by default
20+
case "Device" => false // GPU by default
21+
case "Performance" => true // FIXME: Knowledge flag
1922
case "Condition" => Knowledge.cuda_executionCondition
2023
}
24+
}
25+
26+
def getHostDeviceBranchingMPICondWrapper(condWrapper : NoDuplicateWrapper[IR_Expression],
27+
hostStmts : ListBuffer[IR_Statement], deviceStmts : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
28+
29+
// get execution choice
30+
condWrapper.value = getDefaultChoiceMPI()
31+
32+
// set dummy first to prevent IR_GeneralSimplify from removing the branch statement until the condition is final
33+
val branch = IR_IfCondition(IR_VariableAccess("replaceIn_CUDA_SetExecutionBranching", IR_BooleanDatatype), hostStmts, deviceStmts)
34+
branch.annotate(CUDA_Util.CUDA_BRANCH_CONDITION, condWrapper)
35+
ListBuffer[IR_Statement](branch)
36+
}
37+
38+
def getHostDeviceBranchingMPI(hostStmts : ListBuffer[IR_Statement], deviceStmts : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
39+
// get execution choice
40+
val defaultChoice = getDefaultChoiceMPI()
2141

2242
ListBuffer[IR_Statement](IR_IfCondition(defaultChoice, hostStmts, deviceStmts))
2343
}
@@ -45,8 +65,16 @@ trait CUDA_ExecutionBranching {
4565
condWrapper.value = getDefaultChoice(estimatedFasterHostExec)
4666

4767
// set dummy first to prevent IR_GeneralSimplify from removing the branch statement until the condition is final
48-
val branch = IR_IfCondition(IR_VariableAccess("replaceIn_CUDA_AnnotateLoops", IR_BooleanDatatype), hostStmts, deviceStmts)
68+
val branch = IR_IfCondition(IR_VariableAccess("replaceIn_CUDA_SetExecutionBranching", IR_BooleanDatatype), hostStmts, deviceStmts)
4969
branch.annotate(CUDA_Util.CUDA_BRANCH_CONDITION, condWrapper)
5070
ListBuffer[IR_Statement](branch)
5171
}
5272
}
73+
74+
object CUDA_SetExecutionBranching extends DefaultStrategy("Set final condition for host/device selection") {
75+
this += new Transformation("..", {
76+
case c : IR_IfCondition if c.hasAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION) =>
77+
c.condition = c.removeAnnotation(CUDA_Util.CUDA_BRANCH_CONDITION).get.asInstanceOf[NoDuplicateWrapper[IR_Expression]].value
78+
c
79+
}, false)
80+
}

Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import exastencils.logger.Logger
3333
import exastencils.parallelization.api.mpi._
3434
import exastencils.parallelization.ir.IR_HasParallelizationInfo
3535
import exastencils.timing.ir.IR_TimerFunctions
36+
import exastencils.util.NoDuplicateWrapper
3637
import exastencils.util.ir._
3738

3839
/// CUDA_PrepareMPICode
@@ -251,7 +252,8 @@ object CUDA_PrepareMPICode extends DefaultStrategy("Prepare CUDA relevant code b
251252
deviceStmts ++= afterDevice
252253

253254
/// compile final switch
254-
getHostDeviceBranchingMPI(hostStmts, deviceStmts)
255+
val condWrapper = NoDuplicateWrapper[IR_Expression](null)
256+
getHostDeviceBranchingMPICondWrapper(condWrapper, hostStmts, deviceStmts)
255257
}
256258
}, false)
257259
}

0 commit comments

Comments (0)