@@ -440,7 +440,8 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD,
440440 const FunctionArgList *Args,
441441 llvm::Value *&GpuThreadId,
442442 llvm::Value *&GlobalGpuThreadId,
443- llvm::Value *&WorkGroupId) {
443+ llvm::Value *&WorkGroupId,
444+ llvm::Value *&TotalNumThreads) {
444445 auto IVPair = EmitNoLoopIV (LD, Args);
445446 Address OMPIterationVarAddr = IVPair.second ;
446447
@@ -468,6 +469,8 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD,
468469 CGM.updateXteamRedKernel (
469470 CapturedForStmt, Builder.CreateIntCast (OMPIterationVar, Int64Ty, false ),
470471 NumTeams);
472+ TotalNumThreads =
473+ Builder.CreateMul (NumTeams, WorkGroupSize, " total_num_threads" );
471474 Builder.CreateStore (OMPIterationVar, OMPIterationVarAddr);
472475
473476 // Emit updates of the original loop indices
@@ -488,8 +491,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseOneCode(
488491 llvm::Value *GpuThreadId = nullptr ;
489492 llvm::Value *GlobalGpuThreadId = nullptr ;
490493 llvm::Value *WorkGroupId = nullptr ;
494+ llvm::Value *TotalNumThreads = nullptr ;
491495 EmitNoLoopXteamScanInit (LD, CapturedForStmt, Args, GpuThreadId,
492- GlobalGpuThreadId, WorkGroupId);
496+ GlobalGpuThreadId, WorkGroupId, TotalNumThreads );
493497
494498 // Branch to end if original loop condition not satisfied
495499 llvm::Value *IvCmp = EvaluateExprAsBool (LD.getCond ());
@@ -539,8 +543,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode(
539543 llvm::Value *GpuThreadId = nullptr ;
540544 llvm::Value *GlobalGpuThreadId = nullptr ;
541545 llvm::Value *WorkGroupId = nullptr ;
546+ llvm::Value *TotalNumThreads = nullptr ;
542547 EmitNoLoopXteamScanInit (LD, CapturedForStmt, Args, GpuThreadId,
543- GlobalGpuThreadId, WorkGroupId);
548+ GlobalGpuThreadId, WorkGroupId, TotalNumThreads );
544549
545550 const CodeGenModule::XteamRedVarMap &RedVarMap =
546551 CGM.getXteamRedVarMap (CapturedForStmt);
@@ -559,137 +564,21 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode(
559564 Address XteamRedSumArg3 = GetAddrOfLocalVar ((*Args)[RVI.ArgPos + 2 ]);
560565 llvm::Value *DScanStorage = Builder.CreateLoad (XteamRedSumArg3);
561566
562- // TODO: Extract a DeviceRTL function out of the PhaseTwo of Xteam Scan
563- // codegen.
564- if (CGM.OMPPresentScanDirective ->hasClausesOfKind <OMPInclusiveClause>()) {
565- // Handle the redistribution of cross-team scan result inside every
566- // constituent team member by emitting this -
567- // RedVar = Storage[GlobalTID]
568- // if(TeamID >= 1)
569- // {
570- // RedVar += TeamVals[TeamID - 1]
571- // }
572- Address ScanStorageValGEP = Address (
573- Builder.CreateGEP (RedVarType, DScanStorage, GlobalGpuThreadId),
574- RedVarType,
575- getContext ().getTypeAlignInChars (
576- XteamVD->getType ())); // Storage[GlobalTID]
577- Builder.CreateStore (Builder.CreateLoad (ScanStorageValGEP),
578- RVI.RedVarAddr ); // RedVar = Storage[GlobalTID]
579- llvm::Value *IsAfterFirstTeam = Builder.CreateICmpUGE (
580- WorkGroupId, llvm::ConstantInt::get (Int32Ty, 1 )); // TeamID >= 1
581- llvm::BasicBlock *IsAfterFirstTeamThenBlock =
582- createBasicBlock (" omp.is.after.first.team.then" );
583- llvm::BasicBlock *InclusiveScanEndBlock =
584- createBasicBlock (" omp.xteam.inclusive.scan.end" );
585- Builder.CreateCondBr (IsAfterFirstTeam, IsAfterFirstTeamThenBlock,
586- InclusiveScanEndBlock);
587- EmitBlock (IsAfterFirstTeamThenBlock);
588- Address PrevTeamValGEP =
589- Address (Builder.CreateGEP (
590- RedVarType, DTeamVals,
591- Builder.CreateSub (WorkGroupId,
592- llvm::ConstantInt::get (Int32Ty, 1 ))),
593- RedVarType,
594- getContext ().getTypeAlignInChars (
595- XteamVD->getType ())); // TeamVals[TeamID - 1]
596- Builder.CreateStore (Builder.CreateAdd (Builder.CreateLoad (RVI.RedVarAddr ),
597- Builder.CreateLoad (PrevTeamValGEP)),
598- RVI.RedVarAddr ); // RedVar += TeamVals[TeamID - 1]
599- EmitBranch (InclusiveScanEndBlock);
600- EmitBlock (InclusiveScanEndBlock);
601- } else {
602- // Redistribution for the 'exclusive' scan is handled differently because
603- // each work-item accesses the temporary output 'Storage' at the index
604- // before its own global thread id (GlobalTID). Emits the following -
605- // RedVar = 0
606- // if(GlobalTID >= 1)
607- // {
608- // RedVar = Storage[GlobalTID - 1]
609- // if(TeamID >= 1)
610- // {
611- // if(localTID >= 1)
612- // RedVar += TeamVals[TeamID - 1];
613- // else if(TeamID >= 2)
614- // RedVar += TeamVals[TeamID - 2];
615- // }
616- // }
617-
618- Builder.CreateStore (llvm::ConstantInt::get (RedVarType, 0 ),
619- RVI.RedVarAddr ); // RedVar = 0
620- llvm::Value *IsNotFirstThread = Builder.CreateICmpUGE (
621- GlobalGpuThreadId,
622- llvm::ConstantInt::get (Int32Ty, 1 )); // GlobalTID >= 1
623- llvm::BasicBlock *IsNotFirstThreadThenBlock =
624- createBasicBlock (" omp.is.not.first.thread.then" );
625- llvm::BasicBlock *ExclusiveScanEndBlock =
626- createBasicBlock (" omp.xteam.exclusive.scan.end" );
627- Builder.CreateCondBr (IsNotFirstThread, IsNotFirstThreadThenBlock,
628- ExclusiveScanEndBlock);
629- EmitBlock (IsNotFirstThreadThenBlock);
630- llvm::Value *PrevGlobalGpuThreadId = Builder.CreateSub (
631- GlobalGpuThreadId,
632- llvm::ConstantInt::get (Int32Ty, 1 )); // GlobalTID - 1
633- Address ScanStoragePrevValGEP = Address (
634- Builder.CreateGEP (RedVarType, DScanStorage, PrevGlobalGpuThreadId),
635- RedVarType,
636- getContext ().getTypeAlignInChars (
637- XteamVD->getType ())); // Storage[GlobalTID - 1]
638- Builder.CreateStore (Builder.CreateLoad (ScanStoragePrevValGEP),
639- RVI.RedVarAddr ); // RedVar = Storage[GlobalTID - 1]
640-
641- llvm::Value *IsAfterFirstTeam = Builder.CreateICmpUGE (
642- WorkGroupId, llvm::ConstantInt::get (Int32Ty, 1 )); // TeamID >= 1
643- llvm::BasicBlock *IsAfterFirstTeamThenBlock =
644- createBasicBlock (" omp.is.after.first.team.then" );
645- Builder.CreateCondBr (IsAfterFirstTeam, IsAfterFirstTeamThenBlock,
646- ExclusiveScanEndBlock);
647- EmitBlock (IsAfterFirstTeamThenBlock);
648- llvm::Value *IsNotFirstThreadInTeam = Builder.CreateICmpUGE (
649- GpuThreadId, llvm::ConstantInt::get (Int32Ty, 1 )); // LocalTID >= 1
650- llvm::BasicBlock *IsNotFirstThreadInTeamThenBlock =
651- createBasicBlock (" omp.is.not.first.thread.in.team.then" );
652- llvm::BasicBlock *IsNotFirstThreadInTeamElseBlock =
653- createBasicBlock (" omp.is.not.first.thread.in.team.else" );
654- Builder.CreateCondBr (IsNotFirstThreadInTeam,
655- IsNotFirstThreadInTeamThenBlock,
656- IsNotFirstThreadInTeamElseBlock);
657- EmitBlock (IsNotFirstThreadInTeamThenBlock);
658- Address PrevTeamValGEP =
659- Address (Builder.CreateGEP (
660- RedVarType, DTeamVals,
661- Builder.CreateSub (WorkGroupId,
662- llvm::ConstantInt::get (Int32Ty, 1 ))),
663- RedVarType,
664- getContext ().getTypeAlignInChars (
665- XteamVD->getType ())); // TeamVals[TeamID - 1]
666- Builder.CreateStore (Builder.CreateAdd (Builder.CreateLoad (RVI.RedVarAddr ),
667- Builder.CreateLoad (PrevTeamValGEP)),
668- RVI.RedVarAddr ); // RedVar += TeamVals[TeamID - 1]
669- EmitBranch (ExclusiveScanEndBlock);
670- EmitBlock (IsNotFirstThreadInTeamElseBlock);
671- llvm::Value *IsAfterSecondTeam = Builder.CreateICmpUGE (
672- WorkGroupId, llvm::ConstantInt::get (Int32Ty, 2 )); // TeamID >= 2
673- llvm::BasicBlock *IsAfterSecondTeamThenBlock =
674- createBasicBlock (" omp.is.after.second.team.then" );
675- Builder.CreateCondBr (IsAfterSecondTeam, IsAfterSecondTeamThenBlock,
676- ExclusiveScanEndBlock);
677- EmitBlock (IsAfterSecondTeamThenBlock);
678- Address PrevPrevTeamValGEP =
679- Address (Builder.CreateGEP (
680- RedVarType, DTeamVals,
681- Builder.CreateSub (WorkGroupId,
682- llvm::ConstantInt::get (Int32Ty, 2 ))),
683- RedVarType,
684- getContext ().getTypeAlignInChars (
685- XteamVD->getType ())); // TeamVals[TeamID - 2]
686- Builder.CreateStore (
687- Builder.CreateAdd (Builder.CreateLoad (RVI.RedVarAddr ),
688- Builder.CreateLoad (PrevPrevTeamValGEP)),
689- RVI.RedVarAddr ); // RedVar += TeamVals[TeamID - 2]
690- EmitBranch (ExclusiveScanEndBlock);
691- EmitBlock (ExclusiveScanEndBlock);
692- }
567+ EmitXteamScanPhaseTwo (
568+ CapturedForStmt, /* SegmentSize=*/ Builder.getInt32 (1 ), *Args,
569+ CGM.getXteamRedBlockSize (D),
570+ CGM.OMPPresentScanDirective ->hasClausesOfKind <OMPInclusiveClause>());
571+
572+ // Emit: RedVar = Storage[Offset + GlobalTID]
573+ // The offset is TotalNumThreads, so the access indexes into the second half
574+ // of the Storage[] data structure.
575+ llvm::Value *StorageOffset =
576+ Builder.CreateAdd (GlobalGpuThreadId, TotalNumThreads);
577+ Address ScanStorageValGEP = Address (
578+ Builder.CreateGEP (RedVarType, DScanStorage, StorageOffset), RedVarType,
579+ getContext ().getTypeAlignInChars (
580+ XteamVD->getType ())); // Storage[Offset + GlobalTID]
581+ Builder.CreateStore (Builder.CreateLoad (ScanStorageValGEP), RVI.RedVarAddr );
693582 }
694583
695584 // After the 'scanned' results are put in the respective private copies, the
@@ -949,8 +838,17 @@ void CodeGenFunction::EmitXteamScanPhaseTwo(const ForStmt *FStmt,
949838 Address XteamRedSumArg2 = GetAddrOfLocalVar (Args[RVI.ArgPos + 2 ]);
950839 llvm::Value *DScanStorage = Builder.CreateLoad (XteamRedSumArg2);
951840
952- Address XteamRedSumArg3 = GetAddrOfLocalVar (Args[RVI.ArgPos + 3 ]);
953- llvm::Value *DSegmentVals = Builder.CreateLoad (XteamRedSumArg3);
841+ llvm::Value *DSegmentVals = nullptr ;
842+ if (CGM.isXteamSegmentedScanKernel ()) {
843+ Address XteamRedSumArg3 = GetAddrOfLocalVar (Args[RVI.ArgPos + 3 ]);
844+ DSegmentVals = Builder.CreateLoad (XteamRedSumArg3);
845+ } else {
846+ // For No-Loop Scan, SegmentVals[] is not required and is therefore never
847+ // created. To reuse the same kmpc_xteams_phase2* API for Phase 2 of the
848+ // scan, the pointer to Storage[] is passed as a placeholder for the
849+ // SegmentVals[] argument.
850+ DSegmentVals = DScanStorage;
851+ }
954852
955853 const Expr *OrigRedVarExpr = RVI.RedVarExpr ;
956854 const DeclRefExpr *DRE = cast<DeclRefExpr>(OrigRedVarExpr);
0 commit comments