Skip to content

Commit 41c796a

Browse files
committed
[TSAR, Parallel, Shared, DVMH] Collapse parallel loops for the GPU target.
1 parent 800a40c commit 41c796a

File tree

4 files changed

+50
-17
lines changed

4 files changed

+50
-17
lines changed

lib/Transform/Clang/DVMHSMAutoPar.cpp

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "tsar/Analysis/Clang/ASTDependenceAnalysis.h"
2929
#include "tsar/Analysis/Clang/CanonicalLoop.h"
3030
#include "tsar/Analysis/Clang/LoopMatcher.h"
31+
#include "tsar/Analysis/Clang/PerfectLoop.h"
3132
#include "tsar/Analysis/Passes.h"
3233
#include "tsar/Analysis/Parallel/Passes.h"
3334
#include "tsar/Analysis/Parallel/ParallelLoop.h"
@@ -53,7 +54,7 @@ class ClangDVMHSMParallelization : public ClangSMParallelization {
5354
initializeClangDVMHSMParallelizationPass(*PassRegistry::getPassRegistry());
5455
}
5556
private:
56-
bool exploitParallelism(const Loop &IR, const clang::ForStmt &AST,
57+
bool exploitParallelism(const DFLoop &IR, const clang::ForStmt &AST,
5758
const ClangSMParallelProvider &Provider,
5859
tsar::ClangDependenceAnalyzer &ASTDepInfo,
5960
TransformationContext &TfmCtx) override;
@@ -108,28 +109,39 @@ void addVarList(const ClangDependenceAnalyzer::ReductionVarListT &VarInfoList,
108109
ParallelFor.push_back(')');
109110
}
110111
}
112+
113+
/// Count how many loops, starting at \p DFL and descending inward, belong to
/// one nest that can be named in a single directive.
///
/// The count starts at 1 (for \p DFL itself). While the current loop is listed
/// in \p PerfectInfo and has at least one child region, the walk steps into the
/// first child region and increments the count. The walk stops early, without
/// counting the child, when that region is not a DFLoop, or when the child loop
/// is absent from \p CanonicalLoops or is not marked canonical.
///
/// \param DFL outermost loop of the nest.
/// \param PerfectInfo set queried with `count()` for each visited loop;
/// presumably the set of perfectly nested loops (TODO confirm against
/// PerfectLoop.h).
/// \param CanonicalLoops set searched with `find_as()` for each inner loop.
/// \return the number of loops counted; always at least 1.
unsigned getPerfectNestSize(const DFLoop &DFL,
114+
const PerfectLoopInfo &PerfectInfo,
115+
const CanonicalLoopSet &CanonicalLoops) {
116+
auto *CurrDFL = &DFL;
117+
// The outermost loop is always counted, even if it is not in PerfectInfo.
unsigned PerfectSize = 1;
118+
// Descend one level per iteration; the size grows only after the inner loop
// has passed both checks in the body.
for (; PerfectInfo.count(CurrDFL) && CurrDFL->getNumRegions() > 0;
119+
++PerfectSize) {
120+
// Only the first child region is inspected; a non-loop region ends the nest.
CurrDFL = dyn_cast<DFLoop>(*CurrDFL->region_begin());
121+
if (!CurrDFL)
122+
return PerfectSize;
123+
// The lookup is performed with a non-const pointer, hence the const_cast.
auto CanonicalItr = CanonicalLoops.find_as(const_cast<DFLoop *>(CurrDFL));
124+
// An inner loop that is unknown or not canonical is not added to the nest.
if (CanonicalItr == CanonicalLoops.end() || !(**CanonicalItr).isCanonical())
125+
return PerfectSize;
126+
}
127+
return PerfectSize;
128+
}
111129
} // namespace
112130

113131
bool ClangDVMHSMParallelization::exploitParallelism(
114-
const Loop &IR, const clang::ForStmt &AST,
132+
const DFLoop &IR, const clang::ForStmt &AST,
115133
const ClangSMParallelProvider &Provider,
116134
tsar::ClangDependenceAnalyzer &ASTRegionAnalysis,
117135
TransformationContext &TfmCtx) {
118136
auto &ASTDepInfo = ASTRegionAnalysis.getDependenceInfo();
119137
if (!ASTDepInfo.get<trait::FirstPrivate>().empty() ||
120138
!ASTDepInfo.get<trait::LastPrivate>().empty())
121139
return false;
122-
SmallString<128> ParallelFor("#pragma dvm parallel (1)");
123-
if (!ASTDepInfo.get<trait::Private>().empty()) {
124-
ParallelFor += " private";
125-
addVarList(ASTDepInfo.get<trait::Private>(), ParallelFor);
126-
}
127-
addVarList(ASTDepInfo.get<trait::Reduction>(), ParallelFor);
128-
ParallelFor += '\n';
129140
SmallString<128> DVMHRegion("#pragma dvm region");
130141
SmallString<128> DVMHActual, DVMHGetActual;
131142
auto &PI = Provider.get<ParallelLoopPass>().getParallelLoopInfo();
132-
if (!PI[&IR].isHostOnly() && ASTRegionAnalysis.evaluateDefUse()) {
143+
bool HostOnly = false;
144+
if (!PI[IR.getLoop()].isHostOnly() && ASTRegionAnalysis.evaluateDefUse()) {
133145
if (!ASTDepInfo.get<trait::ReadOccurred>().empty()) {
134146
DVMHActual += "#pragma dvm actual";
135147
addVarList(ASTDepInfo.get<trait::ReadOccurred>(), DVMHActual);
@@ -150,7 +162,23 @@ bool ClangDVMHSMParallelization::exploitParallelism(
150162
}
151163
} else {
152164
DVMHRegion += " targets(HOST)";
165+
HostOnly = true;
153166
}
167+
auto &PerfectInfo = Provider.get<ClangPerfectLoopPass>().getPerfectLoopInfo();
168+
auto &CanonicalInfo = Provider.get<CanonicalLoopPass>().getCanonicalLoopInfo();
169+
SmallString<128> ParallelFor("#pragma dvm parallel (");
170+
if (HostOnly)
171+
ParallelFor += "1";
172+
else
173+
Twine(getPerfectNestSize(IR, PerfectInfo, CanonicalInfo))
174+
.toStringRef(ParallelFor);
175+
ParallelFor += ")";
176+
if (!ASTDepInfo.get<trait::Private>().empty()) {
177+
ParallelFor += " private";
178+
addVarList(ASTDepInfo.get<trait::Private>(), ParallelFor);
179+
}
180+
addVarList(ASTDepInfo.get<trait::Reduction>(), ParallelFor);
181+
ParallelFor += '\n';
154182
DVMHRegion += "\n{\n";
155183
// Add directives to the source code.
156184
auto &Rewriter = TfmCtx.getRewriter();

lib/Transform/Clang/OpenMPAutoPar.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class ClangOpenMPParallelization : public ClangSMParallelization {
4646
initializeClangOpenMPParallelizationPass(*PassRegistry::getPassRegistry());
4747
}
4848
private:
49-
bool exploitParallelism(const Loop &IR, const clang::ForStmt &AST,
49+
bool exploitParallelism(const DFLoop &IR, const clang::ForStmt &AST,
5050
const ClangSMParallelProvider &Provider,
5151
tsar::ClangDependenceAnalyzer &ASTDepInfo,
5252
TransformationContext &TfmCtx) override;
@@ -107,7 +107,7 @@ struct ClausePrinter {
107107

108108

109109
bool ClangOpenMPParallelization::exploitParallelism(
110-
const Loop &IR, const clang::ForStmt &AST,
110+
const DFLoop &IR, const clang::ForStmt &AST,
111111
const ClangSMParallelProvider &Provider,
112112
tsar::ClangDependenceAnalyzer &ASTDepInfo,
113113
TransformationContext &TfmCtx) {

lib/Transform/Clang/SharedMemoryAutoPar.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "tsar/Analysis/Clang/DIMemoryMatcher.h"
3131
#include "tsar/Analysis/Clang/LoopMatcher.h"
3232
#include "tsar/Analysis/Clang/MemoryMatcher.h"
33+
#include "tsar/Analysis/Clang/PerfectLoop.h"
3334
#include "tsar/Analysis/Clang/RegionDirectiveInfo.h"
3435
#include "tsar/Analysis/DFRegionInfo.h"
3536
#include "tsar/Analysis/Memory/ClonedDIMemoryMatcher.h"
@@ -99,7 +100,8 @@ bool ClangSMParallelization::findParallelLoops(
99100
if (LMatchItr != LM.end())
100101
toDiag(Diags, LMatchItr->get<AST>()->getLocStart(),
101102
clang::diag::remark_parallel_loop);
102-
auto CanonicalItr = CL.find_as(RI.getRegionFor(&L));
103+
auto DFL = cast<DFLoop>(RI.getRegionFor(&L));
104+
auto CanonicalItr = CL.find_as(DFL);
103105
if (CanonicalItr == CL.end() || !(**CanonicalItr).isCanonical()) {
104106
toDiag(Diags, LMatchItr->get<AST>()->getLocStart(),
105107
clang::diag::warn_parallel_not_canonical);
@@ -129,7 +131,7 @@ bool ClangSMParallelization::findParallelLoops(
129131
*mGlobalOpts, Diags, DIAT, DIDepSet, *DIMemoryMatcher, ASTToClient);
130132
if (!RegionAnalysis.evaluateDependency())
131133
return findParallelLoops(L.begin(), L.end(), F, Provider);
132-
if (!exploitParallelism(L, *ForStmt, Provider, RegionAnalysis, *mTfmCtx))
134+
if (!exploitParallelism(*DFL, *ForStmt, Provider, RegionAnalysis, *mTfmCtx))
133135
return findParallelLoops(L.begin(), L.end(), F, Provider);
134136
for (auto *BB : L.blocks())
135137
for (auto &I : *BB) {

lib/Transform/Clang/SharedMemoryAutoPar.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class ForStmt;
4141
}
4242

4343
namespace tsar {
44+
class DFLoop;
4445
class AnalysisSocketInfo;
4546
class ClangDependenceAnalyzer;
4647
class DIMemoryEnvironment;
@@ -56,6 +57,7 @@ class Function;
5657
class Loop;
5758

5859
class CanonicalLoopPass;
60+
class ClangPerfectLoopPass;
5961
class ClangDIMemoryMatcherPass;
6062
class DFRegionInfoPass;
6163
class LoopMatcherPass;
@@ -66,7 +68,8 @@ class ParallelLoopPass;
6668
using ClangSMParallelProvider =
6769
FunctionPassAAProvider<AnalysisSocketImmutableWrapper, LoopInfoWrapperPass,
6870
ParallelLoopPass, CanonicalLoopPass, LoopMatcherPass,
69-
DFRegionInfoPass, ClangDIMemoryMatcherPass>;
71+
DFRegionInfoPass, ClangDIMemoryMatcherPass,
72+
ClangPerfectLoopPass>;
7073

7174
/// This pass tries to insert directives into source code to obtain
7275
/// a parallel program for shared memory.
@@ -112,8 +115,8 @@ class ClangSMParallelization: public ModulePass, private bcl::Uncopyable{
112115
/// successfully checked.
113116
/// \return true if a specified loop could be parallelized and inner loops
114117
/// should not be processed.
115-
virtual bool exploitParallelism(const Loop &IR, const clang::ForStmt &AST,
116-
const ClangSMParallelProvider &Provider,
118+
virtual bool exploitParallelism(const tsar::DFLoop &IR,
119+
const clang::ForStmt &AST, const ClangSMParallelProvider &Provider,
117120
tsar::ClangDependenceAnalyzer &ASTDepInfo,
118121
tsar::TransformationContext &TfmCtx) = 0;
119122

0 commit comments

Comments
 (0)