Skip to content

Commit fd850c1

Browse files
committed
[TSAR, Parallel, DVMH, SM] Add 'tie' clause if possible.
1 parent 22a05e2 commit fd850c1

25 files changed

+639
-6
lines changed

lib/Transform/Clang/DVMHSMAutoPar.cpp

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "tsar/Analysis/Clang/CanonicalLoop.h"
3030
#include "tsar/Analysis/Clang/LoopMatcher.h"
3131
#include "tsar/Analysis/Clang/PerfectLoop.h"
32+
#include "tsar/Analysis/Memory/DIArrayAccess.h"
3233
#include "tsar/Analysis/Passes.h"
3334
#include "tsar/Analysis/Parallel/Passes.h"
3435
#include "tsar/Analysis/Parallel/ParallelLoop.h"
@@ -53,6 +54,7 @@ class ClangDVMHSMParallelization : public ClangSMParallelization {
5354
/// Default constructor: passes ID to the ClangSMParallelization base and runs
/// initializeClangDVMHSMParallelizationPass() on the registry returned by
/// PassRegistry::getPassRegistry().
ClangDVMHSMParallelization() : ClangSMParallelization(ID) {
5455
initializeClangDVMHSMParallelizationPass(*PassRegistry::getPassRegistry());
5556
}
57+
5658
private:
5759
bool exploitParallelism(const DFLoop &IR, const clang::ForStmt &AST,
5860
const ClangSMParallelProvider &Provider,
@@ -113,7 +115,8 @@ void addVarList(const ClangDependenceAnalyzer::ReductionVarListT &VarInfoList,
113115
void addOnClause(const DFLoop &DFL, const PerfectLoopInfo &PerfectInfo,
114116
const CanonicalLoopSet &CanonicalLoops,
115117
const MemoryMatchInfo &MemoryMatcher,
116-
SmallVectorImpl<char> &Out) {
118+
SmallVectorImpl<std::pair<ObjectID, StringRef>> &ParallelNest,
119+
SmallVectorImpl<char> &Out, unsigned NestSize = 0) {
117120
Out.append({' ', 'o', 'n', '('});
118121
auto *CurrDFL = &DFL;
119122
auto CanonicalItr = CanonicalLoops.find_as(const_cast<DFLoop *>(CurrDFL));
@@ -129,10 +132,19 @@ void addOnClause(const DFLoop &DFL, const PerfectLoopInfo &PerfectInfo,
129132
Out.append(MatchItr->get<AST>()->getName().begin(),
130133
MatchItr->get<AST>()->getName().end());
131134
Out.append({']'});
135+
auto LoopID = CurrDFL->getLoop()->getLoopID();
136+
assert(LoopID &&
137+
"Loop ID must be available for the outermost parallel loop!");
138+
ParallelNest.emplace_back(LoopID, MatchItr->get<AST>()->getName());
132139
for (; PerfectInfo.count(CurrDFL) && CurrDFL->getNumRegions() > 0;) {
140+
if (NestSize > 0 && NestSize <= ParallelNest.size())
141+
break;
133142
CurrDFL = dyn_cast<DFLoop>(*CurrDFL->region_begin());
134143
if (!CurrDFL)
135144
break;
145+
auto LoopID = CurrDFL->getLoop()->getLoopID();
146+
if (!LoopID)
147+
break;
136148
auto CanonicalItr = CanonicalLoops.find_as(const_cast<DFLoop *>(CurrDFL));
137149
if (CanonicalItr == CanonicalLoops.end() || !(**CanonicalItr).isCanonical())
138150
break;
@@ -145,6 +157,7 @@ void addOnClause(const DFLoop &DFL, const PerfectLoopInfo &PerfectInfo,
145157
Out.append(MatchItr->get<AST>()->getName().begin(),
146158
MatchItr->get<AST>()->getName().end());
147159
Out.append({']'});
160+
ParallelNest.emplace_back(LoopID, MatchItr->get<AST>()->getName());
148161
}
149162
Out.append({')'});
150163
}
@@ -189,12 +202,63 @@ bool ClangDVMHSMParallelization::exploitParallelism(
189202
auto &PerfectInfo = Provider.get<ClangPerfectLoopPass>().getPerfectLoopInfo();
190203
auto &CanonicalInfo = Provider.get<CanonicalLoopPass>().getCanonicalLoopInfo();
191204
auto &MemoryMatcher = Provider.get<MemoryMatcherImmutableWrapper>().get();
192-
193205
SmallString<128> ParallelFor("#pragma dvm parallel");
194-
if (HostOnly)
195-
ParallelFor += "(1)";
196-
else
197-
addOnClause(IR, PerfectInfo, CanonicalInfo, MemoryMatcher, ParallelFor);
206+
SmallVector<std::pair<ObjectID, StringRef>, 4> ParallelNest;
207+
addOnClause(IR, PerfectInfo, CanonicalInfo, MemoryMatcher, ParallelNest,
208+
ParallelFor, HostOnly ? 1 : 0);
209+
auto *AccessInfo = getAnalysis<DIArrayAccessWrapper>().getAccessInfo();
210+
if (AccessInfo) {
211+
bool EmptyTie = true;
212+
// Comparator for the map declared right below: keys are ordered by the name
// of the variable attached to each DIEstimateMemory, not by pointer value.
// NOTE(review): two distinct DIEstimateMemory objects whose variables have
// the same name compare equivalent here, so they would share a single map
// entry -- confirm that this is intended.
auto arraycmp = [](const DIEstimateMemory *LHS,
213+
const DIEstimateMemory *RHS) {
214+
return LHS->getVariable()->getName() < RHS->getVariable()->getName();
215+
};
216+
std::map<DIEstimateMemory *, SmallVector<std::string, 5>,
217+
decltype(arraycmp)>
218+
Mapping(arraycmp);
219+
for (auto &Access :
220+
AccessInfo->scope_accesses(ParallelNest.front().first)) {
221+
if (!isa<DIEstimateMemory>(Access.getArray()))
222+
continue;
223+
auto MappingItr =
224+
Mapping.emplace(std::piecewise_construct,
225+
std::forward_as_tuple(cast<DIEstimateMemory>(Access.getArray())),
226+
std::forward_as_tuple(Access.size(), "*")).first;
227+
auto StashSize{ ParallelFor.size() };
228+
for (auto *Subscript : Access) {
229+
if (!Subscript || MappingItr->second[Subscript->getDimension()] != "*")
230+
continue;
231+
if (auto *Affine = dyn_cast<DIAffineSubscript>(Subscript)) {
232+
for (unsigned I = 0, EI = Affine->getNumberOfMonoms(); I < EI; ++I) {
233+
if (Affine->getMonom(I).Value.isNullValue())
234+
continue;
235+
auto LoopItr = find_if(ParallelNest, [Affine, I](auto &Loop) {
236+
return Loop.first == Affine->getMonom(I).Column;
237+
});
238+
if (LoopItr != ParallelNest.end()) {
239+
MappingItr->second[Affine->getDimension()] =
240+
((Affine->getMonom(I).Value.isNegative() ? "-" : "") +
241+
LoopItr->second)
242+
.str();
243+
EmptyTie = false;
244+
}
245+
}
246+
}
247+
}
248+
}
249+
if (!EmptyTie) {
250+
ParallelFor += " tie(";
251+
for (auto &Map : Mapping) {
252+
if (all_of(Map.second, [](StringRef S) { return S == "*"; }))
253+
continue;
254+
ParallelFor +=
255+
cast<DIEstimateMemory>(Map.first)->getVariable()->getName();
256+
ParallelFor += "[" + join(Map.second, "][") + "]";
257+
ParallelFor += ",";
258+
}
259+
ParallelFor.back() = ')';
260+
}
261+
}
198262
if (!ASTDepInfo.get<trait::Private>().empty()) {
199263
ParallelFor += " private";
200264
addVarList(ASTDepInfo.get<trait::Private>(), ParallelFor);

lib/Transform/Clang/SharedMemoryAutoPar.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ void ClangSMParallelization::getAnalysisUsage(AnalysisUsage &AU) const {
284284
AU.addRequired<GlobalsAAWrapperPass>();
285285
AU.addRequired<ClangRegionCollector>();
286286
AU.addRequired<DIMemoryEnvironmentWrapper>();
287+
AU.addRequired<DIArrayAccessWrapper>();
287288
AU.setPreservesAll();
288289
}
289290

lib/Transform/Clang/SharedMemoryAutoPar.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "tsar/ADT/DenseMapTraits.h"
3030
#include "tsar/Analysis/AnalysisSocket.h"
3131
#include "tsar/Analysis/Clang/MemoryMatcher.h"
32+
#include "tsar/Analysis/Memory/DIArrayAccess.h"
3233
#include "tsar/Support/PassAAProvider.h"
3334
#include "tsar/Support/PassGroupRegistry.h"
3435
#include <bcl/tagged.h>
@@ -187,6 +188,7 @@ class ClangSMParallelizationInfo final : public tsar::PassGroupInfo {
187188
INITIALIZE_PASS_DEPENDENCY(CanonicalLoopPass) \
188189
INITIALIZE_PASS_DEPENDENCY(ClangRegionCollector) \
189190
INITIALIZE_PASS_DEPENDENCY(DIMemoryEnvironmentWrapper) \
191+
INITIALIZE_PASS_DEPENDENCY(DIArrayAccessWrapper) \
190192
INITIALIZE_PASS_IN_GROUP_END(passName, arg, name, false, false, \
191193
TransformationQueryManager::getPassRegistry())
192194
#endif//TSAR_CLANG_SHARED_PARALLEL_H

test/transform/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ add_subdirectory(inline)
33
add_subdirectory(propagate)
44
add_subdirectory(replace)
55
add_subdirectory(openmp)
6+
add_subdirectory(dvmh_sm)

test/transform/dvmh_sm/Adi.func.c

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
//===--- Adi.c ---- Alternating Direction Implicit ---------------*- C -*-===//
2+
//
3+
// This file implements the Alternating Direction Implicit(ADI) method which is
4+
// an iterative method used to solve partial differential equations.
5+
//
6+
//===----------------------------------------------------------------------===//
7+
8+
#include <math.h>
9+
#include <stdio.h>
10+
#include <stdlib.h>
11+
12+
#define MAX(A, b) ((A) > (b) ? (A) : (b))
13+
14+
#define NX 384
15+
#define NY 384
16+
#define NZ 384
17+
18+
void init(double (*A)[NY][NZ]);
19+
double iter(double (*A)[NY][NZ]);
20+
21+
int main(int Argc, char *Argv[]) {
22+
double MaxEps, Eps;
23+
double(*A)[NY][NZ];
24+
int It, ItMax, I, J, K;
25+
MaxEps = 0.01;
26+
ItMax = 100;
27+
A = (double(*)[NY][NZ])malloc(NX * NY * NZ * sizeof(double));
28+
init(A);
29+
for (It = 1; It <= ItMax; It++) {
30+
Eps = iter(A);
31+
printf(" IT = %4i EPS = %14.7E\n", It, Eps);
32+
if (Eps < MaxEps)
33+
break;
34+
}
35+
free(A);
36+
printf(" ADI Benchmark Completed.\n");
37+
printf(" Size = %4d x %4d x %4d\n", NX, NY, NZ);
38+
printf(" Iterations = %12d\n", ItMax);
39+
printf(" Operation type = double precision\n");
40+
printf(" Verification = %12s\n",
41+
(fabs(Eps - 0.07249074) < 1e-6 ? "SUCCESSFUL" : "UNSUCCESSFUL"));
42+
printf(" END OF ADI Benchmark\n");
43+
return 0;
44+
}
45+
46+
void init(double (*A)[NY][NZ]) {
47+
int I, J, K;
48+
for (I = 0; I < NX; I++)
49+
for (J = 0; J < NY; J++)
50+
for (K = 0; K < NZ; K++)
51+
if (K == 0 || K == NZ - 1 || J == 0 || J == NY - 1 || I == 0 ||
52+
I == NX - 1)
53+
A[I][J][K] =
54+
10.0 * I / (NX - 1) + 10.0 * J / (NY - 1) + 10.0 * K / (NZ - 1);
55+
else
56+
A[I][J][K] = 0;
57+
}
58+
59+
double iter(double(*A)[NY][NZ]) {
60+
int I, J, K;
61+
double Eps = 0;
62+
for (I = 1; I < NX - 1; I++)
63+
for (J = 1; J < NY - 1; J++)
64+
for (K = 1; K < NZ - 1; K++)
65+
A[I][J][K] = (A[I - 1][J][K] + A[I + 1][J][K]) / 2;
66+
for (I = 1; I < NX - 1; I++)
67+
for (J = 1; J < NY - 1; J++)
68+
for (K = 1; K < NZ - 1; K++)
69+
A[I][J][K] = (A[I][J - 1][K] + A[I][J + 1][K]) / 2;
70+
for (I = 1; I < NX - 1; I++)
71+
for (J = 1; J < NY - 1; J++)
72+
for (K = 1; K < NZ - 1; K++) {
73+
double Tmp1 = (A[I][J][K - 1] + A[I][J][K + 1]) / 2;
74+
double Tmp2 = fabs(A[I][J][K] - Tmp1);
75+
Eps = MAX(Eps, Tmp2);
76+
A[I][J][K] = Tmp1;
77+
}
78+
return Eps;
79+
}
80+
//CHECK: Adi.func.c:70:3: remark: parallel execution of loop is possible
81+
//CHECK: for (I = 1; I < NX - 1; I++)
82+
//CHECK: ^
83+
//CHECK: Adi.func.c:70:3: warning: unable to create parallel directive
84+
//CHECK: Adi.func.c:59:21: note: unable to localize inout variable
85+
//CHECK: double iter(double(*A)[NY][NZ]) {
86+
//CHECK: ^
87+
//CHECK: Adi.func.c:66:3: remark: parallel execution of loop is possible
88+
//CHECK: for (I = 1; I < NX - 1; I++)
89+
//CHECK: ^
90+
//CHECK: Adi.func.c:66:3: warning: unable to create parallel directive
91+
//CHECK: Adi.func.c:59:21: note: unable to localize inout variable
92+
//CHECK: double iter(double(*A)[NY][NZ]) {
93+
//CHECK: ^
94+
//CHECK: Adi.func.c:62:3: remark: parallel execution of loop is possible
95+
//CHECK: for (I = 1; I < NX - 1; I++)
96+
//CHECK: ^
97+
//CHECK: Adi.func.c:62:3: warning: unable to create parallel directive
98+
//CHECK: Adi.func.c:59:21: note: can not analyze variable 'A'
99+
//CHECK: double iter(double(*A)[NY][NZ]) {
100+
//CHECK: ^
101+
//CHECK: Adi.func.c:63:5: remark: parallel execution of loop is possible
102+
//CHECK: for (J = 1; J < NY - 1; J++)
103+
//CHECK: ^
104+
//CHECK: Adi.func.c:63:5: warning: unable to create parallel directive
105+
//CHECK: Adi.func.c:59:21: note: unable to localize inout variable
106+
//CHECK: double iter(double(*A)[NY][NZ]) {
107+
//CHECK: ^
108+
//CHECK: Adi.func.c:48:3: remark: parallel execution of loop is possible
109+
//CHECK: for (I = 0; I < NX; I++)
110+
//CHECK: ^
111+
//CHECK: Adi.func.c:48:3: warning: unable to create parallel directive
112+
//CHECK: Adi.func.c:46:20: note: unable to localize inout variable
113+
//CHECK: void init(double (*A)[NY][NZ]) {
114+
//CHECK: ^
115+
//CHECK: 5 warnings generated.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
name = Adi.func
2+
plugin = TsarPlugin
3+
4+
suffix = dvmhsm
5+
sample = $name.c
6+
sample_diff = $name.$suffix.c
7+
options = -clang-dvmh-sm-parallel -output-suffix=$suffix
8+
run = "$tsar $sample $options"
9+
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//===--- Adi.c ---- Alternating Direction Implicit ---------------*- C -*-===//
2+
//
3+
// This file implements the Alternating Direction Implicit(ADI) method which is
4+
// an iterative method used to solve partial differential equations.
5+
//
6+
//===----------------------------------------------------------------------===//
7+
8+
#include <math.h>
9+
#include <stdio.h>
10+
#include <stdlib.h>
11+
12+
#define MAX(A, b) ((A) > (b) ? (A) : (b))
13+
14+
#define NX 384
15+
#define NY 384
16+
#define NZ 384
17+
18+
void init(double (*A)[NY][NZ]);
19+
double iter(double (*A)[NY][NZ]);
20+
21+
int main(int Argc, char *Argv[]) {
22+
double MaxEps, Eps;
23+
double(*A)[NY][NZ];
24+
int It, ItMax, I, J, K;
25+
MaxEps = 0.01;
26+
ItMax = 100;
27+
A = (double(*)[NY][NZ])malloc(NX * NY * NZ * sizeof(double));
28+
init(A);
29+
for (It = 1; It <= ItMax; It++) {
30+
Eps = iter(A);
31+
printf(" IT = %4i EPS = %14.7E\n", It, Eps);
32+
if (Eps < MaxEps)
33+
break;
34+
}
35+
free(A);
36+
printf(" ADI Benchmark Completed.\n");
37+
printf(" Size = %4d x %4d x %4d\n", NX, NY, NZ);
38+
printf(" Iterations = %12d\n", ItMax);
39+
printf(" Operation type = double precision\n");
40+
printf(" Verification = %12s\n",
41+
(fabs(Eps - 0.07249074) < 1e-6 ? "SUCCESSFUL" : "UNSUCCESSFUL"));
42+
printf(" END OF ADI Benchmark\n");
43+
return 0;
44+
}
45+
46+
void init(double (*A)[NY][NZ]) {
47+
int I, J, K;
48+
#pragma dvm region targets(HOST)
49+
{
50+
#pragma dvm parallel on([I]) tie(A[I][*][*]) private(J, K)
51+
for (I = 0; I < NX; I++)
52+
for (J = 0; J < NY; J++)
53+
for (K = 0; K < NZ; K++)
54+
if (K == 0 || K == NZ - 1 || J == 0 || J == NY - 1 || I == 0 ||
55+
I == NX - 1)
56+
A[I][J][K] =
57+
10.0 * I / (NX - 1) + 10.0 * J / (NY - 1) + 10.0 * K / (NZ - 1);
58+
else
59+
A[I][J][K] = 0;
60+
}
61+
}
62+
63+
double iter(double (*A)[NY][NZ]) {
64+
int I, J, K;
65+
double Eps = 0;
66+
for (I = 1; I < NX - 1; I++)
67+
#pragma dvm region targets(HOST)
68+
{
69+
#pragma dvm parallel on([J]) tie(A[*][J][*]) private(K)
70+
for (J = 1; J < NY - 1; J++)
71+
for (K = 1; K < NZ - 1; K++)
72+
A[I][J][K] = (A[I - 1][J][K] + A[I + 1][J][K]) / 2;
73+
}
74+
#pragma dvm region targets(HOST)
75+
{
76+
#pragma dvm parallel on([I]) tie(A[I][*][*]) private(J, K)
77+
for (I = 1; I < NX - 1; I++)
78+
for (J = 1; J < NY - 1; J++)
79+
for (K = 1; K < NZ - 1; K++)
80+
A[I][J][K] = (A[I][J - 1][K] + A[I][J + 1][K]) / 2;
81+
}
82+
#pragma dvm region targets(HOST)
83+
{
84+
#pragma dvm parallel on([I]) tie(A[I][*][*]) private(J, K) reduction(max(Eps))
85+
for (I = 1; I < NX - 1; I++)
86+
for (J = 1; J < NY - 1; J++)
87+
for (K = 1; K < NZ - 1; K++) {
88+
double Tmp1 = (A[I][J][K - 1] + A[I][J][K + 1]) / 2;
89+
double Tmp2 = fabs(A[I][J][K] - Tmp1);
90+
Eps = MAX(Eps, Tmp2);
91+
A[I][J][K] = Tmp1;
92+
}
93+
}
94+
return Eps;
95+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
include(tsar-testing)
2+
tsar_test(TARGET ClangDVMHSMParallel PASSNAME "-clang-dvmh-sm-parallel")

0 commit comments

Comments
 (0)