Skip to content

Commit c89f950

Browse files
committed
Do chase simultaneously in all devices
1 parent 19f3054 commit c89f950

File tree

2 files changed

+31
-5
lines changed

2 files changed

+31
-5
lines changed

cscs-checks/microbenchmarks/gpu/pointer_chase/pointer_chase.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,8 @@ def __init__(self):
344344
'average_latency': (760, None, 0.1, 'clock cycles')
345345
},
346346
'ault:amdvega': {
347-
'average_latency': (3550, None, 0.1, 'clock cycles')
347+
'average_latency': (
348+
3550, None, 0.1, 'clock cycles'
349+
)
348350
},
349351
}

cscs-checks/microbenchmarks/gpu/pointer_chase/src/pointer_chase.cu

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
#include <memory>
88
#include <algorithm>
99
#include <queue>
10+
#include <thread>
11+
#include <mutex>
12+
#include <vector>
13+
#include <functional>
1014

1115
/*
1216
~~ GPU Linked list pointer chase algorithm ~~
@@ -103,20 +107,40 @@ uint64_t general_pointer_chase(int local_device, int remote_device, int init_mod
103107
return l.timer;
104108
}
105109

110+
std::mutex mtx;
111+
template < class L >
112+
void loc_ptr_ch(int gpu_id, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps, char * nid)
113+
{
114+
/*
115+
* Low-level thread-safe local pointer chase function.
116+
*/
117+
uint64_t total_cycles = general_pointer_chase< L >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps);
118+
119+
// Print the timings of the pointer chase
120+
{
121+
std::lock_guard<std::mutex> lg(mtx);
122+
printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, total_cycles/num_jumps);
123+
}
124+
}
106125

107126
template < class List >
108127
void local_pointer_chase(int num_devices, int init_mode, size_t num_nodes, size_t stride, size_t num_jumps, char * nid)
109128
{
110129
/*
111130
* Specialised pointer chase on a single device.
112131
*/
132+
std::vector<std::thread> threads;
113133
for (int gpu_id = 0; gpu_id < num_devices; gpu_id++)
114134
{
115-
uint64_t total_cycles = general_pointer_chase< List >(gpu_id, gpu_id, init_mode, num_nodes, stride, num_jumps);
116-
117-
// Print the timings of the pointer chase
118-
printf("[%s] On device %d, the chase took on average %d cycles per node jump.\n", nid, gpu_id, total_cycles/num_jumps);
135+
threads.push_back(std::thread(loc_ptr_ch<List>,
136+
gpu_id, init_mode,
137+
num_nodes, stride, num_jumps, nid
138+
)
139+
);
119140
}
141+
142+
// Join all threads
143+
std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
120144
}
121145

122146

0 commit comments

Comments
 (0)