
Commit 241a0f8

Author: Maximilian
Commit message: Final SW-CleanUp
1 parent a8c0f66 commit 241a0f8

File tree: examples

3 files changed (+36, -29 lines)

examples/09_perf_rdma/sw/src/server/main.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -106,8 +106,8 @@ int main(int argc, char *argv[]) {
     unsigned int curr_size = min_size;
     while(curr_size <= max_size) {
         coyote::rdmaSg sg = { .len = curr_size };
-        run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
-        run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
+        // run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
+        run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs + 10, operation);
         curr_size *= 2;
     }
 
```
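A note on the `n_runs + 10` above: the client-side change further down switches `coyote::cBench(n_runs, 0)` to `cBench(n_runs, 10)`. If the second constructor argument is a warm-up count (an assumption; the diff alone does not name it), the server's extra 10 iterations would keep both sides in lockstep while the client discards its first 10 runs. A minimal sketch of that warm-up-aware timing pattern, with illustrative names rather than the actual Coyote API:

```cpp
// Sketch only: a benchmark loop with discarded warm-up runs, mirroring the
// assumed cBench(n_runs, warmup) semantics. bench_fn is a placeholder.
#include <chrono>
#include <cstdio>
#include <functional>

double time_avg_ns(unsigned n_runs, unsigned warmup, const std::function<void()>& bench_fn) {
    for (unsigned i = 0; i < warmup; i++) bench_fn();   // warm-up, not timed
    auto start = std::chrono::high_resolution_clock::now();
    for (unsigned i = 0; i < n_runs; i++) bench_fn();   // measured runs
    auto stop = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::nano>(stop - start).count() / n_runs;
}

int main() {
    unsigned n_runs = 50;
    double avg = time_avg_ns(n_runs, 10, [] { /* one benchmark iteration */ });
    // A server replaying such a benchmark must answer n_runs + 10 requests in total.
    std::printf("avg: %.1f ns over %u measured runs\n", avg, n_runs);
}
```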

examples/11_rdma_scatter/sw/src/client/main.cpp

Lines changed: 6 additions & 11 deletions
```diff
@@ -28,7 +28,7 @@
 #include <cstdlib>
 
 // AMD GPU management & run-time libraries
-// #include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
 
 // External library for easier parsing of CLI arguments by the executable
 #include <boost/program_options.hpp>
@@ -89,11 +89,6 @@ double run_bench(
     bench.execute(bench_fn, prep_fn);
 
     // Functional correctness check
-    if (!operation) {
-        for (int i = 0; i < sg.len / sizeof(int); i++) {
-            assert(mem[i] == i);
-        }
-    }
 
     // For writes, divide by 2, since that is sent two ways (from client to server and then from server to client)
     // Reads are one way, so no need to scale
@@ -134,13 +129,13 @@ int main(int argc, char *argv[]) {
     if (!mem) { throw std::runtime_error("Could not allocate memory; exiting..."); }
 
     // GPU memory will be allocated on the GPU set using hipSetDevice(...)
-    // if (hipSetDevice(DEFAULT_GPU_ID)) { throw std::runtime_error("Couldn't select GPU!"); }
+    if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
 
     // Allocate four buffers for the scatter operation
-    int* vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int* vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int* vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int* vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
+    int* vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 0});
+    int* vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 1});
+    int* vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 2});
+    int* vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 3});
 
     // Print all the new buffer addresses
     std::cout << "Scatter buffer addresses:" << std::endl;
```
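A side note on the allocation change above: the four `getMem` calls differ only in the trailing field, which the 0..3 pattern suggests is the target GPU id (an assumption; the CoyoteAlloc field names are not visible in this diff). Under that assumption the setup collapses into a loop; this fragment reuses `coyote_thread` and `max_size` from the surrounding `main()`:

```cpp
// Hypothetical loop form of the four allocations above. Assumes the fourth
// aggregate field is the GPU device id, as the 0..3 pattern suggests.
constexpr int NUM_GPUS = 4;
int* scatter_buffers[NUM_GPUS];
for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
    // One buffer of max_size bytes per GPU, as in the unrolled version
    scatter_buffers[gpu] = (int *) coyote_thread.getMem(
        {coyote::CoyoteAllocType::GPU, max_size, false, gpu}
    );
    if (!scatter_buffers[gpu]) { throw std::runtime_error("Could not allocate scatter buffer!"); }
}
```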

examples/12_rdma_scatter_baseline/sw/src/client/main.cpp

Lines changed: 28 additions & 16 deletions
```diff
@@ -28,7 +28,7 @@
 #include <cstdlib>
 
 // AMD GPU management & run-time libraries
-// #include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
 
 // External library for easier parsing of CLI arguments by the executable
 #include <boost/program_options.hpp>
@@ -39,6 +39,7 @@
 #include "constants.hpp"
 
 constexpr bool const IS_CLIENT = true;
+const int NUM_GPUS = 4;
 
 // Registers, corresponding to the registers defined in the vFPGA
 enum class ScatterRegisters: uint32_t {
@@ -53,7 +54,7 @@ enum class ScatterRegisters: uint32_t {
 // the thread object which can lead to undefined behaviour and bugs.
 double run_bench(
     coyote::cThread &coyote_thread, coyote::rdmaSg &sg,
-    int *mem, uint transfers, uint n_runs, bool operation
+    int *mem, int* dest_buffers[], uint transfers, uint n_runs, bool operation
 ) {
     // When writing, the server asserts the written payload is correct (which the client sets)
     // When reading, the client asserts the read payload is correct (which the server sets)
@@ -80,18 +81,25 @@ double run_bench(
         }
 
         while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
+
+        // After having received all the data, scatter it to the four GPU buffers
+        for(int i = 0; i < sg.len / 8192; i++) {
+            // printf("Copying chunk %d/%d to GPU %d\n", i+1, sg.len/4096, i%4);
+            if (hipSetDevice(i%4)) { throw std::runtime_error("Couldn't select GPU!"); }
+            hipMemcpyAsync(dest_buffers[i % 4], &mem[i * 8192], 8192, hipMemcpyHostToDevice);
+        }
+
+        // Sync mem copy for all the GPUs
+        for(int i = 0; i < NUM_GPUS; i++) {
+            if (hipSetDevice(i)) { throw std::runtime_error("Couldn't select GPU!"); }
+            if (hipDeviceSynchronize() != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
+        }
     };
 
     // Execute benchmark
-    coyote::cBench bench(n_runs, 0);
+    coyote::cBench bench(n_runs, 10);
     bench.execute(bench_fn, prep_fn);
 
-    // Functional correctness check
-    if (!operation) {
-        for (int i = 0; i < sg.len / sizeof(int); i++) {
-            assert(mem[i] == i);
-        }
-    }
 
     // For writes, divide by 2, since that is sent two ways (from client to server and then from server to client)
     // Reads are one way, so no need to scale
@@ -135,10 +143,11 @@ int main(int argc, char *argv[]) {
     // if (hipSetDevice(DEFAULT_GPU_ID)) { throw std::runtime_error("Couldn't select GPU!"); }
 
     // Allocate four buffers for the scatter operation
-    int* vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int* vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int* vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int* vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
+    if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
+    int* vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 0});
+    int* vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 1});
+    int* vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 2});
+    int* vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 3});
 
     // Print all the new buffer addresses
     std::cout << "Scatter buffer addresses:" << std::endl;
@@ -153,11 +162,14 @@ int main(int argc, char *argv[]) {
     }
 
     // Write the buffer addresses to the vFPGA registers
-    coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_1), static_cast<uint32_t>(ScatterRegisters::VADDR_1));
+    /* coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_1), static_cast<uint32_t>(ScatterRegisters::VADDR_1));
     coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_2), static_cast<uint32_t>(ScatterRegisters::VADDR_2));
     coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_3), static_cast<uint32_t>(ScatterRegisters::VADDR_3));
     coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_4), static_cast<uint32_t>(ScatterRegisters::VADDR_4));
-    coyote_thread.setCSR(static_cast<uint64_t>(true), static_cast<uint32_t>(ScatterRegisters::VADDR_VALID));
+    coyote_thread.setCSR(static_cast<uint64_t>(true), static_cast<uint32_t>(ScatterRegisters::VADDR_VALID)); */
+
+    // Array of destination buffers for scatter operation
+    int* dest_buffers[NUM_GPUS] = { vaddr_1, vaddr_2, vaddr_3, vaddr_4 };
 
     // sleep(20);
 
@@ -173,7 +185,7 @@ int main(int argc, char *argv[]) {
         // double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
        // std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
 
-        double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
+        double latency_time = run_bench(coyote_thread, sg, mem, dest_buffers, N_LATENCY_REPS, n_runs, operation);
         std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
 
         curr_size *= 2;
```
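For reference, the round-robin scatter inside `bench_fn` above can be written as a self-contained helper. This sketch indexes the host buffer in bytes and advances each destination offset per round so successive chunks land contiguously; the hunk above strides an `int*` host pointer and rewrites each buffer's start, so treat this as the general pattern rather than a drop-in replacement:

```cpp
// Standalone sketch of a round-robin host-to-GPU scatter with HIP.
// CHUNK_BYTES and the byte-based indexing are choices made for this sketch.
#include <hip/hip_runtime.h>
#include <cstddef>
#include <stdexcept>

constexpr int NUM_GPUS = 4;
constexpr size_t CHUNK_BYTES = 8192;

void scatter_to_gpus(const char *host_buf, size_t total_bytes, void *dest[NUM_GPUS]) {
    // Issue one asynchronous copy per chunk, cycling through the GPUs
    for (size_t i = 0; i < total_bytes / CHUNK_BYTES; i++) {
        int gpu = static_cast<int>(i % NUM_GPUS);
        if (hipSetDevice(gpu) != hipSuccess)
            throw std::runtime_error("Couldn't select GPU!");
        if (hipMemcpyAsync(static_cast<char *>(dest[gpu]) + (i / NUM_GPUS) * CHUNK_BYTES,
                           host_buf + i * CHUNK_BYTES,
                           CHUNK_BYTES, hipMemcpyHostToDevice) != hipSuccess)
            throw std::runtime_error("Couldn't issue async copy!");
    }
    // Wait until every device has drained its copy queue
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        if (hipSetDevice(gpu) != hipSuccess)
            throw std::runtime_error("Couldn't select GPU!");
        if (hipDeviceSynchronize() != hipSuccess)
            throw std::runtime_error("Couldn't synchronize device!");
    }
}
```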
