@@ -28,7 +28,7 @@
 #include <cstdlib>

 // AMD GPU management & run-time libraries
-// #include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>

 // External library for easier parsing of CLI arguments by the executable
 #include <boost/program_options.hpp>
@@ -39,6 +39,7 @@
 #include "constants.hpp"

 constexpr bool const IS_CLIENT = true;
+const int NUM_GPUS = 4;

 // Registers, corresponding to the registers defined in the vFPGA
 enum class ScatterRegisters : uint32_t {
@@ -53,7 +54,7 @@ enum class ScatterRegisters : uint32_t {
 // the thread object which can lead to undefined behaviour and bugs.
 double run_bench(
     coyote::cThread &coyote_thread, coyote::rdmaSg &sg,
-    int *mem, uint transfers, uint n_runs, bool operation
+    int *mem, int *dest_buffers[], uint transfers, uint n_runs, bool operation
 ) {
     // When writing, the server asserts the written payload is correct (which the client sets)
     // When reading, the client asserts the read payload is correct (which the server sets)
@@ -80,18 +81,25 @@ double run_bench(
        }

        while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
+
+        // After having received all the data, scatter it to the GPU buffers in 8 KiB (2048-int) chunks:
+        // chunk i goes to GPU i % NUM_GPUS, at chunk offset i / NUM_GPUS within that GPU's buffer
+        for (int i = 0; i < sg.len / 8192; i++) {
+            if (hipSetDevice(i % NUM_GPUS)) { throw std::runtime_error("Couldn't select GPU!"); }
+            hipMemcpyAsync(dest_buffers[i % NUM_GPUS] + (i / NUM_GPUS) * 2048, mem + i * 2048, 8192, hipMemcpyHostToDevice);
+        }
+
+        // Wait until the asynchronous copies have completed on every GPU
+        for (int i = 0; i < NUM_GPUS; i++) {
+            if (hipSetDevice(i)) { throw std::runtime_error("Couldn't select GPU!"); }
+            if (hipDeviceSynchronize() != hipSuccess) { throw std::runtime_error("Couldn't synchronize device!"); }
+        }
    };

    // Execute benchmark
-    coyote::cBench bench(n_runs, 0);
+    coyote::cBench bench(n_runs, 10);
    bench.execute(bench_fn, prep_fn);

-    // Functional correctness check
-    if (!operation) {
-        for (int i = 0; i < sg.len / sizeof(int); i++) {
-            assert(mem[i] == i);
-        }
-    }

    // For writes, divide by 2, since that is sent two ways (from client to server and then from server to client)
    // Reads are one way, so no need to scale
@@ -135,10 +143,11 @@ int main(int argc, char *argv[]) {
    // if (hipSetDevice(DEFAULT_GPU_ID)) { throw std::runtime_error("Couldn't select GPU!"); }

    // Allocate four buffers for the scatter operation
-    int *vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int *vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int *vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
-    int *vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::HPF, max_size});
+    if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
+    int *vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 0});
+    int *vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 1});
+    int *vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 2});
+    int *vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 3});

    // Print all the new buffer addresses
    std::cout << "Scatter buffer addresses:" << std::endl;
@@ -153,11 +162,14 @@ int main(int argc, char *argv[]) {
    }

    // Write the buffer addresses to the vFPGA registers
-    coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_1), static_cast<uint32_t>(ScatterRegisters::VADDR_1));
+    /* coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_1), static_cast<uint32_t>(ScatterRegisters::VADDR_1));
    coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_2), static_cast<uint32_t>(ScatterRegisters::VADDR_2));
    coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_3), static_cast<uint32_t>(ScatterRegisters::VADDR_3));
    coyote_thread.setCSR(reinterpret_cast<uint64_t>(vaddr_4), static_cast<uint32_t>(ScatterRegisters::VADDR_4));
-    coyote_thread.setCSR(static_cast<uint64_t>(true), static_cast<uint32_t>(ScatterRegisters::VADDR_VALID));
+    coyote_thread.setCSR(static_cast<uint64_t>(true), static_cast<uint32_t>(ScatterRegisters::VADDR_VALID)); */
+
+    // Array of destination buffers for the scatter operation
+    int *dest_buffers[NUM_GPUS] = { vaddr_1, vaddr_2, vaddr_3, vaddr_4 };

    // sleep(20);

@@ -173,7 +185,7 @@ int main(int argc, char *argv[]) {
        // double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
        // std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";

-        double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
+        double latency_time = run_bench(coyote_thread, sg, mem, dest_buffers, N_LATENCY_REPS, n_runs, operation);
        std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;

        curr_size *= 2;
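
For reference, below is a minimal, self-contained sketch of the round-robin host-to-GPU scatter pattern that the new bench_fn code performs, written against the public HIP API only. The 1 MiB payload, the HIP_CHECK helper, and the buffer names are illustrative assumptions, not part of this commit; the Coyote-specific pieces (cThread, getMem, RDMA completion polling) are intentionally omitted.

#include <hip/hip_runtime.h>
#include <cstdio>
#include <stdexcept>

constexpr int NUM_GPUS = 4;
constexpr size_t CHUNK_BYTES = 8192;                      // one 8 KiB chunk per transfer
constexpr size_t CHUNK_INTS = CHUNK_BYTES / sizeof(int);  // 2048 ints per chunk

// Small helper (assumption, not in the commit) to turn HIP errors into exceptions
#define HIP_CHECK(call)                                           \
    do {                                                          \
        hipError_t err = (call);                                  \
        if (err != hipSuccess)                                    \
            throw std::runtime_error(hipGetErrorString(err));     \
    } while (0)

int main() {
    const size_t total_bytes = 1 << 20;  // 1 MiB host payload (illustrative)
    const size_t n_chunks = total_bytes / CHUNK_BYTES;

    // Pinned host memory, so hipMemcpyAsync can actually overlap across devices
    int *host;
    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&host), total_bytes, hipHostMallocDefault));
    for (size_t i = 0; i < total_bytes / sizeof(int); i++) host[i] = static_cast<int>(i);

    // One destination buffer per device, each holding that device's share of the chunks
    int *dest[NUM_GPUS];
    for (int d = 0; d < NUM_GPUS; d++) {
        HIP_CHECK(hipSetDevice(d));
        HIP_CHECK(hipMalloc(reinterpret_cast<void **>(&dest[d]), total_bytes / NUM_GPUS));
    }

    // Chunk i goes to device i % NUM_GPUS, at chunk slot i / NUM_GPUS in that buffer
    for (size_t i = 0; i < n_chunks; i++) {
        const int d = static_cast<int>(i % NUM_GPUS);
        HIP_CHECK(hipSetDevice(d));
        HIP_CHECK(hipMemcpyAsync(dest[d] + (i / NUM_GPUS) * CHUNK_INTS,
                                 host + i * CHUNK_INTS,
                                 CHUNK_BYTES, hipMemcpyHostToDevice));
    }

    // Wait for the copies on every device before reusing the host buffer
    for (int d = 0; d < NUM_GPUS; d++) {
        HIP_CHECK(hipSetDevice(d));
        HIP_CHECK(hipDeviceSynchronize());
    }

    printf("Scattered %zu chunks across %d GPUs\n", n_chunks, NUM_GPUS);

    for (int d = 0; d < NUM_GPUS; d++) {
        HIP_CHECK(hipSetDevice(d));
        HIP_CHECK(hipFree(dest[d]));
    }
    HIP_CHECK(hipHostFree(host));
    return 0;
}

Note the addressing: chunks are 8192 bytes but the buffers are int pointers, so all offsets advance in units of 2048 ints; in the commit itself, indexing the host buffer as mem[i * 8192] would step 8192 ints (32 KiB) per chunk and run past the received data, which is why the copy loop above (and the edited diff) uses i * 2048.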