
Commit 67a50e9

Author: Maximilian (committed)
Updated examples for artifact evaluation
1 parent 241a0f8 commit 67a50e9

File tree: 5 files changed (+43, -24 lines)


examples/09_perf_rdma/sw/src/client/main.cpp

Lines changed: 5 additions & 5 deletions
@@ -127,12 +127,12 @@ int main(int argc, char *argv[]) {
 
         coyote::rdmaSg sg = { .len = curr_size };
 
-        // double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
-        // double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
-        // std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
+        double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
+        double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
+        std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
 
-        double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
-        std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
+        // double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
+        // std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
 
         curr_size *= 2;
     }
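For context when reading the numbers this client prints: the throughput expression divides the total bytes moved (repetitions times transfer size) by 1 MiB and by the elapsed time converted from nanoseconds to seconds. A minimal standalone sketch of that same conversion; the repetition count and the measured time below are placeholder values, not the example's actual constants:

    // Sketch only: reproduce the MB/s conversion used in the diff above.
    #include <iostream>
    #include <iomanip>

    int main() {
        const double N_THROUGHPUT_REPS = 1000;     // assumed repetition count (placeholder)
        const double curr_size = 1024.0 * 1024.0;  // 1 MiB per transfer (placeholder)
        const double throughput_time = 5e6;        // total elapsed time in ns (placeholder)

        // total bytes / (1 MiB * elapsed seconds), matching the expression in the example
        double throughput = (N_THROUGHPUT_REPS * curr_size) /
                            (1024.0 * 1024.0 * throughput_time * 1e-9);
        std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s" << std::endl;
    }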

examples/09_perf_rdma/sw/src/include/constants.hpp

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 #define DEFAULT_VFPGA_ID 0
 
 // Run-time parameters; users can change these from the CLI
-#define N_RUNS_DEFAULT 10
+#define N_RUNS_DEFAULT 30
 #define MIN_TRANSFER_SIZE_DEFAULT 64
 #define MAX_TRANSFER_SIZE_DEFAULT (1 * 1024 * 1024)
 

examples/11_rdma_scatter/sw/src/client/main.cpp

Lines changed: 13 additions & 7 deletions
@@ -100,14 +100,16 @@ int main(int argc, char *argv[]) {
     bool operation;
     std::string server_ip;
     unsigned int min_size, max_size, n_runs;
+    bool throughput;
 
     boost::program_options::options_description runtime_options("Coyote Perf RDMA Options");
     runtime_options.add_options()
         ("ip_address,i", boost::program_options::value<std::string>(&server_ip), "Server's IP address")
         ("operation,o", boost::program_options::value<bool>(&operation)->default_value(false), "Benchmark operation: READ(0) or WRITE(1)")
         ("runs,r", boost::program_options::value<unsigned int>(&n_runs)->default_value(N_RUNS_DEFAULT), "Number of times to repeat the test")
         ("min_size,x", boost::program_options::value<unsigned int>(&min_size)->default_value(MIN_TRANSFER_SIZE_DEFAULT), "Starting (minimum) transfer size")
-        ("max_size,X", boost::program_options::value<unsigned int>(&max_size)->default_value(MAX_TRANSFER_SIZE_DEFAULT), "Ending (maximum) transfer size");
+        ("max_size,X", boost::program_options::value<unsigned int>(&max_size)->default_value(MAX_TRANSFER_SIZE_DEFAULT), "Ending (maximum) transfer size")
+        ("throughput,t", boost::program_options::bool_switch(&throughput)->default_value(false), "Whether to benchmark throughput (true) or latency (false); by default, latency is benchmarked");
     boost::program_options::variables_map command_line_arguments;
     boost::program_options::store(boost::program_options::parse_command_line(argc, argv, runtime_options), command_line_arguments);
     boost::program_options::notify(command_line_arguments);

@@ -118,6 +120,8 @@ int main(int argc, char *argv[]) {
     std::cout << "Number of test runs: " << n_runs << std::endl;
     std::cout << "Starting transfer size: " << min_size << std::endl;
     std::cout << "Ending transfer size: " << max_size << std::endl << std::endl;
+    std::cout << "Benchmarking " << (throughput ? "throughput" : "latency") << std::endl << std::endl;
+
 
     /* Coyote completely abstracts the complexity behind exchanging QPs and setting up an RDMA connection
      * Instead, given a cThread, the target RDMA buffer size and the remote server's TCP address,

@@ -166,12 +170,14 @@ int main(int argc, char *argv[]) {
 
         coyote::rdmaSg sg = { .len = curr_size };
 
-        // double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
-        // double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
-        // std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
-
-        double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
-        std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
+        if(throughput) {
+            double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
+            double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
+            std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; \n";
+        } else {
+            double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
+            std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
+        }
 
         curr_size *= 2;
     }
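The new --throughput/-t option is a boost::program_options bool_switch, so it defaults to false and flips to true simply by passing the flag, with no value argument. A self-contained sketch of that parsing pattern; the option and variable names mirror the diff, but the surrounding main is illustrative only:

    // Sketch only: the bool_switch pattern used for the new --throughput flag.
    #include <boost/program_options.hpp>
    #include <iostream>

    int main(int argc, char *argv[]) {
        bool throughput;
        boost::program_options::options_description opts("Options");
        opts.add_options()
            ("throughput,t",
             boost::program_options::bool_switch(&throughput)->default_value(false),
             "Benchmark throughput (flag present) or latency (default)");

        boost::program_options::variables_map vm;
        boost::program_options::store(
            boost::program_options::parse_command_line(argc, argv, opts), vm);
        boost::program_options::notify(vm);

        // e.g. "./client -t" prints "throughput"; "./client" prints "latency"
        std::cout << "Benchmarking " << (throughput ? "throughput" : "latency") << std::endl;
    }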

examples/11_rdma_scatter/sw/src/include/constants.hpp

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 #define DEFAULT_VFPGA_ID 0
 
 // Run-time parameters; users can change these from the CLI
-#define N_RUNS_DEFAULT 10
+#define N_RUNS_DEFAULT 10
 #define MIN_TRANSFER_SIZE_DEFAULT 64
 #define MAX_TRANSFER_SIZE_DEFAULT (1 * 1024 * 1024)
 

examples/12_rdma_scatter_baseline/sw/src/client/main.cpp

Lines changed: 23 additions & 10 deletions
@@ -54,8 +54,10 @@ enum class ScatterRegisters: uint32_t {
 // the thread object which can lead to undefined behaviour and bugs.
 double run_bench(
     coyote::cThread &coyote_thread, coyote::rdmaSg &sg,
-    int *mem, int* dest_buffers[], uint transfers, uint n_runs, bool operation
+    uint8_t *mem, int* dest_buffers[], uint transfers, uint n_runs, bool operation
 ) {
+
+    printf("Running benchmark with transfer size %d bytes, repeated %d times\n", sg.len, transfers);
     // When writing, the server asserts the written payload is correct (which the client sets)
     // When reading, the client asserts the read payload is correct (which the server sets)
     for (int i = 0; i < sg.len / sizeof(int); i++) {

@@ -82,13 +84,18 @@ double run_bench(
 
     while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
 
+    // printf("Received all data from server, starting scatter to GPUs...\n");
+
     // After having received all the data, scatter it to the four GPU buffers
     for(int i = 0; i < sg.len / 8192; i++) {
         // printf("Copying chunk %d/%d to GPU %d\n", i+1, sg.len/4096, i%4);
-        if (hipSetDevice(i%4)) { throw std::runtime_error("Couldn't select GPU!"); }
-        hipMemcpyAsync(dest_buffers[i % 4], &mem[i * 8192], 8192, hipMemcpyHostToDevice);
+        // printf("Copying chunk %d/%d to GPU %d\n", i+1, sg.len/8192, i%NUM_GPUS);
+        if (hipSetDevice(i%NUM_GPUS)) { throw std::runtime_error("Couldn't select GPU!"); }
+        if (hipMemcpyAsync(dest_buffers[i % NUM_GPUS], &mem[i * 8192], 8192, hipMemcpyHostToDevice) != hipSuccess) { throw std::runtime_error("Couldn't copy memory to GPU!"); }
     }
 
+    //printf("Issued all async copies to GPUs\n");
+
     // Sync mem copy for all the GPUs
     for(int i = 0; i < NUM_GPUS; i++) {
         if (hipSetDevice(i)) { throw std::runtime_error("Couldn't select GPU!"); }

@@ -111,14 +118,16 @@ int main(int argc, char *argv[]) {
     bool operation;
     std::string server_ip;
    unsigned int min_size, max_size, n_runs;
+    bool throughput;
 
     boost::program_options::options_description runtime_options("Coyote Perf RDMA Options");
     runtime_options.add_options()
         ("ip_address,i", boost::program_options::value<std::string>(&server_ip), "Server's IP address")
         ("operation,o", boost::program_options::value<bool>(&operation)->default_value(false), "Benchmark operation: READ(0) or WRITE(1)")
         ("runs,r", boost::program_options::value<unsigned int>(&n_runs)->default_value(N_RUNS_DEFAULT), "Number of times to repeat the test")
         ("min_size,x", boost::program_options::value<unsigned int>(&min_size)->default_value(MIN_TRANSFER_SIZE_DEFAULT), "Starting (minimum) transfer size")
-        ("max_size,X", boost::program_options::value<unsigned int>(&max_size)->default_value(MAX_TRANSFER_SIZE_DEFAULT), "Ending (maximum) transfer size");
+        ("max_size,X", boost::program_options::value<unsigned int>(&max_size)->default_value(MAX_TRANSFER_SIZE_DEFAULT), "Ending (maximum) transfer size")
+        ("throughput,t", boost::program_options::bool_switch(&throughput)->default_value(false), "Whether to benchmark throughput (true) or latency (false); by default, latency is benchmarked");
     boost::program_options::variables_map command_line_arguments;
     boost::program_options::store(boost::program_options::parse_command_line(argc, argv, runtime_options), command_line_arguments);
     boost::program_options::notify(command_line_arguments);

@@ -129,14 +138,15 @@ int main(int argc, char *argv[]) {
     std::cout << "Number of test runs: " << n_runs << std::endl;
     std::cout << "Starting transfer size: " << min_size << std::endl;
     std::cout << "Ending transfer size: " << max_size << std::endl << std::endl;
+    std::cout << "Benchmarking " << (throughput ? "throughput" : "latency") << std::endl << std::endl;
 
     /* Coyote completely abstracts the complexity behind exchanging QPs and setting up an RDMA connection
      * Instead, given a cThread, the target RDMA buffer size and the remote server's TCP address,
      * One can use the function initRDMA, which will allocate the buffer and
      * Exchange the necessary information with the server; the server calls the equivalent function but without the IP address
      */
     coyote::cThread coyote_thread(DEFAULT_VFPGA_ID, getpid(), 0);
-    int *mem = (int *) coyote_thread.initRDMA(max_size, coyote::DEF_PORT, server_ip.c_str());
+    uint8_t *mem = (uint8_t *) coyote_thread.initRDMA(max_size, coyote::DEF_PORT, server_ip.c_str());
     if (!mem) { throw std::runtime_error("Could not allocate memory; exiting..."); }
 
     // GPU memory will be allocated on the GPU set using hipSetDevice(...)

@@ -181,12 +191,15 @@ int main(int argc, char *argv[]) {
 
         coyote::rdmaSg sg = { .len = curr_size };
 
-        // double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
-        // double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
-        // std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
+        if(throughput) {
+            double throughput_time = run_bench(coyote_thread, sg, mem, dest_buffers, N_THROUGHPUT_REPS, n_runs, operation);
+            double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
+            std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
+        } else {
 
-        double latency_time = run_bench(coyote_thread, sg, mem, dest_buffers, N_LATENCY_REPS, n_runs, operation);
-        std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
+            double latency_time = run_bench(coyote_thread, sg, mem, dest_buffers, N_LATENCY_REPS, n_runs, operation);
+            std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
+        }
 
         curr_size *= 2;
     }
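The updated scatter loop copies the received RDMA buffer to the GPUs in 8192-byte chunks, round-robin over NUM_GPUS devices, and now checks the return code of hipMemcpyAsync as well as hipSetDevice. A stripped-down sketch of the same pattern; NUM_GPUS, the chunk size, and the error messages follow the diff, while the final synchronization via hipDeviceSynchronize is an assumption, since the hunk cuts off before that step:

    // Sketch only: round-robin scatter of a host buffer across GPUs with HIP,
    // checking every call as the updated example does.
    #include <hip/hip_runtime.h>
    #include <cstdint>
    #include <stdexcept>

    constexpr int NUM_GPUS = 4;      // assumed device count, as in the example
    constexpr size_t CHUNK = 8192;   // chunk size used by the scatter loop

    void scatter(uint8_t *mem, size_t len, int *dest_buffers[]) {
        for (size_t i = 0; i < len / CHUNK; i++) {
            // Each chunk goes to GPU i % NUM_GPUS; as in the example, it lands at
            // the start of that GPU's destination buffer.
            if (hipSetDevice(static_cast<int>(i % NUM_GPUS)) != hipSuccess)
                throw std::runtime_error("Couldn't select GPU!");
            if (hipMemcpyAsync(dest_buffers[i % NUM_GPUS], &mem[i * CHUNK],
                               CHUNK, hipMemcpyHostToDevice) != hipSuccess)
                throw std::runtime_error("Couldn't copy memory to GPU!");
        }
        // Assumed synchronization step: wait for all asynchronous copies to finish
        // before the host buffer is reused for the next run.
        for (int d = 0; d < NUM_GPUS; d++) {
            if (hipSetDevice(d) != hipSuccess)
                throw std::runtime_error("Couldn't select GPU!");
            if (hipDeviceSynchronize() != hipSuccess)
                throw std::runtime_error("Couldn't synchronize GPU!");
        }
    }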
