Skip to content

Commit a8c0f66

Browse files
author
Maximilian
committed
Fixed SW for Baseline and P2P
1 parent 80dcf7f commit a8c0f66

File tree

4 files changed

+21
-6
lines changed

4 files changed

+21
-6
lines changed

examples/09_perf_rdma/sw/src/client/main.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ double run_bench(
7171
};
7272

7373
// Execute benchmark
74-
coyote::cBench bench(n_runs, 0);
74+
coyote::cBench bench(n_runs, 10);
7575
bench.execute(bench_fn, prep_fn);
7676

7777
// Functional correctness check
@@ -127,9 +127,9 @@ int main(int argc, char *argv[]) {
127127

128128
coyote::rdmaSg sg = { .len = curr_size };
129129

130-
double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
131-
double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
132-
std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
130+
// double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
131+
// double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
132+
// std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
133133

134134
double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
135135
std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;

examples/11_rdma_scatter/sw/src/client/main.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,11 @@ double run_bench(
7777
auto bench_fn = [&]() {
7878
for (int i = 0; i < transfers; i++) {
7979
coyote_thread.invoke(coyote_operation, sg);
80+
// printf("Invoked %d/%d\n", i+1, transfers);
8081
}
8182

8283
while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
84+
// printf("Completed %d transfers\n", transfers);
8385
};
8486

8587
// Execute benchmark

examples/11_rdma_scatter/sw/src/server/main.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,13 @@ void run_bench(
6969

7070
// For writes, wait until client has written the target number of messages; then write them back
7171
if (operation) {
72+
// printf("Run %d: Waiting for %u writes from client...\n", i, transfers);
7273
while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
74+
// printf("Run %d: Received %u writes from client; writing them back...\n", i, transfers);
7375

7476
for (int i = 0; i < transfers; i++) {
7577
coyote_thread.invoke(coyote::CoyoteOper::REMOTE_RDMA_WRITE, sg);
78+
// printf("Run %d: Invoked write %d/%d\n", i, i+1, transfers);
7679
}
7780
// For reads, the server is completely passive
7881
} else {
@@ -123,10 +126,13 @@ int main(int argc, char *argv[]) {
123126

124127
int* vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 0});
125128

129+
// if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
126130
int* vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 1});
127131

132+
// if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
128133
int* vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 2});
129134

135+
// if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
130136
int* vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 3});
131137

132138
// Print all the new buffer addresses
@@ -154,7 +160,7 @@ int main(int argc, char *argv[]) {
154160
while(curr_size <= max_size) {
155161
coyote::rdmaSg sg = { .len = curr_size };
156162
// run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
157-
run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
163+
run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs + 10, operation);
158164
curr_size *= 2;
159165
}
160166

examples/12_rdma_scatter_baseline/sw/src/server/main.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ void run_bench(
9292
// printf("Copying chunk %d/%d to GPU %d\n", i+1, sg.len/4096, i%4);
9393
if (hipSetDevice(i%4)) { throw std::runtime_error("Couldn't select GPU!"); }
9494
hipMemcpyAsync(dest_buffers[i % 4], &mem[i * 4096], 4096, hipMemcpyHostToDevice);
95+
96+
// if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
97+
// hipMemcpyAsync(dest_buffers[0], &mem[0], sg.len, hipMemcpyHostToDevice);
9598
// hipMemcpy(dest_buffers[i % 4], &mem[i * 4096], 4096, hipMemcpyHostToDevice);
9699
// hipEventRecord(events[i % 4], streams[i % 4]);
97100
}
@@ -125,7 +128,11 @@ void run_bench(
125128
// Sync mem copy for all the GPUs
126129
for(int i = 0; i < NUM_GPUS; i++) {
127130
if (hipSetDevice(i)) { throw std::runtime_error("Couldn't select GPU!"); }
128-
if (hipStreamSynchronize(streams[i]) != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
131+
if (hipDeviceSynchronize() != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
132+
133+
// if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
134+
// if (hipDeviceSynchronize() != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
135+
129136
// printf("Synchronized stream for GPU %d\n", i);
130137
}
131138

0 commit comments

Comments
 (0)