Fixed SW for Baseline and P2P

Maximilian · Maximilian · commit a8c0f6614cf8 · 2025-09-11T17:30:06.000+02:00
diff --git a/examples/09_perf_rdma/sw/src/client/main.cpp b/examples/09_perf_rdma/sw/src/client/main.cpp
@@ -71,7 +71,7 @@ double run_bench(
     };
 
     // Execute benchmark
-    coyote::cBench bench(n_runs, 0);
+    coyote::cBench bench(n_runs, 10);
     bench.execute(bench_fn, prep_fn);
 
     // Functional correctness check
@@ -127,9 +127,9 @@ int main(int argc, char *argv[])  {
         
         coyote::rdmaSg sg = { .len = curr_size };
     
-        double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
-        double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
-        std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
+        // double throughput_time = run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
+        // double throughput = ((double) N_THROUGHPUT_REPS * (double) curr_size) / (1024.0 * 1024.0 * throughput_time * 1e-9);
+        // std::cout << "Average throughput: " << std::setw(8) << throughput << " MB/s; ";
         
         double latency_time = run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
         std::cout << "Average latency: " << std::setw(8) << latency_time / 1e3 << " us" << std::endl;
diff --git a/examples/11_rdma_scatter/sw/src/client/main.cpp b/examples/11_rdma_scatter/sw/src/client/main.cpp
@@ -77,9 +77,11 @@ double run_bench(
     auto bench_fn = [&]() {        
         for (int i = 0; i < transfers; i++) {
             coyote_thread.invoke(coyote_operation, sg);
+            // printf("Invoked %d/%d\n", i+1, transfers);
         }
 
         while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
+        // printf("Completed %d transfers\n", transfers);
     };
 
     // Execute benchmark
diff --git a/examples/11_rdma_scatter/sw/src/server/main.cpp b/examples/11_rdma_scatter/sw/src/server/main.cpp
@@ -69,10 +69,13 @@ void run_bench(
 
         // For writes, wait until client has written the target number of messages; then write them back
         if (operation) {
+            // printf("Run %d: Waiting for %u writes from client...\n", i, transfers);
             while (coyote_thread.checkCompleted(coyote::CoyoteOper::LOCAL_WRITE) != transfers) {}
+            // printf("Run %d: Received %u writes from client; writing them back...\n", i, transfers);
 
             for (int i = 0; i < transfers; i++) {
                 coyote_thread.invoke(coyote::CoyoteOper::REMOTE_RDMA_WRITE, sg);
+                // printf("Run %d: Invoked write %d/%d\n", i, i+1, transfers);
             }
         // For reads, the server is completely passive 
         } else { 
@@ -123,10 +126,13 @@ int main(int argc, char *argv[])  {
 
     int* vaddr_1 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 0}); 
     
+    // if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
     int* vaddr_2 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 1}); 
     
+    // if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
     int* vaddr_3 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 2});
     
+    // if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
     int* vaddr_4 = (int *) coyote_thread.getMem({coyote::CoyoteAllocType::GPU, max_size, false, 3});
 
     // Print all the new buffer addresses
@@ -154,7 +160,7 @@ int main(int argc, char *argv[])  {
     while(curr_size <= max_size) {
         coyote::rdmaSg sg = { .len = curr_size };
         // run_bench(coyote_thread, sg, mem, N_THROUGHPUT_REPS, n_runs, operation);
-        run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs, operation);
+        run_bench(coyote_thread, sg, mem, N_LATENCY_REPS, n_runs + 10, operation);
         curr_size *= 2;
     }
 
diff --git a/examples/12_rdma_scatter_baseline/sw/src/server/main.cpp b/examples/12_rdma_scatter_baseline/sw/src/server/main.cpp
@@ -92,6 +92,9 @@ void run_bench(
                 // printf("Copying chunk %d/%d to GPU %d\n", i+1, sg.len/4096, i%4);
                 if (hipSetDevice(i%4)) { throw std::runtime_error("Couldn't select GPU!"); }
                 hipMemcpyAsync(dest_buffers[i % 4], &mem[i * 4096], 4096, hipMemcpyHostToDevice);
+
+                // if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
+                // hipMemcpyAsync(dest_buffers[0], &mem[0], sg.len, hipMemcpyHostToDevice);
                 // hipMemcpy(dest_buffers[i % 4], &mem[i * 4096], 4096, hipMemcpyHostToDevice);
                 // hipEventRecord(events[i % 4], streams[i % 4]);
             }
@@ -125,7 +128,11 @@ void run_bench(
             // Sync mem copy for all the GPUs 
             for(int i = 0; i < NUM_GPUS; i++) {
                 if (hipSetDevice(i)) { throw std::runtime_error("Couldn't select GPU!"); }
-                if (hipStreamSynchronize(streams[i]) != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
+                if (hipDeviceSynchronize() != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
+
+                // if (hipSetDevice(0)) { throw std::runtime_error("Couldn't select GPU!"); }
+                // if (hipDeviceSynchronize() != hipSuccess) { throw std::runtime_error("Couldn't synchronize stream!"); }
+
                 // printf("Synchronized stream for GPU %d\n", i);
             }