Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ rocprofiler_add_option(
ROCPROFILER_BUILD_CI "Enable continuous integration default values for options" OFF
ADVANCED)

rocprofiler_add_option(ROCPROFILER_GPU_TARGETS "Targets for building tests and samples"
"")
if(NOT ROCPROFILER_GPU_TARGETS OR ROCPROFILER_GPU_TARGETS STREQUAL "")
set(ROCPROFILER_GPU_TARGETS
"gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102")
endif()

rocprofiler_add_option(ROCPROFILER_BUILD_TESTS "Enable building the tests"
${ROCPROFILER_BUILD_CI})
rocprofiler_add_option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
// SOFTWARE.

#include <hip/hip_runtime.h>
#include <stdlib.h>
#include <time.h>
#include <cassert>

#include "client.hpp"

Expand Down Expand Up @@ -48,6 +51,92 @@ kernelA(int devid, volatile int* wait_on, int value, int* no_opt)
printf("[device=%i][return] Wait on %i: %i (%i)\n", devid, value, *wait_on, *no_opt);
}

// Force assert to work even in Release builds for this test
#ifdef NDEBUG
# undef NDEBUG
# include <cassert>
# define NDEBUG
#endif
Comment on lines 54 to 59
Copy link

Copilot AI Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Manipulating NDEBUG to force assert behavior in release builds is problematic and can cause undefined behavior. Consider using a custom assertion macro or explicit error handling instead of redefining system macros.

Copilot uses AI. Check for mistakes.

__global__ void
check_order_kernel(int expected, int* actual)
{
// Note: We do not use atomics here on purpose to ensure that the barrier
// being injected has proper fencing set.
if(*actual != expected)
{
printf("[error] Expected %i but got %i\n", expected, *actual);
}
// Assert will now work in both Debug and Release builds
assert(*actual == expected);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe needs NDEBUG?

Copy link
Contributor Author

@bwelton bwelton Aug 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, if it fails, we want the CI to trigger and fail this sample/test (this should never fail).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but unless we remove NDEBUG, this check will not be there in Release builds.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh thats what you are asking about, changed. Thanks for the comment here!

(*actual)++;
}

class DualStreamExecutor
{
private:
hipStream_t stream1_ = {};
hipStream_t stream2_ = {};
int device_ = {0};

public:
DualStreamExecutor(int device = 0)
: device_(device)
{
HIP_CALL(hipSetDevice(device_));
HIP_CALL(hipStreamCreate(&stream1_));
HIP_CALL(hipStreamCreate(&stream2_));
std::cout << "Created dual streams on device " << device_ << std::endl;
}

~DualStreamExecutor()
{
HIP_CALL(hipStreamDestroy(stream1_));
HIP_CALL(hipStreamDestroy(stream2_));
}

// Function template to launch any kernel on both streams
template <typename KernelFunc, typename... Args>
void launch_kernel_on_both_streams(KernelFunc kernel,
dim3 gridSize,
dim3 blockSize,
size_t sharedMem,
Args... args)
{
hipLaunchKernelGGL(kernel, gridSize, blockSize, sharedMem, stream1_, args...);
hipLaunchKernelGGL(kernel, gridSize, blockSize, sharedMem, stream2_, args...);
}

// Synchronize both streams
void synchronize()
{
HIP_CALL(hipStreamSynchronize(stream1_));
HIP_CALL(hipStreamSynchronize(stream2_));
std::cout << "Both streams synchronized" << std::endl;
}

// Get stream handles
hipStream_t get_stream1() const { return stream1_; }
hipStream_t get_stream2() const { return stream2_; }

// Execute async memory operations on both streams
void async_memcpy_to_device(void* dst1, void* dst2, const void* src, size_t size)
{
HIP_CALL(hipMemcpyAsync(dst1, src, size, hipMemcpyHostToDevice, stream1_));
HIP_CALL(hipMemcpyAsync(dst2, src, size, hipMemcpyHostToDevice, stream2_));
}

void async_memcpy_to_host(void* dst1,
void* dst2,
const void* src1,
const void* src2,
size_t size)
{
HIP_CALL(hipMemcpyAsync(dst1, src1, size, hipMemcpyDeviceToHost, stream1_));
HIP_CALL(hipMemcpyAsync(dst2, src2, size, hipMemcpyDeviceToHost, stream2_));
}
};

int
main(int, char**)
{
Expand Down Expand Up @@ -76,5 +165,27 @@ main(int, char**)
HIP_CALL(hipSetDevice(0));
HIP_CALL(hipDeviceSynchronize());

// Validate that kernels are being processed in order on the same device
HIP_CALL(hipSetDevice(0));
DualStreamExecutor executor(0);
*no_opt_0 = 0;
// Use reproducible seed for deterministic testing and easier debugging
srand(12345);
Comment on lines 172 to 173
Copy link

Copilot AI Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Using the global srand() function can affect other parts of the program. Consider using a local random number generator (e.g., std::mt19937) with explicit state management for better isolation.

Copilot uses AI. Check for mistakes.

for(int i = 0; i < 10000; i++)
{
if(rand() & 1)
{
hipLaunchKernelGGL(
check_order_kernel, dim3(1), dim3(1), 0, executor.get_stream1(), i, no_opt_0);
}
else
{
hipLaunchKernelGGL(
check_order_kernel, dim3(1), dim3(1), 0, executor.get_stream2(), i, no_opt_0);
}
}
executor.synchronize();
HIP_CALL(hipDeviceSynchronize());
std::cerr << "Run complete\n";
}
20 changes: 20 additions & 0 deletions projects/rocprofiler-sdk/samples/counter_collection/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,16 @@ launchKernels(const long NUM_LAUNCH, const long SYNC_INTERVAL, const int DEV_ID)
hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, 1, 2);
hipLaunchKernelGGL(kernelB, dim3(1), dim3(1), 0, 0, 1, 2);
if(i % SYNC_INTERVAL == (SYNC_INTERVAL - 1)) HIP_CALL(hipDeviceSynchronize());

// Progress indicator every 1000 iterations
if(i % 1000 == 999)
{
printf("[PROGRESS] Device %d: Completed %ld / %ld iterations\n",
DEV_ID,
i + 1,
NUM_LAUNCH);
fflush(stdout);
}
}

const int NElems = 512 * 512;
Expand All @@ -101,6 +111,16 @@ launchKernels(const long NUM_LAUNCH, const long SYNC_INTERVAL, const int DEV_ID)
{
hipLaunchKernelGGL(kernelC, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, NElems);
if(i % SYNC_INTERVAL == (SYNC_INTERVAL - 1)) HIP_CALL(hipDeviceSynchronize());

// Progress indicator every 1000 iterations
if(i % 1000 == 999)
{
printf("[PROGRESS] Device %d: KernelC completed %ld / %ld iterations\n",
DEV_ID,
i + 1,
NUM_LAUNCH);
fflush(stdout);
}
}
HIP_CALL(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIP_CALL(hipDeviceSynchronize());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ target_link_libraries(
PRIVATE rocprofiler-sdk::rocprofiler-sdk rocprofiler-sdk::samples-build-flags
rocprofiler-sdk::samples-common-library)

set(DEFAULT_GPU_TARGETS "gfx906" "gfx908" "gfx90a" "gfx942" "gfx950" "gfx1100" "gfx1101"
"gfx1102")
set(DEFAULT_GPU_TARGETS ${ROCPROFILER_GPU_TARGETS})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is invalid when samples are build against an install -- ROCPROFILER_GPU_TARGETS is not set. Furthermore, all of these targets may not be supported by OpenMP


set(OPENMP_GPU_TARGETS
"${DEFAULT_GPU_TARGETS}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ EvaluateAST::read_pkt(const aql::CounterPacketConstruct* pkt_gen, hsa::AQLPacket

if(status != HSA_STATUS_SUCCESS)
{
ROCP_ERROR << "AqlProfile could not decode packet";
ROCP_ERROR << "AqlProfile could not decode packet " << status;
}
return ret;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "lib/rocprofiler-sdk/registration.hpp"

#include <functional>
#include <sstream>

namespace rocprofiler
{
Expand Down Expand Up @@ -161,5 +162,47 @@ hsa_barrier::clear_barrier()
ROCP_TRACE << "Barrier (handle: " << _barrier_signal.handle << ") is now cleared.";
}

std::string
hsa_barrier::to_string() const
{
std::ostringstream oss;
oss << "hsa_barrier{";

// _barrier_finished (function pointer - just indicate if it's set)
oss << "_barrier_finished: " << (_barrier_finished ? "set" : "null") << ", ";

// _barrier_signal
oss << "_barrier_signal: {handle: " << _barrier_signal.handle << "}, ";

// _queue_waiting
oss << "_queue_waiting: {";
_queue_waiting.rlock([&](const auto& queue_waiting) {
bool first = true;
for(const auto& [queue_id, count] : queue_waiting)
{
if(!first) oss << ", ";
oss << queue_id << ": " << count;
first = false;
}
});
oss << "}, ";

// _barrier_enqueued
oss << "_barrier_enqueued: {";
_barrier_enqueued.rlock([&](const auto& barrier_enqueued) {
bool first = true;
for(const auto& queue_id : barrier_enqueued)
{
if(!first) oss << ", ";
oss << queue_id;
first = false;
}
});
oss << "}";

oss << "}";
return oss.str();
}

} // namespace hsa
} // namespace rocprofiler
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ext_amd.h>

#include <fmt/format.h>

#include <atomic>
#include <functional>
#include <optional>
#include <string>
#include <unordered_map>
#include <unordered_set>

Expand Down Expand Up @@ -72,6 +75,9 @@ class hsa_barrier
// If this is the last queue waiting, clears the barrier and marks it as complete.
void remove_queue(const Queue* queue);

// Returns a string containing all class variable data
std::string to_string() const;

private:
std::function<void()> _barrier_finished = {};
CoreApiTable _core_api = {};
Expand All @@ -86,3 +92,23 @@ class hsa_barrier

} // namespace hsa
} // namespace rocprofiler

namespace fmt
{
// fmt::format support for hsa_barrier
template <>
struct formatter<rocprofiler::hsa::hsa_barrier>
{
template <typename ParseContext>
constexpr auto parse(ParseContext& ctx)
{
return ctx.begin();
}

template <typename Ctx>
auto format(rocprofiler::hsa::hsa_barrier const& barrier, Ctx& ctx) const
{
return fmt::format_to(ctx.out(), "{}", barrier.to_string());
}
};
} // namespace fmt
Loading
Loading