Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build_tools/lint/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
" additional targets."
),
"multi_gpu": "Used by `xla_test` to signal that multiple GPUs are needed.",
"skip_rocprofiler_sdk": "used to skip rocmtracer test as it calls rocprofiler-sdk via rocprofiler_force_configure",
}


Expand Down
158 changes: 150 additions & 8 deletions xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,58 @@ cc_library(
],
)

# Matches builds invoked with `--define xla_rocm_profiler=v1`.
config_setting(
name = "use_v1",
values = {"define": "xla_rocm_profiler=v1"},
)

# Matches builds invoked with `--define xla_rocm_profiler=v3`.
config_setting(
name = "use_rocprofiler_sdk",
values = {"define": "xla_rocm_profiler=v3"},
)

# Source-less library that only carries the XLA_GPU_ROCM_TRACER_BACKEND
# preprocessor define; targets that depend on it compile with that define set.
# The value is 1 when :use_v1 matches and 3 in every other case (the explicit
# :use_rocprofiler_sdk branch and the default branch yield the same value).
cc_library(
name = "rocm_profiler_backend_cfg",
defines = select({
":use_v1": ["XLA_GPU_ROCM_TRACER_BACKEND=1"],
":use_rocprofiler_sdk": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
"//conditions:default": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
}),
visibility = ["//visibility:public"],
)

# Utility library built from rocm_tracer_utils.{h,cc}. ROCm-only and tagged
# "manual", so it is only built when requested explicitly or as a dependency.
cc_library(
    name = "rocm_tracer_utils",
    srcs = ["rocm_tracer_utils.cc"],
    hdrs = ["rocm_tracer_utils.h"],
    tags = [
        "gpu",
        "manual",
        "rocm-only",
    ],
    visibility = ["//visibility:public"],
    deps = [
        "//xla/tsl/profiler/backends/cpu:annotation_stack",
        "//xla/tsl/profiler/utils:math_utils",
        "//xla/tsl/profiler/utils:time_utils",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/container:node_hash_map",
        "@com_google_absl//absl/container:node_hash_set",
        "@com_google_absl//absl/strings:string_view",
        "@local_config_rocm//rocm:rocprofiler-sdk",
        "@tsl//tsl/platform:env",
        "@tsl//tsl/platform:env_time",
        "@tsl//tsl/platform:errors",
        "@tsl//tsl/platform:logging",
        "@tsl//tsl/platform:macros",
    ],
)

cc_library(
name = "rocm_collector",
srcs = ["rocm_collector.cc"],
hdrs = ["rocm_collector.h"],
# copybara:uncomment compatible_with = ["//buildenv/target:non_prod"],
tags = [
"gpu",
"rocm-only",
Expand All @@ -373,6 +420,8 @@ cc_library(
"manual",
]),
deps = [
":rocm_tracer_utils",
":rocm_profiler_backend_cfg",
"//xla/stream_executor/rocm:roctracer_wrapper",
"//xla/tsl/profiler/backends/cpu:annotation_stack",
"//xla/tsl/profiler/utils:parse_annotation",
Expand All @@ -397,26 +446,54 @@ cc_library(
"@tsl//tsl/platform:types",
"@tsl//tsl/profiler/lib:profiler_factory",
"@tsl//tsl/profiler/lib:profiler_interface",
"@local_config_rocm//rocm:rocprofiler-sdk",
],
)

cc_library(
name = "rocm_tracer",
srcs = ["rocm_tracer.cc"],
hdrs = ["rocm_tracer.h"],
# copybara:uncomment compatible_with = ["//buildenv/target:non_prod"],
name = "rocm_tracer_headers",
hdrs = [
"rocm_tracer.h",
"rocm_profiler_sdk.h",
"rocm_tracer_v1.h",
],
tags = [
"gpu",
"manual",
"rocm-only",
] + if_google([
# TODO(b/360374983): Remove this tag once the target can be built without --config=rocm.
],
# PROPAGATE the layout macro to every dependent TU:
defines = select({
":use_v1": ["XLA_GPU_ROCM_TRACER_BACKEND=1"],
":use_rocprofiler_sdk": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
"//conditions:default": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
}),
visibility = ["//visibility:public"],
)

cc_library(
name = "rocm_tracer_impl",
srcs = select({
":use_v1": ["rocm_tracer_v1.cc"],
":use_rocprofiler_sdk": ["rocm_profiler_sdk.cc"],
"//conditions:default": ["rocm_profiler_sdk.cc"],
}),
tags = [
"gpu",
"manual",
]),
"rocm-only",
],
deps = [
":rocm_tracer_headers",
":rocm_collector",
"//xla/stream_executor/rocm:roctracer_wrapper",
"//xla/tsl/profiler/backends/cpu:annotation_stack",
"//xla/tsl/profiler/utils:time_utils",
"//xla/tsl/profiler/utils:xplane_builder",
"//xla/tsl/profiler/utils:xplane_schema",
"//xla/tsl/profiler/utils:xplane_utils",
"//xla/tsl/util:env_var",
"//xla:debug_options_flags",
"@com_google_absl//absl/container:fixed_array",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:flat_hash_set",
Expand All @@ -433,9 +510,74 @@ cc_library(
"@tsl//tsl/platform:status",
"@tsl//tsl/platform:thread_annotations",
"@tsl//tsl/platform:types",
"@tsl//tsl/profiler/lib:profiler_factory",
"@tsl//tsl/profiler/lib:profiler_interface",
],
)

# Public facade target: has no sources of its own and simply bundles the
# tracer headers with the selected tracer implementation.
cc_library(
    name = "rocm_tracer",
    tags = [
        "gpu",
        "manual",
        "rocm-only",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":rocm_tracer_headers",
        ":rocm_tracer_impl",
    ],
)

# Unit test for the ROCm tracer.
# NOTE: upstream declares this test with xla_cc_test, since no GPU is involved.
xla_test(
name = "rocm_tracer_test",
size = "small",
srcs = ["rocm_tracer_test.cc"],
tags = [
"gpu",
"rocm-only",
"skip_rocprofiler_sdk", # the test reaches rocprofiler-sdk through rocprofiler_force_configure
] + if_google([
# Internal builds: do not pick this test up via wildcards; it only runs
# when the ROCm config is enabled and the target is requested explicitly.
"manual",
]),
deps = [
":rocm_tracer",
":rocm_tracer_utils",
"//xla/tsl/profiler/utils:xplane_builder",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:status_matchers",
"@tsl//tsl/platform:test",
"@tsl//tsl/profiler/protobuf:xplane_proto_cc",
],
)

# Unit test for the ROCm trace collector (rocm_collector_test.cc).
xla_test(
    name = "rocm_collector_test",
    size = "small",
    srcs = ["rocm_collector_test.cc"],
    tags = [
        "gpu",
        "rocm-only",
    ] + if_google(["manual"]),
    deps = [
        ":rocm_collector",
        ":rocm_tracer_utils",
        "//xla/tsl/profiler/utils:xplane_builder",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_googletest//:gtest_main",
        "@tsl//tsl/platform:env",
        "@tsl//tsl/platform:env_time",
        "@tsl//tsl/platform:errors",
        "@tsl//tsl/platform:logging",
        "@tsl//tsl/platform:macros",
        "@tsl//tsl/platform:status_matchers",
        "@tsl//tsl/platform:test",
        "@tsl//tsl/profiler/protobuf:xplane_proto_cc",
    ],
)

cc_library(
name = "cupti_collector",
srcs = ["cupti_collector.cc"],
Expand Down
52 changes: 28 additions & 24 deletions xla/backends/profiler/gpu/device_tracer_rocm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,22 @@ limitations under the License.
#include "rocm/include/roctracer/ext/prof_protocol.h"
#include "xla/backends/profiler/gpu/rocm_collector.h"
#include "xla/backends/profiler/gpu/rocm_tracer.h"
#include "tsl/profiler/lib/profiler_factory.h"
#include "tsl/profiler/lib/profiler_interface.h"
#include "xla/backends/profiler/gpu/rocm_collector.h"
#include "xla/backends/profiler/gpu/rocm_tracer.h"
#include "xla/backends/profiler/gpu/rocm_tracer_utils.h"
#include "xla/debug_options_flags.h"
#include "xla/tsl/platform/env_time.h"
#include "xla/tsl/platform/errors.h"
#include "xla/tsl/profiler/backends/cpu/annotation_stack.h"
#include "tsl/profiler/lib/profiler_factory.h"
#include "tsl/profiler/lib/profiler_interface.h"

namespace xla {
namespace profiler {

using tensorflow::ProfileOptions;
using tsl::profiler::AnnotationStack;
using tsl::profiler::ProfilerInterface;
using tsl::profiler::RegisterProfilerFactory;
using tsl::profiler::XSpace;

// GpuTracer for ROCm GPU.
Expand All @@ -59,7 +62,6 @@ class GpuTracer : public profiler::ProfilerInterface {
absl::Status DoStop();

RocmTracerOptions GetRocmTracerOptions();

RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);

enum State {
Expand All @@ -76,10 +78,9 @@ class GpuTracer : public profiler::ProfilerInterface {
};

RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
// TODO(rocm-profiler): We need support for context similar to CUDA
RocmTracerOptions options;
#if defined(XLA_GPU_ROCM_TRACER_BACKEND) && (XLA_GPU_ROCM_TRACER_BACKEND == 1)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JFYI, you can drop the defined() check and simply write:

#if XLA_GPU_ROCM_TRACER_BACKEND == 1

After all replacements due to macro expansion and evaluations of defined-macro-expressions ... have been performed, all remaining identifiers and keywords, except for true and false, are replaced with the pp-number 0 ....
https://timsong-cpp.github.io/cppwp/n4659/cpp.cond#9

std::vector<uint32_t> empty_vec;

// clang formatting does not preserve one entry per line
// clang-format off
std::vector<uint32_t> hip_api_domain_ops{
Expand Down Expand Up @@ -149,35 +150,42 @@ RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);

options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_OPS, empty_vec);

#else
options.max_annotation_strings = 1024 * 1024;
#endif
return options;
}

// Builds the collector options for one profiling session.
//
// `num_gpus` is stored unchanged in the returned options. The callback-event,
// activity-event and annotation-string capacities all share a single limit
// read from the `xla_gpu_rocm_max_trace_events` debug option: a non-positive
// flag value falls back to 4 * 1024 * 1024, and anything above 1'000'000'000
// is capped at that value.
RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
    uint32_t num_gpus) {
  constexpr int64_t kDefaultMaxTraceEvents = 4 * 1024 * 1024;
  constexpr int64_t kMaxTraceEventsCap = 1'000'000'000LL;

  RocmTraceCollectorOptions options;
  options.num_gpus = num_gpus;

  const auto& dbg = xla::GetDebugOptionsFromFlags();
  int64_t max_events = dbg.xla_gpu_rocm_max_trace_events();
  VLOG(2) << "max number of events to be trace from flag = " << max_events;
  if (max_events <= 0) max_events = kDefaultMaxTraceEvents;
  if (max_events > kMaxTraceEventsCap) max_events = kMaxTraceEventsCap;

  VLOG(3) << "maximum number of events to be traced = " << max_events;

  // Each capacity is assigned exactly once, from the sanitized flag value.
  options.max_callback_api_events = max_events;
  options.max_activity_api_events = max_events;
  options.max_annotation_strings = max_events;
  return options;
}

absl::Status GpuTracer::DoStart() {
if (!rocm_tracer_->IsAvailable()) {
return tsl::errors::Unavailable("Another profile session running.");
}

AnnotationStack::Enable(true);
uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();

RocmTracerOptions tracer_options = GetRocmTracerOptions();
RocmTraceCollectorOptions trace_collector_options =
GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
rocm_trace_collector_ = CreateRocmCollector(
trace_collector_options, start_walltime_ns, start_gputime_ns);

RocmTracerOptions tracer_options = GetRocmTracerOptions();
rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());

return absl::OkStatus();
Expand Down Expand Up @@ -240,13 +248,9 @@ std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
return nullptr;
}

profiler::RocmTracer* rocm_tracer =
profiler::RocmTracer::GetRocmTracerSingleton();
if (!rocm_tracer->IsAvailable()) {
return nullptr;
}

return std::make_unique<profiler::GpuTracer>(rocm_tracer);
auto& rocm_tracer = profiler::RocmTracer::GetRocmTracerSingleton();
if (!rocm_tracer.IsAvailable()) return nullptr;
return std::make_unique<profiler::GpuTracer>(&rocm_tracer);
}

auto register_rocm_gpu_tracer_factory = [] {
Expand Down
Loading