Skip to content

Commit 55dae96

Browse files
[AutoPGLE] Prevent AutoPGLE from running if the user has launched an external profiler.
PiperOrigin-RevId: 739109278
1 parent 821715b commit 55dae96

File tree

9 files changed

+26
-10
lines changed

9 files changed

+26
-10
lines changed

third_party/tsl/tsl/profiler/lib/profiler_session.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,13 @@ ProfilerSession::ProfilerSession(const ProfileOptions& options)
121121
DCHECK(profiler_lock_.Active());
122122
profilers_ = std::make_unique<tsl::profiler::ProfilerCollection>(
123123
profiler::CreateProfilers(options_));
124-
profilers_->Start().IgnoreError();
124+
125+
absl::Status status = profilers_->Start();
126+
if (options_.ignore_start_error()) {
127+
status.IgnoreError();
128+
} else {
129+
status_ = status;
130+
}
125131
#endif
126132
}
127133

third_party/tsl/tsl/profiler/protobuf/profiler_options.proto

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ syntax = "proto3";
22

33
package tensorflow;
44

5-
// Next ID: 13
5+
// Next ID: 14
66
message ProfileOptions {
77
// Some option defaults differ from the proto3 defaults. Use this version
88
// to determine if we should use default option value instead of proto3
@@ -94,6 +94,8 @@ message ProfileOptions {
9494
// }
9595
// }
9696
map<string, AdvancedConfigValue> advanced_configuration = 12;
97+
98+
bool ignore_start_error = 13;
9799
}
98100

99101
// Options for remote profiler session manager.

xla/backends/gpu/codegen/triton/kernel_name_tracer_cuda.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ void KernelNameTracerCuda::start() {
5454
collector_options, start_walltime_ns, start_gputime_ns);
5555
profiler::CuptiTracerOptions options;
5656
options.activities_selected = {CUPTI_ACTIVITY_KIND_KERNEL};
57-
cupti_tracer_->Enable(options, cupti_collector_.get());
57+
cupti_tracer_->Enable(options, cupti_collector_.get()).IgnoreError();
5858
}
5959

6060
std::vector<std::string> KernelNameTracerCuda::stop() {

xla/backends/profiler/gpu/cupti_error_manager_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ class CuptiErrorManagerTest : public ::testing::Test {
8080
}
8181

8282
void EnableProfiling(const CuptiTracerOptions& option) {
83-
cupti_tracer_->Enable(option, cupti_collector_.get());
83+
cupti_tracer_->Enable(option, cupti_collector_.get()).IgnoreError();
8484
}
8585

8686
void DisableProfiling() { cupti_tracer_->Disable(); }

xla/backends/profiler/gpu/cupti_tracer.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,8 +1035,8 @@ int CuptiTracer::NumGpus() {
10351035
return num_gpus;
10361036
}
10371037

1038-
void CuptiTracer::Enable(const CuptiTracerOptions &option,
1039-
CuptiTraceCollector *collector) {
1038+
absl::Status CuptiTracer::Enable(const CuptiTracerOptions &option,
1039+
CuptiTraceCollector *collector) {
10401040
option_ = option;
10411041
collector_ = collector;
10421042

@@ -1058,10 +1058,13 @@ void CuptiTracer::Enable(const CuptiTracerOptions &option,
10581058

10591059
absl::Status status = EnableApiTracing();
10601060
need_root_access_ |= status.code() == tsl::error::PERMISSION_DENIED;
1061-
if (!status.ok()) return;
1061+
if (!status.ok()) {
1062+
return status;
1063+
}
10621064

10631065
EnableActivityTracing().IgnoreError();
10641066
tsl::profiler::AnnotationStack::Enable(true);
1067+
return status;
10651068
}
10661069

10671070
void CuptiTracer::Disable() {

xla/backends/profiler/gpu/cupti_tracer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ class CuptiTracer {
8181
bool IsAvailable() const;
8282
bool NeedRootAccess() const { return need_root_access_; }
8383

84-
void Enable(const CuptiTracerOptions& option, CuptiTraceCollector* collector);
84+
absl::Status Enable(const CuptiTracerOptions& option,
85+
CuptiTraceCollector* collector);
8586
void Disable();
8687

8788
// Control threads could periodically call this function to flush the

xla/backends/profiler/gpu/device_tracer_cuda.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ absl::Status GpuTracer::DoStart() {
152152
cupti_collector_ = CreateCuptiCollector(collector_options, start_walltime_ns,
153153
start_gputime_ns);
154154

155-
cupti_tracer_->Enable(options_, cupti_collector_.get());
155+
cupti_tracer_->Enable(options_, cupti_collector_.get()).IgnoreError();
156156
return absl::OkStatus();
157157
}
158158

xla/service/gpu/model/hlo_op_profiler.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class CuptiKernelTracer : public HloOpProfiler::KernelTracer,
6868
// Not interested in API callbacks, but empty list enables them all.
6969
CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject);
7070
options.activities_selected.push_back(CUPTI_ACTIVITY_KIND_KERNEL);
71-
cupti_tracer_->Enable(options, this);
71+
cupti_tracer_->Enable(options, this).IgnoreError();
7272
}
7373

7474
uint64_t getMedianKernelTimeNs() && override {

xla/tsl/profiler/utils/session_manager.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ RemoteProfilerSessionManagerOptions GetRemoteSessionManagerOptionsLocked(
108108
int value = std::get<int>(kw.second);
109109
options.set_delay_ms(value);
110110
VLOG(1) << "delay_ms was set to " << value;
111+
} else if (key == "ignore_start_error") {
112+
int value = std::get<int>(kw.second);
113+
options.mutable_profiler_options()->set_ignore_start_error(value);
114+
VLOG(1) << "ignore_start_error was set to " << value;
111115
} else {
112116
LOG(WARNING) << "Unrecognised key: " << key;
113117
}

0 commit comments

Comments
 (0)