Skip to content

Commit 5676931

Browse files
authored
[Profiler] Make timer_create-based CPU profiler default (#7322)
1 parent 64b64cb commit 5676931

File tree

12 files changed

+155
-42
lines changed

12 files changed

+155
-42
lines changed

profiler/build/CpuWallTime.linux.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"DD_PROFILING_ENABLED": "1",
2727
"DD_PROFILING_CPU_ENABLED": "1",
2828
"DD_INTERNAL_USE_BACKTRACE2": "1",
29+
"DD_INTERNAL_CPU_PROFILER_TYPE": "TimerCreate",
2930
"DD_TRACE_ENABLED" : "0"
3031
}
3132
},
@@ -35,17 +36,19 @@
3536
"DD_CLR_ENABLE_NGEN": "true",
3637
"DD_PROFILING_ENABLED": "1",
3738
"DD_PROFILING_CPU_ENABLED": "1",
39+
"DD_INTERNAL_CPU_PROFILER_TYPE": "TimerCreate",
3840
"DD_INTERNAL_USE_BACKTRACE2": "0",
3941
"DD_TRACE_ENABLED" : "0"
4042
}
4143
},
4244
{
43-
"name": "Profiler_cpu_walltime_timer_create",
45+
"name": "Profiler_cpu_walltime_manual",
4446
"environmentVariables": {
4547
"DD_CLR_ENABLE_NGEN": "true",
4648
"DD_PROFILING_ENABLED": "1",
4749
"DD_PROFILING_CPU_ENABLED": "1",
48-
"DD_INTERNAL_CPU_PROFILER_TYPE": "TimerCreate",
50+
"DD_INTERNAL_USE_BACKTRACE2": "1",
51+
"DD_INTERNAL_CPU_PROFILER_TYPE": "ManualCpuTime",
4952
"DD_TRACE_ENABLED" : "0"
5053
}
5154
}

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/ProfilerSignalManager.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,22 @@ bool ProfilerSignalManager::UnRegisterHandler()
9393
return true;
9494
}
9595

96+
bool ProfilerSignalManager::IgnoreSignal() {
97+
struct sigaction sampleAction;
98+
sampleAction.sa_handler = SIG_IGN;
99+
sigemptyset(&sampleAction.sa_mask);
100+
sampleAction.sa_flags = 0;
101+
102+
int32_t result = sigaction(_signalToSend, &sampleAction, nullptr);
103+
if (result != 0)
104+
{
105+
Log::Error("ProfilerSignalManager::IgnoreSignal: Failed mark ", strsignal(_signalToSend), " as ignored. Reason: ",
106+
strerror(errno), ".");
107+
return false;
108+
}
109+
return true;
110+
}
111+
96112
void ProfilerSignalManager::SetSignal(int32_t signal)
97113
{
98114
_signalToSend = signal;

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/ProfilerSignalManager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class ProfilerSignalManager
1818

1919
bool RegisterHandler(HandlerFn_t handler);
2020
bool UnRegisterHandler();
21+
bool IgnoreSignal();
2122
int32_t SendSignal(pid_t threadId);
2223
bool CheckSignalHandler();
2324
bool IsHandlerInPlace() const;

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/TimerCreateCpuProfiler.cpp

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#include <ucontext.h>
1818
#include <unistd.h>
1919

20-
TimerCreateCpuProfiler* TimerCreateCpuProfiler::Instance = nullptr;
20+
std::atomic<TimerCreateCpuProfiler*> TimerCreateCpuProfiler::Instance = nullptr;
2121

2222
TimerCreateCpuProfiler::TimerCreateCpuProfiler(
2323
IConfiguration* pConfiguration,
@@ -29,7 +29,8 @@ TimerCreateCpuProfiler::TimerCreateCpuProfiler(
2929
_pSignalManager{pSignalManager}, // put it as parameter for better testing
3030
_pManagedThreadsList{pManagedThreadsList},
3131
_pProvider{pProvider},
32-
_samplingInterval{pConfiguration->GetCpuProfilingInterval()}
32+
_samplingInterval{pConfiguration->GetCpuProfilingInterval()},
33+
_nbThreadsInSignalHandler{0}
3334
{
3435
Log::Info("Cpu profiling interval: ", _samplingInterval.count(), "ms");
3536
Log::Info("timer_create Cpu profiler is enabled");
@@ -91,6 +92,15 @@ bool TimerCreateCpuProfiler::StartImpl()
9192

9293
bool TimerCreateCpuProfiler::StopImpl()
9394
{
95+
Instance = nullptr;
96+
// we cannot unregister. We would replace the current action by the default one
97+
// which will cause the termination of the process
98+
// Instead, mark SIGPROF as ignored.
99+
if (!_pSignalManager->IgnoreSignal())
100+
{
101+
Log::Warn("Failed to mark the signal SIGPROF as ignored.");
102+
}
103+
94104
{
95105
std::unique_lock lock(_registerLock);
96106

@@ -99,15 +109,27 @@ bool TimerCreateCpuProfiler::StopImpl()
99109
_pManagedThreadsList->ForEach([this](ManagedThreadInfo* thread) { UnregisterThreadImpl(thread); });
100110
}
101111

102-
Instance = nullptr;
103-
_pSignalManager->UnRegisterHandler();
112+
std::uint64_t nbThreadsInSignalHandler = _nbThreadsInSignalHandler;
113+
if (nbThreadsInSignalHandler != 0)
114+
{
115+
Log::Info("Waiting for all threads exiting the signal handler (#threads ", _nbThreadsInSignalHandler, ")");
116+
117+
// TODO: for now we sleep.
118+
std::this_thread::sleep_for(500ms);
119+
if (_nbThreadsInSignalHandler != 0)
120+
{
121+
Log::Warn("There are threads that are still executing the signal handler: ", _nbThreadsInSignalHandler);
122+
return false;
123+
}
124+
Log::Info("All threads exited the signal handler");
125+
}
104126

105127
return true;
106128
}
107129

108130
bool TimerCreateCpuProfiler::CollectStackSampleSignalHandler(int sig, siginfo_t* info, void* ucontext)
109131
{
110-
auto instance = Instance;
132+
auto instance = Instance.load();
111133
if (instance == nullptr)
112134
{
113135
return false;
@@ -122,7 +144,6 @@ extern "C" unsigned long long dd_inside_wrapped_functions() __attribute__((weak)
122144

123145
bool TimerCreateCpuProfiler::CanCollect(void* ctx)
124146
{
125-
// TODO (in another PR): add metrics about reasons we could not collect
126147
if (dd_inside_wrapped_functions != nullptr && dd_inside_wrapped_functions() != 0)
127148
{
128149
_discardMetrics->Incr<DiscardReason::InsideWrappedFunction>();
@@ -189,12 +210,14 @@ struct StackWalkLock
189210

190211
bool TimerCreateCpuProfiler::Collect(void* ctx)
191212
{
213+
_nbThreadsInSignalHandler++;
192214
_totalSampling->Incr();
193215

194216
auto threadInfo = ManagedThreadInfo::CurrentThreadInfo;
195217
if (threadInfo == nullptr)
196218
{
197219
_discardMetrics->Incr<DiscardReason::UnknownThread>();
220+
_nbThreadsInSignalHandler--;
198221
// Ooops should never happen
199222
return false;
200223
}
@@ -203,11 +226,13 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
203226
if (!l.IsLockAcquired())
204227
{
205228
_discardMetrics->Incr<DiscardReason::FailedAcquiringLock>();
229+
_nbThreadsInSignalHandler--;
206230
return false;
207231
}
208232

209233
if (!CanCollect(ctx))
210234
{
235+
_nbThreadsInSignalHandler--;
211236
return false;
212237
}
213238

@@ -217,6 +242,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
217242
auto rawCpuSample = _pProvider->GetRawSample();
218243
if (!rawCpuSample)
219244
{
245+
_nbThreadsInSignalHandler--;
220246
return false;
221247
}
222248

@@ -229,6 +255,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
229255
{
230256
rawCpuSample.Discard();
231257
_discardMetrics->Incr<DiscardReason::EmptyBacktrace>();
258+
_nbThreadsInSignalHandler--;
232259
return false;
233260
}
234261

@@ -240,6 +267,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
240267
rawCpuSample->AppDomainId = threadInfo->GetAppDomainId();
241268
rawCpuSample->ThreadInfo = std::move(threadInfo);
242269
rawCpuSample->Duration = _samplingInterval;
270+
_nbThreadsInSignalHandler--;
243271
return true;
244272
}
245273

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/TimerCreateCpuProfiler.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class TimerCreateCpuProfiler : public ServiceBase
4242

4343
private:
4444
static bool CollectStackSampleSignalHandler(int sig, siginfo_t* info, void* ucontext);
45-
static TimerCreateCpuProfiler* Instance;
45+
static std::atomic<TimerCreateCpuProfiler*> Instance;
4646

4747
bool CanCollect(void* context);
4848
bool Collect(void* ucontext);
@@ -59,4 +59,5 @@ class TimerCreateCpuProfiler : public ServiceBase
5959
std::shared_mutex _registerLock;
6060
std::shared_ptr<CounterMetric> _totalSampling;
6161
std::shared_ptr<DiscardMetrics> _discardMetrics;
62+
std::atomic<std::uint64_t> _nbThreadsInSignalHandler;
6263
};

profiler/src/ProfilerEngine/Datadog.Profiler.Native/Configuration.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ std::string const Configuration::DefaultEmptyString = "";
2626
std::chrono::seconds const Configuration::DefaultDevUploadInterval = 20s;
2727
std::chrono::seconds const Configuration::DefaultProdUploadInterval = 60s;
2828
std::chrono::milliseconds const Configuration::DefaultCpuProfilingInterval = 9ms;
29+
CpuProfilerType const Configuration::DefaultCpuProfilerType =
30+
#ifdef _WINDOWS
31+
CpuProfilerType::ManualCpuTime;
32+
#else
33+
CpuProfilerType::TimerCreate;
34+
#endif
2935

3036
Configuration::Configuration()
3137
{
@@ -107,7 +113,7 @@ Configuration::Configuration()
107113
);
108114
_httpRequestDurationThreshold = ExtractHttpRequestDurationThreshold();
109115
_forceHttpSampling = GetEnvironmentValue(EnvironmentVariables::ForceHttpSampling, false);
110-
_cpuProfilerType = GetEnvironmentValue(EnvironmentVariables::CpuProfilerType, CpuProfilerType::ManualCpuTime);
116+
_cpuProfilerType = GetEnvironmentValue(EnvironmentVariables::CpuProfilerType, DefaultCpuProfilerType);
111117
_isWaitHandleProfilingEnabled = GetEnvironmentValue(EnvironmentVariables::WaitHandleProfilingEnabled, false);
112118
}
113119

profiler/src/ProfilerEngine/Datadog.Profiler.Native/Configuration.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ class Configuration final : public IConfiguration
120120
static std::chrono::seconds const DefaultDevUploadInterval;
121121
static std::chrono::seconds const DefaultProdUploadInterval;
122122
static std::chrono::milliseconds const DefaultCpuProfilingInterval;
123+
static CpuProfilerType const DefaultCpuProfilerType;
123124

124125
bool _isProfilingEnabled;
125126
bool _isCpuProfilingEnabled;

profiler/src/ProfilerEngine/Datadog.Profiler.Native/CorProfilerCallback.cpp

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,30 @@
7272

7373
#include <cmath>
7474

75+
void LogServiceStart(bool success, const char* name )
76+
{
77+
if (success)
78+
{
79+
Log::Info(name, " started successfully.");
80+
}
81+
else
82+
{
83+
Log::Info(name, " failed to start. This service might have been started earlier.");
84+
}
85+
}
86+
87+
void LogServiceStop(bool success, const char* name )
88+
{
89+
if (success)
90+
{
91+
Log::Info(name, " stopped successfully.");
92+
}
93+
else
94+
{
95+
Log::Info(name, " failed to stopped. This service might have been stopped earlier.");
96+
}
97+
}
98+
7599
IClrLifetime* CorProfilerCallback::GetClrLifetime() const
76100
{
77101
return _pClrLifetime.get();
@@ -520,7 +544,9 @@ void CorProfilerCallback::InitializeServices()
520544
#ifdef LINUX
521545
if (_pConfiguration->IsCpuProfilingEnabled() && _pConfiguration->GetCpuProfilerType() == CpuProfilerType::TimerCreate)
522546
{
523-
_pCpuProfiler = RegisterService<TimerCreateCpuProfiler>(
547+
// Other alternative in case of crash-at-shutdown, do not register it as a service
548+
// we will have to start it by hand (already stopped by hand)
549+
_pCpuProfiler = std::make_unique<TimerCreateCpuProfiler>(
524550
_pConfiguration.get(),
525551
ProfilerSignalManager::Get(SIGPROF),
526552
_pManagedThreadList,
@@ -714,17 +740,22 @@ bool CorProfilerCallback::StartServices()
714740
{
715741
auto name = service->GetName();
716742
success = service->Start();
717-
if (success)
718-
{
719-
Log::Info(name, " started successfully.");
720-
}
721-
else
722-
{
723-
Log::Info(name, " failed to start. This service might have been started earlier.");
724-
}
743+
LogServiceStart(success, name);
725744
result &= success;
726745
}
727746

747+
#ifdef LINUX
748+
// We cannot add the timer_create-based CPU profiler to the _services list
749+
// we have to control the Stop
750+
// If we fail to stop, we mustn't release the memory associated to it
751+
// otherwise we might crash.
752+
if (_pCpuProfiler != nullptr)
753+
{
754+
auto success = _pCpuProfiler->Start();
755+
LogServiceStart(success, _pCpuProfiler->GetName());
756+
}
757+
#endif
758+
728759
return result;
729760
}
730761

@@ -759,14 +790,7 @@ bool CorProfilerCallback::StopServices()
759790
const auto& service = _services[i - 1];
760791
const auto* name = service->GetName();
761792
success = service->Stop();
762-
if (success)
763-
{
764-
Log::Info(name, " stopped successfully.");
765-
}
766-
else
767-
{
768-
Log::Info(name, " failed to stop. This service might have been stopped earlier.");
769-
}
793+
LogServiceStop(success, name);
770794
result &= success;
771795
}
772796

@@ -1513,7 +1537,16 @@ HRESULT STDMETHODCALLTYPE CorProfilerCallback::Shutdown()
15131537
#ifdef LINUX
15141538
if (_pCpuProfiler != nullptr)
15151539
{
1516-
_pCpuProfiler->Stop();
1540+
// if we failed at stopping the time_create-based CPU profiler,
1541+
// it's safer to not release the memory.
1542+
// Otherwise, we might crash the application.
1543+
// Reason: one thread could be executing the signal handler and accessing some field
1544+
auto stopped = _pCpuProfiler->Stop();
1545+
LogServiceStop(stopped, _pCpuProfiler->GetName());
1546+
if (stopped)
1547+
{
1548+
_pCpuProfiler = nullptr;
1549+
}
15171550
}
15181551
#endif
15191552

@@ -1776,7 +1809,7 @@ void CorProfilerCallback::OnThreadRoutineFinished()
17761809
return;
17771810
}
17781811

1779-
auto* cpuProfiler = myThis->_pCpuProfiler;
1812+
auto* cpuProfiler = myThis->_pCpuProfiler.get();
17801813
if (cpuProfiler == nullptr)
17811814
{
17821815
return;

profiler/src/ProfilerEngine/Datadog.Profiler.Native/CorProfilerCallback.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,11 @@
3434
#include "ProxyMetric.h"
3535
#include "IAllocationsRecorder.h"
3636
#include "IMetadataProvider.h"
37-
#include "ThreadLifetimeProvider.h"
3837
#include "shared/src/native-src/string.h"
38+
#include "ThreadLifetimeProvider.h"
39+
#ifdef LINUX
40+
#include "TimerCreateCpuProfiler.h"
41+
#endif
3942
#include "IEtwEventsManager.h"
4043
#include "ISsiLifetime.h"
4144

@@ -54,7 +57,6 @@ class IConfiguration;
5457
class IExporter;
5558
class RawSampleTransformer;
5659
class RuntimeIdStore;
57-
class TimerCreateCpuProfiler;
5860
class CpuSampleProvider;
5961
class NetworkProvider;
6062

@@ -254,7 +256,7 @@ private :
254256
RuntimeIdStore* _pRuntimeIdStore = nullptr;
255257
#ifdef LINUX
256258
SystemCallsShield* _systemCallsShield = nullptr;
257-
TimerCreateCpuProfiler* _pCpuProfiler = nullptr;
259+
std::unique_ptr<TimerCreateCpuProfiler> _pCpuProfiler = nullptr;
258260
CpuSampleProvider* _pCpuSampleProvider = nullptr;
259261
#endif
260262

profiler/test/Datadog.Profiler.IntegrationTests/LinuxOnly/SignalHandlerTest.cs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,14 @@ public void CheckSignalHandlerIsInstalledOnce(string appName, string framework,
4747
var logFile = Directory.GetFiles(runner.Environment.LogDir)
4848
.Single(f => Path.GetFileName(f).StartsWith("DD-DotNet-Profiler-Native-"));
4949

50-
var nbSignalHandlerInstallation = File.ReadLines(logFile)
51-
.Count(l => l.Contains("Successfully setup signal handler for"));
52-
53-
nbSignalHandlerInstallation.Should().Be(1);
50+
51+
File.ReadLines(logFile)
52+
.Count(l => l.Contains("Successfully setup signal handler for User defined signal 1 signal"))
53+
.Should().Be(1);
54+
55+
File.ReadLines(logFile)
56+
.Count(l => l.Contains("Successfully setup signal handler for Profiling timer expired signal"))
57+
.Should().Be(1);
5458
}
5559
}
5660
}

0 commit comments

Comments
 (0)