Skip to content

Commit 52a5770

Browse files
authored
fix(profiling): make stack_v2 and ddup more robust against failure [backport #8471 to 2.7] (#8636)
Backport of #8615 to 2.7. This fixes a few defects in how the ddup and stack_v2 native extensions permitted failure. It also modifies the cmake invocation in setup.py to use the one provided by the `cmake` module we already depended upon. I thought the old way would have given it to us, but the 2.7.1 release revealed that the profiling-related native extensions had failed to build. ## Checklist - [x] Change(s) are motivated and described in the PR description - [x] Testing strategy is described if automated tests are not included in the PR - [x] Risks are described (performance impact, potential for breakage, maintainability) - [x] Change is maintainable (easy to change, telemetry, documentation) - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed or label `changelog/no-changelog` is set - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)) - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) - [x] If this PR changes the public interface, I've notified `@DataDog/apm-tees`. - [x] If change touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`. 
## Reviewer Checklist - [x] Title is accurate - [x] All changes are related to the pull request's stated goal - [x] Description motivates each change - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - [x] Testing strategy adequately addresses listed risks - [x] Change is maintainable (easy to change, telemetry, documentation) - [x] Release note makes sense to a user of the library - [x] Author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Co-authored-by: sanchda <[email protected]>
1 parent 8cb578e commit 52a5770

File tree

9 files changed

+122
-99
lines changed

9 files changed

+122
-99
lines changed

.github/workflows/build_deploy.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ jobs:
7878

7979
- name: Build sdist
8080
run: |
81-
pip install cython
81+
pip install cython cmake
8282
python setup.py sdist
8383
- uses: actions/upload-artifact@v3
8484
with:

.github/workflows/build_python_3.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ jobs:
5656
CMAKE_BUILD_PARALLEL_LEVEL: 12
5757
CIBW_REPAIR_WHEEL_COMMAND_LINUX: |
5858
mkdir ./tempwheelhouse &&
59+
unzip -l {wheel} | grep '\.so' &&
5960
auditwheel repair -w ./tempwheelhouse {wheel} &&
6061
(yum install -y zip || apk add zip) &&
6162
for w in ./tempwheelhouse/*.whl; do
Lines changed: 70 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,99 @@
1+
from typing import Dict # noqa:F401
2+
from typing import Optional # noqa:F401
3+
14
from .utils import sanitize_string # noqa: F401
25

36

4-
try:
5-
from ._ddup import * # noqa: F403, F401
7+
is_available = False
68

7-
is_available = True
89

9-
except Exception as e:
10-
from typing import Dict # noqa:F401
11-
from typing import Optional # noqa:F401
10+
def not_implemented(func):
11+
def wrapper(*args, **kwargs):
12+
raise NotImplementedError("{} is not implemented on this platform".format(func.__name__))
1213

13-
from ddtrace.internal.logger import get_logger
1414

15-
LOG = get_logger(__name__)
16-
LOG.debug("Failed to import _ddup: %s", e)
15+
@not_implemented
16+
def init(
17+
env, # type: Optional[str]
18+
service, # type: Optional[str]
19+
version, # type: Optional[str]
20+
tags, # type: Optional[Dict[str, str]]
21+
max_nframes, # type: Optional[int]
22+
url, # type: Optional[str]
23+
):
24+
pass
25+
1726

18-
is_available = False
27+
@not_implemented
28+
def upload(): # type: () -> None
29+
pass
1930

20-
# Decorator for not-implemented
21-
def not_implemented(func):
22-
def wrapper(*args, **kwargs):
23-
raise NotImplementedError("{} is not implemented on this platform".format(func.__name__))
2431

32+
class SampleHandle:
2533
@not_implemented
26-
def init(
27-
env, # type: Optional[str]
28-
service, # type: Optional[str]
29-
version, # type: Optional[str]
30-
tags, # type: Optional[Dict[str, str]]
31-
max_nframes, # type: Optional[int]
32-
url, # type: Optional[str]
33-
):
34+
def push_cputime(self, value, count): # type: (int, int) -> None
3435
pass
3536

3637
@not_implemented
37-
def upload(): # type: () -> None
38+
def push_walltime(self, value, count): # type: (int, int) -> None
3839
pass
3940

40-
class SampleHandle:
41-
@not_implemented
42-
def push_cputime(self, value, count): # type: (int, int) -> None
43-
pass
41+
@not_implemented
42+
def push_acquire(self, value, count): # type: (int, int) -> None
43+
pass
4444

45-
@not_implemented
46-
def push_walltime(self, value, count): # type: (int, int) -> None
47-
pass
45+
@not_implemented
46+
def push_release(self, value, count): # type: (int, int) -> None
47+
pass
4848

49-
@not_implemented
50-
def push_acquire(self, value, count): # type: (int, int) -> None
51-
pass
49+
@not_implemented
50+
def push_alloc(self, value, count): # type: (int, int) -> None
51+
pass
5252

53-
@not_implemented
54-
def push_release(self, value, count): # type: (int, int) -> None
55-
pass
53+
@not_implemented
54+
def push_heap(self, value): # type: (int) -> None
55+
pass
5656

57-
@not_implemented
58-
def push_alloc(self, value, count): # type: (int, int) -> None
59-
pass
57+
@not_implemented
58+
def push_lock_name(self, lock_name): # type: (str) -> None
59+
pass
6060

61-
@not_implemented
62-
def push_heap(self, value): # type: (int) -> None
63-
pass
61+
@not_implemented
62+
def push_frame(self, name, filename, address, line): # type: (str, str, int, int) -> None
63+
pass
6464

65-
@not_implemented
66-
def push_lock_name(self, lock_name): # type: (str) -> None
67-
pass
65+
@not_implemented
66+
def push_threadinfo(self, thread_id, thread_native_id, thread_name): # type: (int, int, Optional[str]) -> None
67+
pass
6868

69-
@not_implemented
70-
def push_frame(self, name, filename, address, line): # type: (str, str, int, int) -> None
71-
pass
69+
@not_implemented
70+
def push_taskinfo(self, task_id, task_name): # type: (int, str) -> None
71+
pass
7272

73-
@not_implemented
74-
def push_threadinfo(self, thread_id, thread_native_id, thread_name): # type: (int, int, Optional[str]) -> None
75-
pass
73+
@not_implemented
74+
def push_exceptioninfo(self, exc_type, count): # type: (type, int) -> None
75+
pass
7676

77-
@not_implemented
78-
def push_taskinfo(self, task_id, task_name): # type: (int, str) -> None
79-
pass
77+
@not_implemented
78+
def push_class_name(self, class_name): # type: (str) -> None
79+
pass
8080

81-
@not_implemented
82-
def push_exceptioninfo(self, exc_type, count): # type: (type, int) -> None
83-
pass
81+
@not_implemented
82+
def push_span(self, span, endpoint_collection_enabled): # type: (Optional[Span], bool) -> None
83+
pass
8484

85-
@not_implemented
86-
def push_class_name(self, class_name): # type: (str) -> None
87-
pass
85+
@not_implemented
86+
def flush_sample(self): # type: () -> None
87+
pass
8888

89-
@not_implemented
90-
def push_span(self, span, endpoint_collection_enabled): # type: (Optional[Span], bool) -> None
91-
pass
9289

93-
@not_implemented
94-
def flush_sample(self): # type: () -> None
95-
pass
90+
try:
91+
from ._ddup import * # noqa: F403, F401
92+
93+
is_available = True
94+
95+
except Exception as e:
96+
from ddtrace.internal.logger import get_logger
97+
98+
LOG = get_logger(__name__)
99+
LOG.debug("Failed to import _ddup: %s", e)
Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,27 @@
1+
is_available = False
2+
3+
4+
# Decorator for not-implemented
5+
def not_implemented(func):
6+
def wrapper(*args, **kwargs):
7+
raise NotImplementedError("{} is not implemented on this platform".format(func.__name__))
8+
9+
10+
@not_implemented
11+
def start(*args, **kwargs):
12+
pass
13+
14+
15+
@not_implemented
16+
def stop(*args, **kwargs):
17+
pass
18+
19+
20+
@not_implemented
21+
def set_interval(*args, **kwargs):
22+
pass
23+
24+
125
try:
226
from ._stack_v2 import * # noqa: F401, F403
327

@@ -7,23 +31,5 @@
731
from ddtrace.internal.logger import get_logger
832

933
LOG = get_logger(__name__)
10-
LOG.debug("Failed to import _stack_v2: %s", e)
11-
12-
is_available = False
1334

14-
# Decorator for not-implemented
15-
def not_implemented(func):
16-
def wrapper(*args, **kwargs):
17-
raise NotImplementedError("{} is not implemented on this platform".format(func.__name__))
18-
19-
@not_implemented
20-
def start(*args, **kwargs):
21-
pass
22-
23-
@not_implemented
24-
def stop(*args, **kwargs):
25-
pass
26-
27-
@not_implemented
28-
def set_interval(*args, **kwargs):
29-
pass
35+
LOG.debug("Failed to import _stack_v2: %s", e)

ddtrace/internal/datadog/profiling/stack_v2/src/stack_v2.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,12 @@
22
#include "python_headers.hpp"
33
#include "sampler.hpp"
44

5-
#include <iostream>
6-
75
using namespace Datadog;
86

97
static PyObject*
108
_stack_v2_start(PyObject* self, PyObject* args, PyObject* kwargs)
119
{
1210
(void)self;
13-
std::cout << "Starting the sampler" << std::endl;
1411
static const char* const_kwlist[] = { "min_interval", NULL };
1512
static char** kwlist = const_cast<char**>(const_kwlist);
1613
double min_interval_s = g_default_sampling_period_s;
@@ -21,9 +18,7 @@ _stack_v2_start(PyObject* self, PyObject* args, PyObject* kwargs)
2118

2219
Sampler::get().set_interval(min_interval_s);
2320
Sampler::get().start();
24-
25-
Py_INCREF(Py_None);
26-
return Py_None;
21+
Py_RETURN_NONE;
2722
}
2823

2924
// Bypasses the old-style cast warning with an unchecked helper function
@@ -35,7 +30,7 @@ stack_v2_stop(PyObject* self, PyObject* args)
3530
(void)self;
3631
(void)args;
3732
Sampler::get().stop();
38-
Py_RETURN_NONE
33+
Py_RETURN_NONE;
3934
}
4035

4136
static PyObject*

ddtrace/profiling/collector/memalloc.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ def _start_service(self):
7171
if _memalloc is None:
7272
raise collector.CollectorUnavailable
7373

74+
if self._export_libdd_enabled and not ddup.is_available:
75+
self._export_libdd_enabled = False
76+
7477
try:
7578
_memalloc.start(self.max_nframe, self._max_events, self.heap_sample_size)
7679
except RuntimeError:

ddtrace/profiling/collector/stack.pyx

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -488,20 +488,23 @@ class StackCollector(collector.PeriodicCollector):
488488
# If libdd is enabled, propagate the configuration
489489
if config.export.libdd_enabled:
490490
if not ddup.is_available:
491-
# We probably already told the user about this in profiler.py, but let's do it again here.
492-
LOG.error("Failed to load the libdd collector from stack.pyx; falling back to the legacy collector")
491+
# We don't report on this, since it's already been reported in profiler.py
493492
set_use_libdd(False)
493+
494+
# If the user had also set stack.v2.enabled, then we need to disable that as well.
495+
if self._stack_collector_v2_enabled:
496+
self._stack_collector_v2_enabled = False
497+
LOG.error("Stack v2 was requested, but the libdd collector could not be enabled. Falling back to the v1 stack sampler.")
494498
else:
495499
set_use_libdd(True)
496500

497501
# If stack v2 is requested, verify it is loaded properly and that libdd has been enabled.
498502
if self._stack_collector_v2_enabled:
499503
if not stack_v2.is_available:
500504
self._stack_collector_v2_enabled = False
501-
LOG.error("Failed to load the v2 stack sampler; falling back to the v1 stack sampler")
505+
LOG.error("Stack v2 was requested, but it could not be enabled. Check debug logs for more information.")
502506
if not use_libdd:
503507
self._stack_collector_v2_enabled = False
504-
LOG.error("libdd collector not enabled; falling back to the v1 stack sampler. Did you set DD_PROFILING_EXPORT_LIBDD_ENABLED=true?")
505508

506509
# If at the end of things, stack v2 is still enabled, then start the native thread running the v2 sampler
507510
if self._stack_collector_v2_enabled:

ddtrace/profiling/scheduler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ def __attrs_post_init__(self):
3333
def _start_service(self):
3434
# type: (...) -> None
3535
"""Start the scheduler."""
36+
if self._export_libdd_enabled and not ddup.is_available:
37+
self._export_libdd_enabled = False
3638
LOG.debug("Starting scheduler")
3739
super(Scheduler, self)._start_service()
3840
self._last_export = compat.time_ns()

setup.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import sysconfig
99
import tarfile
1010

11+
import cmake
12+
1113

1214
from setuptools import Extension, find_packages, setup # isort: skip
1315
from setuptools.command.build_ext import build_ext # isort: skip
@@ -38,6 +40,9 @@
3840

3941
DEBUG_COMPILE = "DD_COMPILE_DEBUG" in os.environ
4042

43+
# stack_v2 profiling extensions are optional, unless they are made explicitly required by this environment variable
44+
STACK_V2_REQUIRED = "DD_STACK_V2_REQUIRED" in os.environ
45+
4146
IS_PYSTON = hasattr(sys, "pyston_version_info")
4247

4348
LIBDDWAF_DOWNLOAD_DIR = HERE / "ddtrace" / "appsec" / "_ddwaf" / "libddwaf"
@@ -338,7 +343,9 @@ def build_extension_cmake(self, ext):
338343
"-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs)),
339344
]
340345

341-
cmake_command = os.environ.get("CMAKE_COMMAND", "cmake")
346+
cmake_command = (
347+
Path(cmake.CMAKE_BIN_DIR) / "cmake"
348+
).resolve() # explicitly use the cmake provided by the cmake package
342349
subprocess.run([cmake_command, *cmake_args], cwd=cmake_build_dir, check=True)
343350
subprocess.run([cmake_command, "--build", ".", *build_args], cwd=cmake_build_dir, check=True)
344351
subprocess.run([cmake_command, "--install", ".", *install_args], cwd=cmake_build_dir, check=True)
@@ -465,6 +472,7 @@ def get_exts_for(name):
465472
"-DPY_MINOR_VERSION={}".format(sys.version_info.minor),
466473
"-DPY_MICRO_VERSION={}".format(sys.version_info.micro),
467474
],
475+
optional=not STACK_V2_REQUIRED,
468476
)
469477
)
470478

@@ -474,7 +482,8 @@ def get_exts_for(name):
474482
CMakeExtension(
475483
"ddtrace.internal.datadog.profiling.stack_v2._stack_v2",
476484
source_dir=STACK_V2_DIR,
477-
)
485+
optional=not STACK_V2_REQUIRED,
486+
),
478487
)
479488

480489
else:

0 commit comments

Comments
 (0)