Skip to content

Commit 00a6e2f

Browse files
mabdinur and duncanista authored
fix(aws_lambda): fix infinite recursion when wrapping signals [backport #5314 to 1.9] (#5348)
Backports: #5314

## Checklist

- [x] Change(s) are motivated and described in the PR description.
- [x] Testing strategy is described if automated tests are not included in the PR.
- [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines) are followed.
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)).
- [x] Author is aware of the performance implications of this PR as reported in the benchmarks PR comment.

## Reviewer Checklist

- [ ] Title is accurate.
- [ ] No unnecessary changes are introduced.
- [ ] Description motivates each change.
- [ ] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary.
- [ ] Testing strategy adequately addresses listed risk(s).
- [ ] Change is maintainable (easy to change, telemetry, documentation).
- [ ] Release note makes sense to a user of the library.
- [ ] Reviewer is aware of, and discussed the performance implications of this PR as reported in the benchmarks PR comment.

Co-authored-by: jordan gonzález <[email protected]>
1 parent 6760ac3 commit 00a6e2f

File tree

3 files changed, with 96 additions and 73 deletions.

ddtrace/contrib/aws_lambda/patch.py

Lines changed: 91 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -15,81 +15,100 @@
1515
log = get_logger(__name__)
1616

1717

18-
def _crash_flush(_, __):
19-
"""
20-
Tags the current root span with an Impending Timeout error.
21-
Finishes spans with ancestors from the current span.
22-
"""
23-
root_span = tracer.current_root_span()
24-
if root_span is not None:
25-
root_span.error = 1
26-
root_span.set_tag_str(ERROR_MSG, "Datadog detected an Impending Timeout")
27-
root_span.set_tag_str(ERROR_TYPE, "Impending Timeout")
28-
else:
29-
log.warning("An impending timeout was reached, but no root span was found. No error will be tagged.")
30-
31-
current_span = tracer.current_span()
32-
if current_span is not None:
33-
current_span.finish_with_ancestors()
34-
35-
36-
def _handle_signal(sig, f):
37-
"""
38-
Wraps the given signal with a previously defined, if exists.
39-
This to avoid our signals overriding existing ones.
40-
41-
Returns the handler of the wrapped signal.
42-
"""
43-
old_signal = None
44-
if callable(signal.getsignal(sig)):
18+
class TimeoutChannel:
19+
def __init__(self, context):
20+
self.crashed = False
21+
self.context = context
22+
23+
def _handle_signal(self, sig, f):
24+
"""
25+
Returns a signal of type `sig` with function `f`, if there are
26+
no previously defined signals.
27+
28+
Else, wraps the given signal with the previously defined one,
29+
so no signals are overridden.
30+
"""
4531
old_signal = signal.getsignal(sig)
4632

47-
def wrap_signals(*args, **kwargs):
48-
if old_signal is not None:
49-
old_signal(*args, **kwargs)
50-
f(*args, **kwargs)
51-
52-
return signal.signal(sig, wrap_signals)
53-
54-
55-
def _check_timeout(context):
56-
"""
57-
Creates a timeout to detect when an AWS Lambda handler's remaining
58-
time is about to end.
59-
60-
Crashes flushes when the signal is activated.
61-
"""
62-
_handle_signal(signal.SIGALRM, _crash_flush)
63-
remaining_time_in_millis = context.get_remaining_time_in_millis()
64-
apm_flush_deadline = int(os.environ.get("DD_APM_FLUSH_DEADLINE_MILLISECONDS", 0))
65-
66-
if apm_flush_deadline > 0 and apm_flush_deadline <= remaining_time_in_millis:
67-
if apm_flush_deadline < 200:
68-
log.warning(
69-
"DD_APM_FLUSH_DEADLINE_MILLISECONDS will be overridden to 200ms.",
70-
"The value before was %d, more time for span flushing was needed.",
71-
apm_flush_deadline,
72-
)
73-
74-
# A minimum deadline of 200ms is set to allow us to have at
75-
# least 100ms to flush our span queue.
76-
apm_flush_deadline = 200
77-
78-
remaining_time_in_millis = apm_flush_deadline
33+
def wrap_signals(*args, **kwargs):
34+
if old_signal is not None:
35+
old_signal(*args, **kwargs)
36+
f(*args, **kwargs)
37+
38+
# Return the incoming signal if any of the following cases happens:
39+
# - old signal does not exist,
40+
# - old signal is the same as the incoming, or
41+
# - old signal is our wrapper.
42+
# This avoids multiple signal calling and infinite wrapping.
43+
if not callable(old_signal) or old_signal == f or old_signal == wrap_signals:
44+
return signal.signal(sig, f)
45+
46+
return signal.signal(sig, wrap_signals)
47+
48+
def _start(self):
49+
self._handle_signal(signal.SIGALRM, self._crash_flush)
50+
51+
remaining_time_in_millis = self.context.get_remaining_time_in_millis()
52+
apm_flush_deadline = int(os.environ.get("DD_APM_FLUSH_DEADLINE_MILLISECONDS", 100))
53+
apm_flush_deadline = 100 if apm_flush_deadline < 0 else apm_flush_deadline
54+
55+
# TODO: Update logic to calculate an approximate of how long it will
56+
# take us to flush the spans on the queue.
57+
remaining_time_in_seconds = max(((remaining_time_in_millis - apm_flush_deadline) / 1000), 0)
58+
signal.setitimer(signal.ITIMER_REAL, remaining_time_in_seconds)
59+
60+
def _crash_flush(self, _, __):
61+
"""
62+
Tags the current root span with an Impending Timeout error.
63+
Finishes spans with ancestors from the current span.
64+
"""
65+
self._remove_alarm_signal()
66+
self.crashed = True
67+
68+
root_span = tracer.current_root_span()
69+
if root_span is not None:
70+
root_span.error = 1
71+
root_span.set_tag_str(ERROR_MSG, "Datadog detected an Impending Timeout")
72+
root_span.set_tag_str(ERROR_TYPE, "Impending Timeout")
73+
else:
74+
log.warning("An impending timeout was reached, but no root span was found. No error will be tagged.")
75+
76+
current_span = tracer.current_span()
77+
if current_span is not None:
78+
current_span.finish_with_ancestors()
79+
80+
def _remove_alarm_signal(self):
81+
"""Removes the handler set for the signal `SIGALRM`."""
82+
signal.alarm(0)
83+
signal.signal(signal.SIGALRM, signal.SIG_DFL)
84+
85+
def stop(self):
86+
self._remove_alarm_signal()
87+
88+
89+
class DatadogInstrumentation(object):
90+
"""Patches an AWS Lambda handler function for Datadog instrumentation."""
7991

80-
# Subtracting 100ms to ensure we have time to flush.
81-
# TODO: Update logic to calculate an approximate of how long it will
82-
# take us to flush the spans on the queue.
83-
remaining_time_in_seconds = max((remaining_time_in_millis - 100) / 1000, 0)
84-
signal.setitimer(signal.ITIMER_REAL, remaining_time_in_seconds)
92+
def __call__(self, func, args, kwargs):
93+
self.func = func
94+
self._before(args, kwargs)
95+
try:
96+
self.response = self.func(*args, **kwargs)
97+
return self.response
98+
except Exception:
99+
raise
100+
finally:
101+
self._after()
85102

103+
def _before(self, args, kwargs):
104+
self.context = get_argument_value(args, kwargs, -1, "context")
105+
self.timeoutChannel = TimeoutChannel(self.context)
86106

87-
def _datadog_instrumentation(func, args, kwargs):
88-
"""Patches an AWS Lambda handler function for Datadog instrumentation."""
89-
context = get_argument_value(args, kwargs, -1, "context") # context is always the last parameter
90-
_check_timeout(context)
107+
self.timeoutChannel._start()
91108

92-
return func(*args, **kwargs)
109+
def _after(self):
110+
if not self.timeoutChannel.crashed:
111+
self.timeoutChannel.stop()
93112

94113

95114
def _modify_module_name(module_name):
@@ -100,15 +119,14 @@ def _modify_module_name(module_name):
100119
def _get_handler_and_module():
101120
"""Returns the user AWS Lambda handler and module."""
102121
path = os.environ.get("DD_LAMBDA_HANDLER", None)
122+
_datadog_instrumentation = DatadogInstrumentation()
123+
103124
if path is None:
104125
from datadog_lambda.wrapper import datadog_lambda_wrapper
105126

106127
handler = getattr(datadog_lambda_wrapper, "__call__")
107128

108-
def wrapper(func, args, kwargs):
109-
return _datadog_instrumentation(func, args, kwargs)
110-
111-
return handler, datadog_lambda_wrapper, wrapper
129+
return handler, datadog_lambda_wrapper, _datadog_instrumentation
112130
else:
113131
parts = path.rsplit(".", 1)
114132
(mod_name, handler_name) = parts
Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
aws_lambda: Fixes a ``RecursionError`` that was raised when AWS Lambda signal handlers were wrapped infinitely. This caused Lambda functions to crash on startup.

tests/contrib/aws_lambda/test_aws_lambda.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ async def test_module_patching(mocker, context):
102102
}
103103
)
104104

105+
os.environ.pop("DD_LAMBDA_HANDLER")
105106
patch()
106107

107108
result = manually_wrapped_handler({}, context)

0 commit comments

Comments
 (0)