Skip to content

Commit 166c4cb

Browse files
authored
Fix Flaky Test in AWS X-Ray Remote Sampler Client Test (#201)
*Issue #, if available:* This PR fixes flaky test `test_urls_excluded_from_sampling`, causing issues in PR workflows - Example: https://github.com/aws-observability/aws-otel-python-instrumentation/actions/runs/9103811387/job/25026448867 Instantiation of `AwsXRayRemoteSampler` will spawn processes that will regularly (every 10s and 300s) make a Request call, and these processes stay alive between different unit tests. This means that the test that counts the number instrumented Request calls is affected by these "orphaned" processes. The solution is to stop these processes inside the tests that they are created from, or stop them from being spawned in the first place if they are not needed. *Description of changes:* - Stop the daemon rule and target poller processes inside the tests that they are created from, or stop them from being spawned in the first place if they are not needed. - Revert previous fix attempt - Add test for version to increase and pass code coverage *Testing:* - Reproduced failing test by running test in loop of 10000 runs - Failing test not reproducible after applying fix. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
1 parent a80a9e4 commit 166c4cb

File tree

4 files changed

+71
-49
lines changed

4 files changed

+71
-49
lines changed

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/sampler/test_aws_xray_remote_sampler.py

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -49,80 +49,93 @@ def json(self):
4949

5050

5151
class TestAwsXRayRemoteSampler(TestCase):
52+
def setUp(self):
53+
self.rs = None
54+
55+
def tearDown(self):
56+
# Clean up timers
57+
if self.rs is not None:
58+
self.rs._rules_timer.cancel()
59+
self.rs._targets_timer.cancel()
60+
5261
def test_create_remote_sampler_with_empty_resource(self):
53-
rs = AwsXRayRemoteSampler(resource=Resource.get_empty())
54-
self.assertIsNotNone(rs._rules_timer)
55-
self.assertEqual(rs._AwsXRayRemoteSampler__polling_interval, 300)
56-
self.assertIsNotNone(rs._AwsXRayRemoteSampler__xray_client)
57-
self.assertIsNotNone(rs._AwsXRayRemoteSampler__resource)
58-
self.assertTrue(len(rs._AwsXRayRemoteSampler__client_id), 24)
62+
self.rs = AwsXRayRemoteSampler(resource=Resource.get_empty())
63+
self.assertIsNotNone(self.rs._rules_timer)
64+
self.assertEqual(self.rs._AwsXRayRemoteSampler__polling_interval, 300)
65+
self.assertIsNotNone(self.rs._AwsXRayRemoteSampler__xray_client)
66+
self.assertIsNotNone(self.rs._AwsXRayRemoteSampler__resource)
67+
self.assertTrue(len(self.rs._AwsXRayRemoteSampler__client_id), 24)
5968

6069
def test_create_remote_sampler_with_populated_resource(self):
61-
rs = AwsXRayRemoteSampler(
70+
self.rs = AwsXRayRemoteSampler(
6271
resource=Resource.create({"service.name": "test-service-name", "cloud.platform": "test-cloud-platform"})
6372
)
64-
self.assertIsNotNone(rs._rules_timer)
65-
self.assertEqual(rs._AwsXRayRemoteSampler__polling_interval, 300)
66-
self.assertIsNotNone(rs._AwsXRayRemoteSampler__xray_client)
67-
self.assertIsNotNone(rs._AwsXRayRemoteSampler__resource)
68-
self.assertEqual(rs._AwsXRayRemoteSampler__resource.attributes["service.name"], "test-service-name")
69-
self.assertEqual(rs._AwsXRayRemoteSampler__resource.attributes["cloud.platform"], "test-cloud-platform")
73+
self.assertIsNotNone(self.rs._rules_timer)
74+
self.assertEqual(self.rs._AwsXRayRemoteSampler__polling_interval, 300)
75+
self.assertIsNotNone(self.rs._AwsXRayRemoteSampler__xray_client)
76+
self.assertIsNotNone(self.rs._AwsXRayRemoteSampler__resource)
77+
self.assertEqual(self.rs._AwsXRayRemoteSampler__resource.attributes["service.name"], "test-service-name")
78+
self.assertEqual(self.rs._AwsXRayRemoteSampler__resource.attributes["cloud.platform"], "test-cloud-platform")
7079

7180
def test_create_remote_sampler_with_all_fields_populated(self):
72-
rs = AwsXRayRemoteSampler(
81+
self.rs = AwsXRayRemoteSampler(
7382
resource=Resource.create({"service.name": "test-service-name", "cloud.platform": "test-cloud-platform"}),
7483
endpoint="http://abc.com",
7584
polling_interval=120,
7685
log_level=DEBUG,
7786
)
78-
self.assertIsNotNone(rs._rules_timer)
79-
self.assertEqual(rs._AwsXRayRemoteSampler__polling_interval, 120)
80-
self.assertIsNotNone(rs._AwsXRayRemoteSampler__xray_client)
81-
self.assertIsNotNone(rs._AwsXRayRemoteSampler__resource)
87+
self.assertIsNotNone(self.rs._rules_timer)
88+
self.assertEqual(self.rs._AwsXRayRemoteSampler__polling_interval, 120)
89+
self.assertIsNotNone(self.rs._AwsXRayRemoteSampler__xray_client)
90+
self.assertIsNotNone(self.rs._AwsXRayRemoteSampler__resource)
8291
self.assertEqual(
83-
rs._AwsXRayRemoteSampler__xray_client._AwsXRaySamplingClient__get_sampling_rules_endpoint,
92+
self.rs._AwsXRayRemoteSampler__xray_client._AwsXRaySamplingClient__get_sampling_rules_endpoint,
8493
"http://abc.com/GetSamplingRules",
8594
)
86-
self.assertEqual(rs._AwsXRayRemoteSampler__resource.attributes["service.name"], "test-service-name")
87-
self.assertEqual(rs._AwsXRayRemoteSampler__resource.attributes["cloud.platform"], "test-cloud-platform")
95+
self.assertEqual(self.rs._AwsXRayRemoteSampler__resource.attributes["service.name"], "test-service-name")
96+
self.assertEqual(self.rs._AwsXRayRemoteSampler__resource.attributes["cloud.platform"], "test-cloud-platform")
8897

8998
@patch("requests.Session.post", side_effect=mocked_requests_get)
9099
@patch("amazon.opentelemetry.distro.sampler.aws_xray_remote_sampler.DEFAULT_TARGET_POLLING_INTERVAL_SECONDS", 2)
91100
def test_update_sampling_rules_and_targets_with_pollers_and_should_sample(self, mock_post=None):
92-
rs = AwsXRayRemoteSampler(
101+
self.rs = AwsXRayRemoteSampler(
93102
resource=Resource.create({"service.name": "test-service-name", "cloud.platform": "test-cloud-platform"})
94103
)
95-
self.assertEqual(rs._AwsXRayRemoteSampler__target_polling_interval, 2)
104+
self.assertEqual(self.rs._AwsXRayRemoteSampler__target_polling_interval, 2)
96105

97106
time.sleep(1.0)
98107
self.assertEqual(
99-
rs._AwsXRayRemoteSampler__rule_cache._RuleCache__rule_appliers[0].sampling_rule.RuleName, "test"
108+
self.rs._AwsXRayRemoteSampler__rule_cache._RuleCache__rule_appliers[0].sampling_rule.RuleName,
109+
"test",
100110
)
101-
self.assertEqual(rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision, Decision.DROP)
111+
self.assertEqual(self.rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision, Decision.DROP)
102112

103113
# wait 2 more seconds since targets polling was patched to 2 seconds (rather than 10s)
104114
time.sleep(2.0)
105-
self.assertEqual(rs._AwsXRayRemoteSampler__target_polling_interval, 1000)
115+
self.assertEqual(self.rs._AwsXRayRemoteSampler__target_polling_interval, 1000)
106116
self.assertEqual(
107-
rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision, Decision.RECORD_AND_SAMPLE
117+
self.rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision,
118+
Decision.RECORD_AND_SAMPLE,
108119
)
109120
self.assertEqual(
110-
rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision, Decision.RECORD_AND_SAMPLE
121+
self.rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision,
122+
Decision.RECORD_AND_SAMPLE,
111123
)
112124
self.assertEqual(
113-
rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision, Decision.RECORD_AND_SAMPLE
125+
self.rs.should_sample(None, 0, "name", attributes={"abc": "1234"}).decision,
126+
Decision.RECORD_AND_SAMPLE,
114127
)
115128

116129
@patch("requests.Session.post", side_effect=mocked_requests_get)
117130
@patch("amazon.opentelemetry.distro.sampler.aws_xray_remote_sampler.DEFAULT_TARGET_POLLING_INTERVAL_SECONDS", 3)
118131
def test_multithreading_with_large_reservoir_with_otel_sdk(self, mock_post=None):
119-
rs = AwsXRayRemoteSampler(
132+
self.rs = AwsXRayRemoteSampler(
120133
resource=Resource.create({"service.name": "test-service-name", "cloud.platform": "test-cloud-platform"})
121134
)
122135
attributes = {"abc": "1234"}
123136

124137
time.sleep(2.0)
125-
self.assertEqual(rs.should_sample(None, 0, "name", attributes=attributes).decision, Decision.DROP)
138+
self.assertEqual(self.rs.should_sample(None, 0, "name", attributes=attributes).decision, Decision.DROP)
126139

127140
# wait 3 more seconds since targets polling was patched to 2 seconds (rather than 10s)
128141
time.sleep(3.0)
@@ -139,7 +152,7 @@ def test_multithreading_with_large_reservoir_with_otel_sdk(self, mock_post=None)
139152
target=create_spans,
140153
name="thread_" + str(idx),
141154
daemon=True,
142-
args=(sampled_array, idx, attributes, rs, number_of_spans),
155+
args=(sampled_array, idx, attributes, self.rs, number_of_spans),
143156
)
144157
)
145158
threads[idx].start()
@@ -149,7 +162,7 @@ def test_multithreading_with_large_reservoir_with_otel_sdk(self, mock_post=None)
149162
threads[idx].join()
150163
sum_sampled += sampled_array[idx]
151164

152-
test_rule_applier = rs._AwsXRayRemoteSampler__rule_cache._RuleCache__rule_appliers[0]
165+
test_rule_applier = self.rs._AwsXRayRemoteSampler__rule_cache._RuleCache__rule_appliers[0]
153166
self.assertEqual(
154167
test_rule_applier._SamplingRuleApplier__reservoir_sampler._root._RateLimitingSampler__reservoir._quota,
155168
100000,
@@ -161,7 +174,7 @@ def test_multithreading_with_large_reservoir_with_otel_sdk(self, mock_post=None)
161174
@patch("amazon.opentelemetry.distro.sampler.aws_xray_remote_sampler.DEFAULT_TARGET_POLLING_INTERVAL_SECONDS", 2)
162175
@patch("amazon.opentelemetry.distro.sampler.aws_xray_remote_sampler._Clock", MockClock)
163176
def test_multithreading_with_some_reservoir_with_otel_sdk(self, mock_post=None):
164-
rs = AwsXRayRemoteSampler(
177+
self.rs = AwsXRayRemoteSampler(
165178
resource=Resource.create({"service.name": "test-service-name", "cloud.platform": "test-cloud-platform"})
166179
)
167180
attributes = {"abc": "non-matching attribute value, use default rule"}
@@ -170,17 +183,19 @@ def test_multithreading_with_some_reservoir_with_otel_sdk(self, mock_post=None):
170183
# which will eat up more than 1 second of reservoir. Using MockClock we can freeze time
171184
# and pretend all thread jobs start and end at the exact same time,
172185
# assume and test exactly 1 second of reservoir (100 quota) only
173-
mock_clock: MockClock = rs._clock
186+
mock_clock: MockClock = self.rs._clock
174187

175188
time.sleep(1.0)
176189
mock_clock.add_time(1.0)
177-
self.assertEqual(mock_clock.now(), rs._clock.now())
178-
self.assertEqual(rs.should_sample(None, 0, "name", attributes=attributes).decision, Decision.RECORD_AND_SAMPLE)
190+
self.assertEqual(mock_clock.now(), self.rs._clock.now())
191+
self.assertEqual(
192+
self.rs.should_sample(None, 0, "name", attributes=attributes).decision, Decision.RECORD_AND_SAMPLE
193+
)
179194

180195
# wait 2 more seconds since targets polling was patched to 2 seconds (rather than 10s)
181196
time.sleep(2.0)
182197
mock_clock.add_time(2.0)
183-
self.assertEqual(mock_clock.now(), rs._clock.now())
198+
self.assertEqual(mock_clock.now(), self.rs._clock.now())
184199

185200
number_of_spans = 100
186201
thread_count = 1000
@@ -194,7 +209,7 @@ def test_multithreading_with_some_reservoir_with_otel_sdk(self, mock_post=None):
194209
target=create_spans,
195210
name="thread_" + str(idx),
196211
daemon=True,
197-
args=(sampled_array, idx, attributes, rs, number_of_spans),
212+
args=(sampled_array, idx, attributes, self.rs, number_of_spans),
198213
)
199214
)
200215
threads[idx].start()
@@ -204,7 +219,7 @@ def test_multithreading_with_some_reservoir_with_otel_sdk(self, mock_post=None):
204219
threads[idx].join()
205220
sum_sampled += sampled_array[idx]
206221

207-
default_rule_applier = rs._AwsXRayRemoteSampler__rule_cache._RuleCache__rule_appliers[1]
222+
default_rule_applier = self.rs._AwsXRayRemoteSampler__rule_cache._RuleCache__rule_appliers[1]
208223
self.assertEqual(
209224
default_rule_applier._SamplingRuleApplier__reservoir_sampler._root._RateLimitingSampler__reservoir._quota,
210225
100,

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/sampler/test_aws_xray_sampling_client.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import json
44
import logging
55
import os
6-
import time
76
from importlib import reload
87
from logging import getLogger
98
from unittest import TestCase
@@ -187,11 +186,7 @@ def test_urls_excluded_from_sampling(self):
187186
except requests.exceptions.RequestException:
188187
pass
189188

190-
timeout = time.time() + 1
191189
span_list = memory_exporter.get_finished_spans()
192-
while len(span_list) != 1 and timeout > time.time():
193-
span_list = memory_exporter.get_finished_spans()
194-
time.sleep(0.1)
195190
self.assertEqual(1, len(span_list))
196191
span_http_url = span_list[0].attributes.get("http.url")
197192
self.assertEqual(span_http_url, "http://this_is_a_fake_url:3849/GetSamplingRules")
@@ -201,11 +196,7 @@ def test_urls_excluded_from_sampling(self):
201196
except requests.exceptions.RequestException:
202197
pass
203198

204-
timeout = time.time() + 1
205199
span_list = memory_exporter.get_finished_spans()
206-
while len(span_list) != 2 and timeout > time.time():
207-
span_list = memory_exporter.get_finished_spans()
208-
time.sleep(0.1)
209200
self.assertEqual(2, len(span_list))
210201
span_http_url = span_list[1].attributes.get("http.url")
211202
self.assertEqual(span_http_url, "http://this_is_a_fake_url:3849/SamplingTargets")

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/test_aws_opentelementry_configurator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@ def test_import_default_sampler_when_env_var_is_not_set(self):
175175
self.assertEqual(default_sampler.get_description(), DEFAULT_ON.get_description())
176176
# DEFAULT_ON is a ParentBased(ALWAYS_ON) sampler
177177

178+
@patch.object(AwsXRayRemoteSampler, "_AwsXRayRemoteSampler__start_sampling_rule_poller", lambda x: None)
179+
@patch.object(AwsXRayRemoteSampler, "_AwsXRayRemoteSampler__start_sampling_target_poller", lambda x: None)
178180
def test_using_xray_sampler_sets_url_exclusion_env_vars(self):
179181
targets_to_exclude = "SamplingTargets,GetSamplingRules"
180182
os.environ.pop("OTEL_PYTHON_REQUESTS_EXCLUDED_URLS", None)
@@ -186,6 +188,8 @@ def test_using_xray_sampler_sets_url_exclusion_env_vars(self):
186188
self.assertEqual(os.environ.get("OTEL_PYTHON_REQUESTS_EXCLUDED_URLS", None), targets_to_exclude)
187189
self.assertEqual(os.environ.get("OTEL_PYTHON_URLLIB3_EXCLUDED_URLS", None), targets_to_exclude)
188190

191+
@patch.object(AwsXRayRemoteSampler, "_AwsXRayRemoteSampler__start_sampling_rule_poller", lambda x: None)
192+
@patch.object(AwsXRayRemoteSampler, "_AwsXRayRemoteSampler__start_sampling_target_poller", lambda x: None)
189193
def test_using_xray_sampler_appends_url_exclusion_env_vars(self):
190194
targets_to_exclude = "SamplingTargets,GetSamplingRules"
191195
os.environ.pop("OTEL_PYTHON_REQUESTS_EXCLUDED_URLS", None)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from unittest import TestCase
5+
6+
from amazon.opentelemetry.distro.version import __version__
7+
8+
9+
class TestVersion(TestCase):
10+
def test_version_is_not_empty_and_not_none(self):
11+
self.assertIsNotNone(__version__)
12+
self.assertNotEqual(__version__, "")

0 commit comments

Comments
 (0)