Skip to content

Commit 6fb98f5

Browse files
committed
Add support for detecting synthetic source.
1 parent 74536f1 commit 6fb98f5

File tree

3 files changed

+269
-1
lines changed

3 files changed

+269
-1
lines changed

instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ def response_hook(span, request_obj, response):
125125
)
126126
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
127127
from opentelemetry.instrumentation.requests.package import _instruments
128+
from opentelemetry.instrumentation.requests.semconv import (
129+
ATTR_USER_AGENT_SYNTHETIC_TYPE,
130+
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
131+
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
132+
)
128133
from opentelemetry.instrumentation.requests.version import __version__
129134
from opentelemetry.instrumentation.utils import (
130135
is_http_instrumentation_enabled,
@@ -157,6 +162,46 @@ def response_hook(span, request_obj, response):
157162
_RequestHookT = Optional[Callable[[Span, PreparedRequest], None]]
158163
_ResponseHookT = Optional[Callable[[Span, PreparedRequest, Response], None]]
159164

165+
# Test patterns to detect (case-insensitive)
166+
_TEST_PATTERNS = [
167+
"alwayson",
168+
]
169+
170+
# Bot patterns to detect (case-insensitive)
171+
_BOT_PATTERNS = [
172+
"googlebot",
173+
"bingbot",
174+
]
175+
176+
177+
def _detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]:
    """
    Classify a User-Agent header value as synthetic test or bot traffic.

    Matching is a case-insensitive substring search against the module-level
    ``_TEST_PATTERNS`` and ``_BOT_PATTERNS`` lists. Test patterns take
    priority over bot patterns when both match.

    Args:
        user_agent: The User-Agent header value. May be ``None`` or empty
            when the header is absent. ``bytes`` values are also tolerated
            (header values are not guaranteed to be ``str``) and are decoded
            as latin-1 before matching.

    Returns:
        USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST if the value contains any
        pattern from _TEST_PATTERNS,
        USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT if it contains any pattern from
        _BOT_PATTERNS,
        None otherwise (including missing, empty, or non-text values).
    """
    if not user_agent:
        return None

    if isinstance(user_agent, bytes):
        # A bytes header would make `"pattern" in value` raise TypeError and
        # break the instrumented request. latin-1 maps every byte to a code
        # point, so this decode cannot fail.
        user_agent = user_agent.decode("latin-1")
    elif not isinstance(user_agent, str):
        # Unknown header value type: never let detection raise.
        return None

    normalized = user_agent.lower()

    # Test patterns are checked first so they win when both kinds match.
    if any(pattern in normalized for pattern in _TEST_PATTERNS):
        return USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST
    if any(pattern in normalized for pattern in _BOT_PATTERNS):
        return USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT

    return None
204+
160205

161206
def _set_http_status_code_attribute(
162207
span,
@@ -234,6 +279,9 @@ def get_or_create_headers():
234279

235280
url = redact_url(request.url)
236281

282+
# Get headers early for user agent detection
283+
headers = get_or_create_headers()
284+
237285
span_attributes = {}
238286
_set_http_method(
239287
span_attributes,
@@ -243,6 +291,12 @@ def get_or_create_headers():
243291
)
244292
_set_http_url(span_attributes, url, sem_conv_opt_in_mode)
245293

294+
# Check for synthetic user agent type
295+
user_agent = headers.get("User-Agent")
296+
synthetic_type = _detect_synthetic_user_agent(user_agent)
297+
if synthetic_type:
298+
span_attributes[ATTR_USER_AGENT_SYNTHETIC_TYPE] = synthetic_type
299+
246300
metric_labels = {}
247301
_set_http_method(
248302
metric_labels,
@@ -297,7 +351,6 @@ def get_or_create_headers():
297351
if callable(request_hook):
298352
request_hook(span, request)
299353

300-
headers = get_or_create_headers()
301354
inject(headers)
302355

303356
with suppress_http_instrumentation():
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright The OpenTelemetry Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Semantic conventions for user agent synthetic type detection.
17+
18+
This module defines constants for the user agent synthetic type attribute and its
19+
values, modelled on the naming used by the OpenTelemetry semantic conventions.
20+
21+
**EXPERIMENTAL**: This module contains experimental semantic conventions that are not
22+
yet part of the official OpenTelemetry semantic conventions specification. These
23+
attributes and values may experience breaking changes in future versions as the
24+
specification evolves.
25+
26+
The semantic conventions defined here are used to classify synthetic traffic based
27+
on User-Agent header analysis, helping distinguish between:
28+
- Bot traffic (web crawlers, search engine bots)
29+
- Test traffic (monitoring systems, health checks)
30+
- Regular user traffic
31+
32+
These experimental conventions may be:
33+
1. Modified with different attribute names or values
34+
2. Moved to official semantic convention packages
35+
3. Deprecated in favor of standardized alternatives
36+
4. Changed based on community feedback and specification updates
37+
38+
Users should be prepared for potential breaking changes when upgrading and should
39+
monitor OpenTelemetry specification updates for official semantic convention releases.
40+
"""
41+
42+
# Attribute key that carries the detected synthetic-traffic category.
ATTR_USER_AGENT_SYNTHETIC_TYPE = "user_agent.synthetic.type"

# Allowed values for ATTR_USER_AGENT_SYNTHETIC_TYPE.
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT = "bot"
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST = "test"
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# Copyright The OpenTelemetry Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import httpretty
16+
import requests
17+
18+
from opentelemetry.instrumentation.requests import RequestsInstrumentor
19+
from opentelemetry.instrumentation.requests.semconv import (
20+
ATTR_USER_AGENT_SYNTHETIC_TYPE,
21+
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
22+
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
23+
)
24+
from opentelemetry.test.test_base import TestBase
25+
26+
27+
class TestUserAgentSynthetic(TestBase):
    """Checks that spans for instrumented ``requests`` calls are tagged with
    the synthetic user agent type derived from the User-Agent header."""

    URL = "http://mock/status/200"

    def setUp(self):
        super().setUp()
        RequestsInstrumentor().instrument()
        httpretty.enable()
        httpretty.register_uri(httpretty.GET, self.URL, body="Hello!")

    def tearDown(self):
        super().tearDown()
        RequestsInstrumentor().uninstrument()
        httpretty.disable()

    def assert_span(self, num_spans=1):
        """Assert exactly ``num_spans`` spans were exported and return them.

        Returns ``None`` for zero spans, the single span for one, and the
        whole collection otherwise.
        """
        finished = self.memory_exporter.get_finished_spans()
        self.assertEqual(num_spans, len(finished))
        if num_spans == 0:
            return None
        return finished[0] if num_spans == 1 else finished

    def _span_for_user_agent(self, user_agent):
        """Issue a GET with the given User-Agent and return its single span."""
        requests.get(
            self.URL, headers={"User-Agent": user_agent}, timeout=5
        )
        return self.assert_span()

    def _assert_synthetic_type(self, span, expected):
        """Assert the span carries ``expected`` as its synthetic type."""
        self.assertEqual(
            span.attributes.get(ATTR_USER_AGENT_SYNTHETIC_TYPE),
            expected,
        )

    def test_user_agent_bot_googlebot(self):
        """A Googlebot user agent is classified as 'bot'."""
        span = self._span_for_user_agent(
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        )
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT)

    def test_user_agent_bot_bingbot(self):
        """A bingbot user agent is classified as 'bot'."""
        span = self._span_for_user_agent(
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
        )
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT)

    def test_user_agent_test_alwayson(self):
        """An AlwaysOn user agent is classified as 'test'."""
        span = self._span_for_user_agent("AlwaysOn-Monitor/1.0")
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST)

    def test_user_agent_case_insensitive(self):
        """Pattern matching ignores letter case."""
        span = self._span_for_user_agent("GOOGLEBOT/2.1")
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT)

        # Drop the first span so assert_span() sees only the next request.
        self.memory_exporter.clear()

        span = self._span_for_user_agent("ALWAYSON-Monitor/1.0")
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST)

    def test_user_agent_normal_browser(self):
        """An ordinary browser user agent gets no synthetic type."""
        span = self._span_for_user_agent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        self.assertNotIn(ATTR_USER_AGENT_SYNTHETIC_TYPE, span.attributes)

    def test_no_user_agent_header(self):
        """A request sent without an explicit User-Agent gets no synthetic type."""
        requests.get(self.URL, timeout=5)

        span = self.assert_span()
        self.assertNotIn(ATTR_USER_AGENT_SYNTHETIC_TYPE, span.attributes)

    def test_empty_user_agent_header(self):
        """An empty User-Agent value gets no synthetic type."""
        span = self._span_for_user_agent("")
        self.assertNotIn(ATTR_USER_AGENT_SYNTHETIC_TYPE, span.attributes)

    def test_user_agent_substring_match(self):
        """Patterns are found anywhere inside the user agent string."""
        # "googlebot" in the middle of the string
        span = self._span_for_user_agent("MyApp/1.0 googlebot crawler")
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT)

        # Drop the first span so assert_span() sees only the next request.
        self.memory_exporter.clear()

        # "alwayson" in the middle of the string
        span = self._span_for_user_agent(
            "TestFramework/1.0 alwayson monitoring"
        )
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST)

    def test_user_agent_priority_alwayson_over_bot(self):
        """When both kinds of pattern match, 'test' wins over 'bot'."""
        span = self._span_for_user_agent("alwayson-googlebot/1.0")
        # The expected result is 'test' even though "googlebot" also matches.
        self._assert_synthetic_type(span, USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST)

0 commit comments

Comments
 (0)