def _detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]:
    """
    Classify a User-Agent header value as synthetic test or bot traffic.

    Matching is a case-insensitive substring search against TEST_PATTERNS
    and BOT_PATTERNS.

    Args:
        user_agent: The User-Agent header value to analyze. May be None or
            empty when the header is absent. ``bytes`` values are also
            tolerated because ``requests`` allows header values to be given
            as bytes.

    Returns:
        USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST if the value contains any pattern from TEST_PATTERNS
        USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT if the value contains any pattern from BOT_PATTERNS
        None otherwise (including missing, empty or non-text values)

    Note: Test patterns take priority over bot patterns.
    """
    if not user_agent:
        return None

    if isinstance(user_agent, bytes):
        # A str pattern cannot be searched for inside bytes (TypeError), so
        # decode first. latin-1 maps every byte to a code point and therefore
        # never fails, which keeps detection best-effort instead of breaking
        # the outgoing request.
        user_agent = user_agent.decode("latin-1")
    elif not isinstance(user_agent, str):
        # Unknown header value type: nothing sensible to match against.
        return None

    user_agent_lower = user_agent.lower()

    # Test patterns are checked first so they win when both kinds match.
    if any(test_pattern in user_agent_lower for test_pattern in TEST_PATTERNS):
        return USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST
    if any(bot_pattern in user_agent_lower for bot_pattern in BOT_PATTERNS):
        return USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT

    return None
# Substrings that mark a user agent as synthetic *test* traffic.
# Entries are lowercase because callers lowercase the user agent
# string before searching for them.
TEST_PATTERNS = ["alwayson"]

# Substrings that mark a user agent as automated *bot* traffic.
# Same lowercase convention as above.
BOT_PATTERNS = ["googlebot", "bingbot"]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Semantic conventions for user agent synthetic type detection. + +This module defines constants for user agent synthetic type attributes and values +according to OpenTelemetry semantic conventions. + +**EXPERIMENTAL**: This module contains experimental semantic conventions that are not +yet part of the official OpenTelemetry semantic conventions specification. These +attributes and values may experience breaking changes in future versions as the +specification evolves. + +The semantic conventions defined here are used to classify synthetic traffic based +on User-Agent header analysis, helping distinguish between: +- Bot traffic (web crawlers, search engine bots) +- Test traffic (monitoring systems, health checks) +- Regular user traffic + +These experimental conventions may be: +1. Modified with different attribute names or values +2. Moved to official semantic convention packages +3. Deprecated in favor of standardized alternatives +4. Changed based on community feedback and specification updates + +Users should be prepared for potential breaking changes when upgrading and should +monitor OpenTelemetry specification updates for official semantic convention releases. 
# Attribute key under which the detected synthetic traffic category is stored.
ATTR_USER_AGENT_SYNTHETIC_TYPE = "user_agent.synthetic.type"

# Values the attribute above can take.
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST = "test"
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT = "bot"
class TestUserAgentSynthetic(TestBase):
    """Verify the synthetic user agent attribute on spans from ``requests``.

    Each test issues a GET against an httpretty-mocked URL with a chosen
    ``User-Agent`` header and inspects the single exported span.
    """

    URL = "http://mock/status/200"

    def setUp(self):
        super().setUp()
        RequestsInstrumentor().instrument()
        httpretty.enable()
        httpretty.register_uri(httpretty.GET, self.URL, body="Hello!")

    def tearDown(self):
        super().tearDown()
        RequestsInstrumentor().uninstrument()
        httpretty.disable()

    def assert_span(self, num_spans=1):
        """Assert exactly ``num_spans`` finished spans and return them.

        Returns None for zero spans, the span itself for one, and the full
        sequence otherwise.
        """
        span_list = self.memory_exporter.get_finished_spans()
        self.assertEqual(num_spans, len(span_list))
        if num_spans == 0:
            return None
        if num_spans == 1:
            return span_list[0]
        return span_list

    def _assert_synthetic_type(self, user_agent, expected):
        """Send one request and check the resulting span's synthetic type.

        Args:
            user_agent: Value for the ``User-Agent`` header, or None to send
                the request without passing any headers.
            expected: Expected attribute value, or None to assert that the
                attribute is absent from the span.
        """
        # Start from an empty exporter so several cases can run in one test
        # and each still observes exactly one span.
        self.memory_exporter.clear()

        headers = None if user_agent is None else {"User-Agent": user_agent}
        requests.get(self.URL, headers=headers, timeout=5)

        span = self.assert_span()
        if expected is None:
            self.assertNotIn(ATTR_USER_AGENT_SYNTHETIC_TYPE, span.attributes)
        else:
            self.assertEqual(
                span.attributes.get(ATTR_USER_AGENT_SYNTHETIC_TYPE),
                expected,
            )

    def test_user_agent_bot_googlebot(self):
        """Test that googlebot user agent is marked as 'bot'"""
        self._assert_synthetic_type(
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
        )

    def test_user_agent_bot_bingbot(self):
        """Test that bingbot user agent is marked as 'bot'"""
        self._assert_synthetic_type(
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
        )

    def test_user_agent_test_alwayson(self):
        """Test that alwayson user agent is marked as 'test'"""
        self._assert_synthetic_type(
            "AlwaysOn-Monitor/1.0",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
        )

    def test_user_agent_case_insensitive(self):
        """Test that detection is case insensitive"""
        cases = (
            ("GOOGLEBOT/2.1", USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT),
            ("ALWAYSON-Monitor/1.0", USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST),
        )
        for user_agent, expected in cases:
            # subTest reports every failing case instead of stopping at the first.
            with self.subTest(user_agent=user_agent):
                self._assert_synthetic_type(user_agent, expected)

    def test_user_agent_normal_browser(self):
        """Test that normal browser user agents don't get synthetic type"""
        self._assert_synthetic_type(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            None,
        )

    def test_no_user_agent_header(self):
        """Test that requests without user agent don't get synthetic type"""
        self._assert_synthetic_type(None, None)

    def test_empty_user_agent_header(self):
        """Test that empty user agent doesn't get synthetic type"""
        self._assert_synthetic_type("", None)

    def test_user_agent_substring_match(self):
        """Test that substrings are detected correctly"""
        cases = (
            # googlebot in the middle of the string
            ("MyApp/1.0 googlebot crawler", USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT),
            # alwayson in the middle of the string
            (
                "TestFramework/1.0 alwayson monitoring",
                USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
            ),
        )
        for user_agent, expected in cases:
            with self.subTest(user_agent=user_agent):
                self._assert_synthetic_type(user_agent, expected)

    def test_user_agent_priority_alwayson_over_bot(self):
        """Test that alwayson takes priority if both patterns match"""
        # Both a test pattern and a bot pattern are present; 'test' must win.
        self._assert_synthetic_type(
            "alwayson-googlebot/1.0",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
        )