Skip to content

Add Support for Detecting Synthetic Source #3674

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Added

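- `opentelemetry-instrumentation-requests`: detect synthetic traffic (bot and test user agents) on outgoing requests and record it in the `user_agent.synthetic.type` span attribute.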
([#3674](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3674))

## Version 1.36.0/0.57b0 (2025-07-29)

### Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,16 @@ def response_hook(span, request_obj, response):
_StabilityMode,
)
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
from opentelemetry.instrumentation.requests.constants import (
BOT_PATTERNS,
TEST_PATTERNS,
)
from opentelemetry.instrumentation.requests.package import _instruments
from opentelemetry.instrumentation.requests.semconv import (
ATTR_USER_AGENT_SYNTHETIC_TYPE,
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
)
from opentelemetry.instrumentation.requests.version import __version__
from opentelemetry.instrumentation.utils import (
is_http_instrumentation_enabled,
Expand Down Expand Up @@ -158,6 +167,33 @@ def response_hook(span, request_obj, response):
_ResponseHookT = Optional[Callable[[Span, PreparedRequest, Response], None]]


def _detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]:
    """
    Detect synthetic user agent type based on user agent string contents.

    Matching is a case-insensitive substring search: the user agent is
    lowercased and each pattern is looked up inside it.

    Args:
        user_agent: The ``User-Agent`` header value to analyze. ``None`` or
            an empty value (header absent or blank) yields ``None``. A
            ``bytes`` value is decoded as latin-1 before matching; any other
            non-string value is treated as "no match" instead of raising.

    Returns:
        USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST if user agent contains any pattern from TEST_PATTERNS
        USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT if user agent contains any pattern from BOT_PATTERNS
        None otherwise

    Note: Test patterns take priority over bot patterns.
    """
    if not user_agent:
        return None

    if isinstance(user_agent, bytes):
        # Header values may be supplied as bytes; a str pattern cannot be
        # searched in bytes (TypeError). latin-1 maps every byte to a code
        # point, so this decode cannot fail.
        user_agent = user_agent.decode("latin-1")
    elif not isinstance(user_agent, str):
        # Instrumentation must never break the request over an odd header.
        return None

    user_agent_lower = user_agent.lower()

    # Test patterns are checked first so they win when both kinds match.
    if any(test_pattern in user_agent_lower for test_pattern in TEST_PATTERNS):
        return USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST
    if any(bot_pattern in user_agent_lower for bot_pattern in BOT_PATTERNS):
        return USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT

    return None


def _set_http_status_code_attribute(
span,
status_code,
Expand Down Expand Up @@ -234,6 +270,9 @@ def get_or_create_headers():

url = redact_url(request.url)

# Get headers early for user agent detection
headers = get_or_create_headers()

span_attributes = {}
_set_http_method(
span_attributes,
Expand All @@ -243,6 +282,12 @@ def get_or_create_headers():
)
_set_http_url(span_attributes, url, sem_conv_opt_in_mode)

# Check for synthetic user agent type
user_agent = headers.get("User-Agent")
synthetic_type = _detect_synthetic_user_agent(user_agent)
if synthetic_type:
span_attributes[ATTR_USER_AGENT_SYNTHETIC_TYPE] = synthetic_type

metric_labels = {}
_set_http_method(
metric_labels,
Expand Down Expand Up @@ -297,7 +342,6 @@ def get_or_create_headers():
if callable(request_hook):
request_hook(span, request)

headers = get_or_create_headers()
inject(headers)

with suppress_http_instrumentation():
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Constants for OpenTelemetry requests instrumentation.

This module contains configuration constants and pattern definitions used
by the requests instrumentation for various features like synthetic user
agent detection.
"""

# Lowercase substrings that mark a user agent as synthetic *test* traffic.
# They are meant to be matched case-insensitively against the user agent.
TEST_PATTERNS = ["alwayson"]

# Lowercase substrings that mark a user agent as automated *bot* traffic.
# They are meant to be matched case-insensitively against the user agent.
BOT_PATTERNS = ["googlebot", "bingbot"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Semantic conventions for user agent synthetic type detection.

This module defines constants for user agent synthetic type attributes and values
according to OpenTelemetry semantic conventions.

**EXPERIMENTAL**: This module contains experimental semantic conventions that are not
yet part of the official OpenTelemetry semantic conventions specification. These
attributes and values may experience breaking changes in future versions as the
specification evolves.

The semantic conventions defined here are used to classify synthetic traffic based
on User-Agent header analysis, helping distinguish between:
- Bot traffic (web crawlers, search engine bots)
- Test traffic (monitoring systems, health checks)
- Regular user traffic

These experimental conventions may be:
1. Modified with different attribute names or values
2. Moved to official semantic convention packages
3. Deprecated in favor of standardized alternatives
4. Changed based on community feedback and specification updates

Users should be prepared for potential breaking changes when upgrading and should
monitor OpenTelemetry specification updates for official semantic convention releases.
"""

# Attribute key under which the synthetic traffic classification is stored
ATTR_USER_AGENT_SYNTHETIC_TYPE = "user_agent.synthetic.type"

# Allowed values for the attribute above
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST = "test"
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT = "bot"
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import httpretty
import requests

from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.instrumentation.requests.semconv import (
ATTR_USER_AGENT_SYNTHETIC_TYPE,
USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
)
from opentelemetry.test.test_base import TestBase


class TestUserAgentSynthetic(TestBase):
    """Checks the synthetic-type span attribute set from the User-Agent header."""

    URL = "http://mock/status/200"

    def setUp(self):
        super().setUp()
        RequestsInstrumentor().instrument()
        httpretty.enable()
        httpretty.register_uri(httpretty.GET, self.URL, body="Hello!")

    def tearDown(self):
        super().tearDown()
        RequestsInstrumentor().uninstrument()
        httpretty.disable()

    def assert_span(self, num_spans=1):
        """Assert the number of finished spans and return them.

        Returns None for zero spans, the single span for one, and the full
        collection otherwise.
        """
        finished = self.memory_exporter.get_finished_spans()
        self.assertEqual(num_spans, len(finished))
        if num_spans == 0:
            return None
        return finished[0] if num_spans == 1 else finished

    def _span_for(self, headers=None):
        """Issue a GET against the mocked URL and return the single span."""
        requests.get(self.URL, headers=headers, timeout=5)
        return self.assert_span()

    def _assert_synthetic_type(self, user_agent, expected):
        """Send a request with ``user_agent`` and check the recorded type."""
        span = self._span_for({"User-Agent": user_agent})
        self.assertEqual(
            span.attributes.get(ATTR_USER_AGENT_SYNTHETIC_TYPE),
            expected,
        )

    def _assert_not_synthetic(self, headers=None):
        """Send a request and check that no synthetic type was recorded."""
        span = self._span_for(headers)
        self.assertNotIn(ATTR_USER_AGENT_SYNTHETIC_TYPE, span.attributes)

    def test_user_agent_bot_googlebot(self):
        """A Googlebot user agent is classified as 'bot'"""
        self._assert_synthetic_type(
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
        )

    def test_user_agent_bot_bingbot(self):
        """A bingbot user agent is classified as 'bot'"""
        self._assert_synthetic_type(
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
        )

    def test_user_agent_test_alwayson(self):
        """An AlwaysOn user agent is classified as 'test'"""
        self._assert_synthetic_type(
            "AlwaysOn-Monitor/1.0",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
        )

    def test_user_agent_case_insensitive(self):
        """Upper-case user agents are still detected"""
        self._assert_synthetic_type(
            "GOOGLEBOT/2.1",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
        )

        # Drop the first span so the next request is checked in isolation.
        self.memory_exporter.clear()

        self._assert_synthetic_type(
            "ALWAYSON-Monitor/1.0",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
        )

    def test_user_agent_normal_browser(self):
        """An ordinary browser user agent gets no synthetic type"""
        self._assert_not_synthetic(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
        )

    def test_no_user_agent_header(self):
        """A request without explicit headers gets no synthetic type"""
        self._assert_not_synthetic()

    def test_empty_user_agent_header(self):
        """An empty user agent gets no synthetic type"""
        self._assert_not_synthetic({"User-Agent": ""})

    def test_user_agent_substring_match(self):
        """Patterns are found anywhere inside the user agent"""
        # googlebot in the middle of the string
        self._assert_synthetic_type(
            "MyApp/1.0 googlebot crawler",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_BOT,
        )

        # Drop the first span so the next request is checked in isolation.
        self.memory_exporter.clear()

        # alwayson in the middle of the string
        self._assert_synthetic_type(
            "TestFramework/1.0 alwayson monitoring",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
        )

    def test_user_agent_priority_alwayson_over_bot(self):
        """When both kinds of pattern match, 'test' wins over 'bot'"""
        # alwayson should be checked first and return 'test'
        self._assert_synthetic_type(
            "alwayson-googlebot/1.0",
            USER_AGENT_SYNTHETIC_TYPE_VALUE_TEST,
        )