Skip to content

Commit ff3e1ff

Browse files
authored
Customer Facing Statsbeat: Exception categorization changes (#42695)
* Exception categorization changes * Updated CHANGELOG * Refactored after statsbeat changes * Addressed feedback * Fix lint * Added reason for UNKNOWN
1 parent f891a62 commit ff3e1ff

File tree

8 files changed

+177
-99
lines changed

8 files changed

+177
-99
lines changed

sdk/monitor/azure-monitor-opentelemetry-exporter/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
([#42551](https://github.com/Azure/azure-sdk-for-python/pull/42551))
1414
- Rename Customer Statsbeat to Customer SDKStats as per [Spec] - https://github.com/aep-health-and-standards/Telemetry-Collection-Spec/pull/581
1515
([#42573](https://github.com/Azure/azure-sdk-for-python/pull/42573))
16+
- Customer Facing SDKStats: Exception categorization as per [Spec] - https://github.com/aep-health-and-standards/Telemetry-Collection-Spec/blob/main/ApplicationInsights/sdkstats/customer_facing_sdk_stats.md
17+
([#42695](https://github.com/Azure/azure-sdk-for-python/pull/42695))
1618

1719
### Breaking Changes
1820

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/_constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,13 @@ def __init__(self, language: str, version: str, compute_type: str):
195195
_AVAILABILITY_ENVELOPE_NAME: _AVAILABILITY,
196196
}
197197

198+
# Exception categories
199+
class _exception_categories(Enum):
200+
CLIENT_EXCEPTION = "Client exception"
201+
STORAGE_EXCEPTION = "Storage exception"
202+
NETWORK_EXCEPTION = "Network exception"
203+
TIMEOUT_EXCEPTION = "Timeout exception"
204+
198205
# Map RP names
199206
class _RP_Names(Enum):
200207
APP_SERVICE = "appsvc"

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/export/_base.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
_RETRYABLE_STATUS_CODES,
4444
_THROTTLE_STATUS_CODES,
4545
DropCode,
46+
_exception_categories,
4647
)
4748
# from azure.monitor.opentelemetry.exporter._configuration import _ConfigurationManager
4849
from azure.monitor.opentelemetry.exporter._connection_string_parser import ConnectionStringParser
@@ -346,7 +347,7 @@ def _transmit(self, envelopes: List[TelemetryItem]) -> ExportResult:
346347
else:
347348
if not self._is_stats_exporter():
348349
if self._customer_sdkstats_metrics and self._should_collect_customer_sdkstats():
349-
_track_dropped_items(self._customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, "Error parsing redirect information.")
350+
_track_dropped_items(self._customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, _exception_categories.CLIENT_EXCEPTION.value)
350351
logger.error(
351352
"Error parsing redirect information.",
352353
)
@@ -359,7 +360,7 @@ def _transmit(self, envelopes: List[TelemetryItem]) -> ExportResult:
359360
self._customer_sdkstats_metrics,
360361
envelopes,
361362
DropCode.CLIENT_EXCEPTION,
362-
"Error sending telemetry because of circular redirects. Please check the integrity of your connection string."
363+
_exception_categories.CLIENT_EXCEPTION.value
363364
)
364365
logger.error(
365366
"Error sending telemetry because of circular redirects. "
@@ -418,7 +419,7 @@ def _transmit(self, envelopes: List[TelemetryItem]) -> ExportResult:
418419

419420
# Track dropped items in customer sdkstats for general exceptions
420421
if self._customer_sdkstats_metrics and self._should_collect_customer_sdkstats():
421-
_track_dropped_items(self._customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, str(ex))
422+
_track_dropped_items(self._customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, _exception_categories.CLIENT_EXCEPTION.value)
422423

423424
if self._should_collect_stats():
424425
_update_requests_map(_REQ_EXCEPTION_NAME[1], value=ex.__class__.__name__)

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/statsbeat/_customer_sdkstats.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
RetryCodeType,
2424
CustomerSdkStatsMetricName,
2525
_CUSTOMER_SDKSTATS_LANGUAGE,
26+
_exception_categories,
2627
)
2728

2829

@@ -112,20 +113,16 @@ def count_dropped_items(
112113
if not self._is_enabled or count <= 0:
113114
return
114115

115-
# Get or create the drop_code map for this telemetry_type
116116
if telemetry_type not in self._counters.total_item_drop_count:
117117
self._counters.total_item_drop_count[telemetry_type] = {}
118118
drop_code_map = self._counters.total_item_drop_count[telemetry_type]
119119

120-
# Get or create the reason map for this drop_code
121120
if drop_code not in drop_code_map:
122121
drop_code_map[drop_code] = {}
123122
reason_map = drop_code_map[drop_code]
124123

125-
# Generate a low-cardinality, informative reason description
126124
reason = self._get_drop_reason(drop_code, exception_message)
127125

128-
# Update the count for this reason
129126
current_count = reason_map.get(reason, 0)
130127
reason_map[reason] = current_count + count
131128

@@ -212,28 +209,29 @@ def _get_drop_reason(self, drop_code: DropCodeType, exception_message: Optional[
212209
return categorize_status_code(drop_code)
213210

214211
if drop_code == DropCode.CLIENT_EXCEPTION:
215-
return exception_message if exception_message else "unknown_exception"
212+
return exception_message if exception_message else _exception_categories.CLIENT_EXCEPTION.value
216213

217214
drop_code_reasons = {
218-
DropCode.CLIENT_READONLY: "readonly_mode",
219-
DropCode.CLIENT_STORAGE_DISABLED: "local storage is disabled",
220-
DropCode.CLIENT_PERSISTENCE_CAPACITY: "persistence_full",
215+
DropCode.CLIENT_READONLY: "Client readonly",
216+
DropCode.CLIENT_STORAGE_DISABLED: "Client local storage disabled",
217+
DropCode.CLIENT_PERSISTENCE_CAPACITY: "Client persistence capacity",
218+
DropCode.UNKNOWN: "Unknown reason"
221219
}
222220

223-
return drop_code_reasons.get(drop_code, "unknown_reason")
221+
return drop_code_reasons.get(drop_code, DropCode.UNKNOWN)
224222

225223
def _get_retry_reason(self, retry_code: RetryCodeType, exception_message: Optional[str] = None) -> str:
226224
if isinstance(retry_code, int):
227225
return categorize_status_code(retry_code)
228226

229227
if retry_code == RetryCode.CLIENT_EXCEPTION:
230-
return exception_message if exception_message else "unknown_exception"
228+
return exception_message if exception_message else _exception_categories.CLIENT_EXCEPTION.value
231229

232230
retry_code_reasons = {
233-
RetryCode.CLIENT_TIMEOUT: "client_timeout",
234-
RetryCode.UNKNOWN: "unknown_reason",
231+
RetryCode.CLIENT_TIMEOUT: "Client timeout",
232+
RetryCode.UNKNOWN: "Unknown reason",
235233
}
236-
return retry_code_reasons.get(retry_code, "unknown_reason")
234+
return retry_code_reasons.get(retry_code, RetryCode.UNKNOWN)
237235

238236
# Global customer sdkstats singleton
239237
_CUSTOMER_SDKSTATS_METRICS = None

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/statsbeat/_utils.py

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
# Licensed under the MIT License.
33
import os
44
from typing import Optional, List, Tuple
5-
from azure.core.exceptions import ServiceRequestError
5+
6+
from requests import ReadTimeout, Timeout
7+
from azure.core.exceptions import ServiceRequestTimeoutError
68
from azure.monitor.opentelemetry.exporter._constants import (
79
RetryCode,
810
RetryCodeType,
@@ -27,6 +29,7 @@
2729
_REQ_DURATION_NAME,
2830
_REQ_SUCCESS_NAME,
2931
_APPLICATIONINSIGHTS_SDKSTATS_EXPORT_INTERVAL,
32+
_exception_categories,
3033
)
3134

3235
from azure.monitor.opentelemetry.exporter.statsbeat._state import (
@@ -84,42 +87,55 @@ def _update_requests_map(type_name, value):
8487

8588
def categorize_status_code(status_code: int) -> str:
8689
status_map = {
87-
400: "bad_request",
88-
401: "unauthorized",
89-
402: "daily quota exceeded",
90-
403: "forbidden",
91-
404: "not_found",
92-
408: "request_timeout",
93-
413: "payload_too_large",
94-
429: "too_many_requests",
95-
500: "internal_server_error",
96-
502: "bad_gateway",
97-
503: "service_unavailable",
98-
504: "gateway_timeout",
90+
400: "Bad request",
91+
401: "Unauthorized",
92+
402: "Daily quota exceeded",
93+
403: "Forbidden",
94+
404: "Not found",
95+
408: "Request timeout",
96+
413: "Payload too large",
97+
429: "Too many requests",
98+
500: "Internal server error",
99+
502: "Bad gateway",
100+
503: "Service unavailable",
101+
504: "Gateway timeout",
99102
}
100103
if status_code in status_map:
101104
return status_map[status_code]
102105
if 400 <= status_code < 500:
103-
return "client_error_4xx"
106+
return "Client error 4xx"
104107
if 500 <= status_code < 600:
105-
return "server_error_5xx"
108+
return "Server error 5xx"
106109
return f"status_{status_code}"
107110

108111
def _determine_client_retry_code(error) -> Tuple[RetryCodeType, Optional[str]]:
112+
timeout_exception_types = (
113+
ServiceRequestTimeoutError,
114+
ReadTimeout,
115+
TimeoutError,
116+
Timeout,
117+
)
118+
network_exception_types = (
119+
ConnectionError,
120+
OSError,
121+
)
109122
if hasattr(error, 'status_code') and error.status_code in [401, 403, 408, 429, 500, 502, 503, 504]:
110123
# For specific status codes, preserve the custom message if available
111124
error_message = getattr(error, 'message', None) if hasattr(error, 'message') else None
112125
return (error.status_code, error_message or _UNKNOWN)
113126

114-
if isinstance(error, ServiceRequestError):
115-
error_message = str(error.message) if error.message else ""
116-
else:
117-
error_message = str(error)
127+
if isinstance(error, timeout_exception_types):
128+
return (RetryCode.CLIENT_TIMEOUT, _exception_categories.TIMEOUT_EXCEPTION.value)
129+
130+
if hasattr(error, 'message'):
131+
error_message = getattr(error, 'message', None) if hasattr(error, 'message') else None
132+
if error_message is not None and ('timeout' in error_message.lower() or 'timed out' in error_message.lower()):
133+
return (RetryCode.CLIENT_TIMEOUT, _exception_categories.TIMEOUT_EXCEPTION.value)
134+
135+
if isinstance(error, network_exception_types):
136+
return (RetryCode.CLIENT_EXCEPTION, _exception_categories.NETWORK_EXCEPTION.value)
118137

119-
error_message_lower = error_message.lower()
120-
if 'timeout' in error_message_lower or 'timed out' in error_message_lower:
121-
return (RetryCode.CLIENT_TIMEOUT, error_message)
122-
return (RetryCode.CLIENT_EXCEPTION, error_message)
138+
return (RetryCode.CLIENT_EXCEPTION, _exception_categories.CLIENT_EXCEPTION.value)
123139

124140
def _track_successful_items(customer_sdkstats_metrics, envelopes: List[TelemetryItem]):
125141
if customer_sdkstats_metrics:
@@ -199,10 +215,10 @@ def _track_dropped_items_from_storage(customer_sdkstats_metrics, result_from_sto
199215
_track_dropped_items(customer_sdkstats_metrics, envelopes, DropCode.CLIENT_PERSISTENCE_CAPACITY)
200216
elif get_local_storage_setup_state_exception() != "":
201217
# For exceptions caught in _check_and_set_folder_permissions during storage setup
202-
_track_dropped_items(customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, result_from_storage_put) # pylint: disable=line-too-long
218+
_track_dropped_items(customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, _exception_categories.STORAGE_EXCEPTION.value) # pylint: disable=line-too-long
203219
elif isinstance(result_from_storage_put, str):
204220
# For any exceptions occurred in put method of either LocalFileStorage or LocalFileBlob, track dropped item with reason # pylint: disable=line-too-long
205-
_track_dropped_items(customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, result_from_storage_put) # pylint: disable=line-too-long
221+
_track_dropped_items(customer_sdkstats_metrics, envelopes, DropCode.CLIENT_EXCEPTION, _exception_categories.STORAGE_EXCEPTION.value) # pylint: disable=line-too-long
206222
else:
207223
# LocalFileBlob.put returns StorageExportResult.LOCAL_FILE_BLOB_SUCCESS here. Don't need to track anything in this case. # pylint: disable=line-too-long
208224
pass

sdk/monitor/azure-monitor-opentelemetry-exporter/tests/statsbeat/test_customer_sdkstats.py

Lines changed: 14 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,14 @@
1616
_APPLICATIONINSIGHTS_SDKSTATS_ENABLED_PREVIEW,
1717
_REQUEST,
1818
_DEPENDENCY,
19-
_REQ_RETRY_NAME,
2019
_CUSTOMER_SDKSTATS_LANGUAGE,
2120
_APPLICATIONINSIGHTS_SDKSTATS_EXPORT_INTERVAL,
2221
_DEFAULT_STATS_SHORT_EXPORT_INTERVAL,
2322
_UNKNOWN,
2423
_TYPE_MAP,
2524
DropCode,
26-
DropCodeType,
2725
RetryCode,
28-
RetryCodeType,
29-
_TRACE,
26+
_exception_categories,
3027
)
3128

3229
from opentelemetry import trace
@@ -115,7 +112,7 @@ def test_customer_sdkstats_not_initialized_when_disabled(self):
115112

116113
# Verify the metrics methods don't do anything when disabled
117114
metrics.count_successful_items(5, _REQUEST)
118-
metrics.count_dropped_items(3, _REQUEST, DropCode.CLIENT_EXCEPTION, "Test exception")
115+
metrics.count_dropped_items(3, _REQUEST, DropCode.CLIENT_EXCEPTION, _exception_categories.NETWORK_EXCEPTION.value)
119116

120117
# Verify callbacks return empty lists when disabled
121118
self.assertEqual(metrics._item_success_callback(mock.Mock()), [])
@@ -270,7 +267,7 @@ def patched_transmit(self_exporter, envelopes):
270267
if should_fail:
271268
nonlocal dropped_items
272269

273-
failure_type = random.choice(["http_status", "client_exception"])
270+
failure_type = random.choice(["http_status", "exception"])
274271

275272
if failure_type == "http_status":
276273
status_codes = [401, 401, 403, 500, 500, 503, 402]
@@ -282,38 +279,12 @@ def patched_transmit(self_exporter, envelopes):
282279
metrics.count_dropped_items(failure_count, telemetry_type, status_code, None)
283280
else:
284281
exception_scenarios = [
285-
"timeout_exception"
286-
"Connection timed out after 30 seconds",
287-
"Request timed out after 60 seconds",
288-
"Operation timed out",
289-
290-
"network_exception",
291-
"Network connection failed: Connection refused",
292-
"Network error: Host unreachable",
293-
294-
"authentication_exception",
295-
"Authentication failed: Invalid credentials",
296-
"Auth error: Token expired",
297-
298-
"Failed to parse response: Invalid JSON format",
299-
"Parse error: Malformed XML",
300-
"parse_exception",
301-
302-
"Out of memory: Cannot allocate buffer",
303-
"Memory allocation failed",
304-
"memory_exception",
305-
306-
"HTTP 401 Unauthorized",
307-
"HTTP 401 Invalid token",
308-
"HTTP 500 Internal Server Error",
309-
"HTTP 500 Database error",
310-
311-
"Unknown transmission error",
312-
"Unexpected error occurred"
313-
314-
"storage_exception",
315-
"other_exception"
282+
_exception_categories.CLIENT_EXCEPTION.value,
283+
_exception_categories.NETWORK_EXCEPTION.value,
284+
_exception_categories.STORAGE_EXCEPTION.value,
285+
_exception_categories.TIMEOUT_EXCEPTION.value
316286
]
287+
317288

318289
exception_message = random.choice(exception_scenarios)
319290

@@ -448,7 +419,7 @@ def patched_transmit(self_exporter, envelopes):
448419
if should_retry:
449420
nonlocal retried_items
450421

451-
retry_type = random.choice(["http_status", "client_timeout", "unknown"])
422+
retry_type = random.choice(["http_status", "Client timeout", "Unknown"])
452423

453424
if retry_type == "http_status":
454425
# HTTP status codes that would trigger retries
@@ -459,7 +430,7 @@ def patched_transmit(self_exporter, envelopes):
459430
retried_items += failure_count
460431

461432
metrics.count_retry_items(failure_count, telemetry_type, status_code, None)
462-
elif retry_type == "client_timeout":
433+
elif retry_type == "Client timeout":
463434
timeout_messages = [
464435
"Connection timed out after 30 seconds",
465436
"Request timed out after 60 seconds",
@@ -477,10 +448,10 @@ def patched_transmit(self_exporter, envelopes):
477448
else:
478449
# Unknown retry reasons
479450
unknown_messages = [
480-
"Unknown network error",
481-
"Unexpected retry condition",
482-
"Network instability detected",
483-
"Connection reset by peer"
451+
_exception_categories.CLIENT_EXCEPTION.value,
452+
_exception_categories.NETWORK_EXCEPTION.value,
453+
_exception_categories.STORAGE_EXCEPTION.value,
454+
_exception_categories.TIMEOUT_EXCEPTION.value
484455
]
485456

486457
exception_message = random.choice(unknown_messages)

0 commit comments

Comments
 (0)