Skip to content

Commit fd93d9e

Browse files
fix: Make OTLP metrics snapshot testing more stable (#7374)
## Summary of changes Makes the OTLP metrics snapshots more stable by adding ordering, doing some deduplication (that wasn't reproduced locally), and consolidating data series by their identifying Resources/Scopes ## Reason for change We had to mark this test as Flaky, but we really want to make sure it's stable so we can correctly do regression tests against it. ## Implementation details Processes the OTLP metrics data after receiving it from the mock tracer agent. This includes: - Grouping multiple series of ScopeMetrics into one series if their Resources match (this is expected and asserted to be true) - Keeping only one metric data point from a Metric with the same Resource & InstrumentationScope & Metric Name (should fix the issue with multiple exports of the same asynchronous counter) - Scrubs the `telemetry.sdk.version` attribute with a dummy value so we can successfully test against multiple package versions ## Test coverage This is the test coverage. ## Other details
1 parent 5676931 commit fd93d9e

File tree

5 files changed

+754
-723
lines changed

5 files changed

+754
-723
lines changed

tracer/test/Datadog.Trace.ClrProfiler.IntegrationTests/OpenTelemetrySdkTests.cs

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,6 @@ public async Task IntegrationDisabled(string packageVersion)
202202
[SkippableTheory]
203203
[Trait("Category", "EndToEnd")]
204204
[Trait("RunOnWindows", "True")]
205-
[Flaky("The received payload is super flaky, sometimes we receive multiple payloads", maxRetries: 3)]
206205
[MemberData(nameof(PackageVersions.OpenTelemetry), MemberType = typeof(PackageVersions))]
207206
public async Task SubmitsOtlpMetrics(string packageVersion)
208207
{
@@ -235,22 +234,66 @@ public async Task SubmitsOtlpMetrics(string packageVersion)
235234

236235
metricRequests.Should().NotBeEmpty("Expected OTLP metric requests were not received.");
237236

238-
// TODO: We only expect one metric request in the snapshot, but we could actually have multiple payloads, some of which
239-
// could contain no metrics, some of which could contain _duplicate_ metrics. Flake central.
240-
// To fix it we need to deserialize the data into a "real" payload, and aggregate the metrics as appropriate,
241-
// to make sure that we're actually sending the "correct" results. For example, gauge values should not change,
242-
// count values should be 0 in subsequent payloads etc.
243-
var snapshotPayload = metricRequests
244-
.Select(r => r.DeserializedData)
245-
.ToList();
237+
// Group the scope metrics by the resource metrics and schema URL (should only be one unique combination)
238+
var resourceMetricByResource = metricRequests
239+
.SelectMany(r => r.MetricsData.ResourceMetrics)
240+
.GroupBy(r => new Tuple<global::OpenTelemetry.Proto.Resource.V1.Resource, string>(r.Resource, r.SchemaUrl))
241+
.Should()
242+
.ContainSingle()
243+
.Subject;
244+
245+
// Group the individual metrics by scope metric and schema URL (should only be one unique combination since we're only using one ActivitySource)
246+
// This may result in multiple entries for metrics that are repeated multiple times before the test exits
247+
var scopeMetricsByResource = resourceMetricByResource
248+
.SelectMany(r => r.ScopeMetrics)
249+
.GroupBy(r => new Tuple<global::OpenTelemetry.Proto.Common.V1.InstrumentationScope, string>(r.Scope, r.SchemaUrl))
250+
.OrderBy(group => group.Key.Item1.Name);
251+
252+
var scopeMetrics = new List<object>();
253+
foreach (var scopeMetricByResource in scopeMetricsByResource)
254+
{
255+
var metrics = scopeMetricByResource
256+
.SelectMany(r => r.Metrics)
257+
.GroupBy(r => r.Name)
258+
.OrderBy(group => group.Key)
259+
.Select(group => group.First())
260+
.ToList();
261+
262+
scopeMetrics.Add(new
263+
{
264+
Scope = scopeMetricByResource.Key.Item1,
265+
Metrics = metrics,
266+
SchemaUrl = scopeMetricByResource.Key.Item2
267+
});
268+
}
269+
270+
// Filter out the telemetry resource name, if any
271+
foreach (var attribute in resourceMetricByResource.Key.Item1.Attributes)
272+
{
273+
if (attribute.Key.Equals("telemetry.sdk.version"))
274+
{
275+
attribute.Value.StringValue = "sdk-version";
276+
}
277+
}
278+
279+
// Although there's only one resource, let's still emit snapshot data in the expected array format
280+
var resourceMetrics = new object[]
281+
{
282+
new
283+
{
284+
Resource = resourceMetricByResource.Key.Item1,
285+
ScopeMetrics = scopeMetrics,
286+
SchemaUrl = resourceMetricByResource.Key.Item2,
287+
}
288+
};
246289

247290
var settings = VerifyHelper.GetSpanVerifierSettings();
248291
settings.AddRegexScrubber(_timeUnixNanoRegexMetrics, @"TimeUnixNano"": <DateTimeOffset.Now>");
249292

250293
var suffix = GetSuffix(packageVersion);
251294
var fileName = $"{nameof(OpenTelemetrySdkTests)}.SubmitsOtlpMetrics{suffix}{snapshotName}";
252295

253-
await Verifier.Verify(snapshotPayload, settings)
296+
await Verifier.Verify(resourceMetrics, settings)
254297
.UseFileName(fileName)
255298
.DisableRequireUniquePrefix();
256299
}

tracer/test/Datadog.Trace.TestHelpers/MockTracerAgent.cs

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -976,34 +976,28 @@ private void HandlePotentialOtlpData(MockHttpRequest request)
976976
h => h.Value.ToList(),
977977
StringComparer.OrdinalIgnoreCase);
978978

979-
object deserializedData = null;
979+
MetricsData metricsData = null;
980980

981981
if (request.PathAndQuery.StartsWith("/v1/metrics") && contentType.Contains("protobuf"))
982982
{
983983
try
984984
{
985-
var metricsData = MetricsData.Parser.ParseFrom(body);
986-
var resource = metricsData.ResourceMetrics;
987-
988-
if (resource is not null)
989-
{
990-
deserializedData = resource;
991-
}
985+
metricsData = MetricsData.Parser.ParseFrom(body);
992986
}
993987
catch (Exception ex)
994988
{
995989
Output?.WriteLine($"[OTLP] Failed to deserialize metrics data: {ex.Message}");
996990
}
997991
}
998992

999-
if (deserializedData is not null)
993+
if (metricsData is not null)
1000994
{
1001995
OtlpRequests.Enqueue(new MockOtlpRequest(
1002996
request.PathAndQuery,
1003997
headersDict,
1004998
body,
1005999
contentType,
1006-
deserializedData));
1000+
metricsData));
10071001
}
10081002
}
10091003

@@ -1704,13 +1698,13 @@ private async Task HandleUdsTraces()
17041698

17051699
public class MockOtlpRequest
17061700
{
1707-
public MockOtlpRequest(string pathAndQuery, Dictionary<string, List<string>> headers, byte[] body, string contentType, object deserializedData)
1701+
public MockOtlpRequest(string pathAndQuery, Dictionary<string, List<string>> headers, byte[] body, string contentType, MetricsData metricsData)
17081702
{
17091703
PathAndQuery = pathAndQuery;
17101704
Headers = headers;
17111705
Body = body;
17121706
ContentType = contentType;
1713-
DeserializedData = deserializedData;
1707+
MetricsData = metricsData;
17141708
}
17151709

17161710
public string PathAndQuery { get; }
@@ -1724,7 +1718,7 @@ public MockOtlpRequest(string pathAndQuery, Dictionary<string, List<string>> hea
17241718
/// <summary>
17251719
/// Gets the deserialized protobuf data (if available)
17261720
/// </summary>
1727-
public object DeserializedData { get; }
1721+
public MetricsData MetricsData { get; }
17281722
}
17291723
}
17301724
}

0 commit comments

Comments
 (0)