Skip to content

Commit 0e1a44e

Browse files
authored
Ignore rejit timeout errors in smoke tests and report a metric (#7370)
## Summary of changes

Ignore the "rejit timed out" errors in smoke tests, but report a metric.

## Reason for change

We're still seeing flakiness from this. We don't want to increase the timeout further at this stage, so instead we allow the error and report a metric so that we can track the issue.

## Implementation details

Added a helper for sending a metric, and added a `reportablePatterns` list for the smoke tests.

## Test coverage

This is the test; we will see if any of the jobs hit it and 🤞

## Other details

We will set up a dashboard to monitor these (and other skip metrics) as part of the pre-release process.
1 parent de2b8fa commit 0e1a44e

File tree

2 files changed

+177
-5
lines changed

2 files changed

+177
-5
lines changed

tracer/build/_build/Build.Steps.cs

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2415,7 +2415,7 @@ string NormalizedPath(AbsolutePath ap)
24152415
Target CheckBuildLogsForErrors => _ => _
24162416
.Unlisted()
24172417
.Description("Reads the logs from build_data and checks for error lines")
2418-
.Executes(() =>
2418+
.Executes(async () =>
24192419
{
24202420
// we expect to see _some_ errors, so explicitly ignore them
24212421
var knownPatterns = new List<Regex>
@@ -2453,13 +2453,13 @@ string NormalizedPath(AbsolutePath ap)
24532453
new(@".*Some errors were found while applying waf configuration \(RulesFile: rasp-rule-set.json\).*", RegexOptions.Compiled),
24542454
};
24552455

2456-
CheckLogsForErrors(knownPatterns, allFilesMustExist: false, minLogLevel: LogLevel.Error);
2456+
await CheckLogsForErrors(knownPatterns, allFilesMustExist: false, minLogLevel: LogLevel.Error, new ());
24572457
});
24582458

24592459
Target CheckSmokeTestsForErrors => _ => _
24602460
.Unlisted()
24612461
.Description("Reads the logs from build_data and checks for error lines in the smoke test logs")
2462-
.Executes(() =>
2462+
.Executes(async () =>
24632463
{
24642464
var knownPatterns = new List<Regex>();
24652465

@@ -2507,10 +2507,18 @@ string NormalizedPath(AbsolutePath ap)
25072507

25082508
// glibc TLS-reuse bug warnings
25092509
knownPatterns.Add(new(@".*GLIBC version 2.34-2.36 has a TLS-reuse bug.*", RegexOptions.Compiled));
2510-
CheckLogsForErrors(knownPatterns, allFilesMustExist: true, minLogLevel: LogLevel.Warning);
2510+
2511+
// These patterns should be ignored, but we should send a metric when they occur
2512+
// so that we can track they don't happen too often and gate releases on them etc
2513+
var reportablePatterns = new List<(string IgnoreReasonTag, Regex Regex)>
2514+
{
2515+
new("rejit_thread_timeout", new(@".*Timeout while waiting for the rejit requests to be processed. Rejit will continue asynchronously, but some initial calls may not be instrumented.*", RegexOptions.Compiled))
2516+
};
2517+
2518+
await CheckLogsForErrors(knownPatterns, allFilesMustExist: true, minLogLevel: LogLevel.Warning, reportablePatterns);
25112519
});
25122520

2513-
private void CheckLogsForErrors(List<Regex> knownPatterns, bool allFilesMustExist, LogLevel minLogLevel)
2521+
private async Task CheckLogsForErrors(List<Regex> knownPatterns, bool allFilesMustExist, LogLevel minLogLevel, List<(string IgnoreReasonTag, Regex Regex)> reportablePatterns)
25142522
{
25152523
var logDirectory = BuildDataDirectory / "logs";
25162524
if (!logDirectory.Exists())
@@ -2523,6 +2531,8 @@ private void CheckLogsForErrors(List<Regex> knownPatterns, bool allFilesMustExis
25232531
}
25242532
}
25252533

2534+
Dictionary<string, int> reportableMetrics = new();
2535+
25262536
var managedFiles = logDirectory.GlobFiles("**/dotnet-tracer-managed-*");
25272537
var managedErrors = managedFiles
25282538
.SelectMany(ParseManagedLogFiles)
@@ -2565,6 +2575,12 @@ private void CheckLogsForErrors(List<Regex> knownPatterns, bool allFilesMustExis
25652575
|| nativeProfilerErrors.Count != 0
25662576
|| nativeLoaderErrors.Count != 0;
25672577

2578+
if (reportableMetrics.Count > 0)
2579+
{
2580+
Logger.Warning("Found reportable (but ignored) problems in the logs");
2581+
await MetricHelper.SendReportableErrorMetrics(Logger.Logger, reportableMetrics);
2582+
}
2583+
25682584
if (hasRequiredFiles && !hasErrors)
25692585
{
25702586
Logger.Information("No problems found in managed or native logs");
@@ -2628,6 +2644,16 @@ bool IsProblematic(ParsedLogLine logLine)
26282644
return false;
26292645
}
26302646

2647+
foreach (var pattern in reportablePatterns)
2648+
{
2649+
if (pattern.Regex.IsMatch(logLine.Message))
2650+
{
2651+
var previous = reportableMetrics.GetValueOrDefault(pattern.IgnoreReasonTag, 0);
2652+
reportableMetrics[pattern.IgnoreReasonTag] = previous + 1;
2653+
return false;
2654+
}
2655+
}
2656+
26312657
foreach (var pattern in knownPatterns)
26322658
{
26332659
if (pattern.IsMatch(logLine.Message))

tracer/build/_build/MetricHelper.cs

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
// <copyright file="MetricHelper.cs" company="Datadog">
2+
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2017 Datadog, Inc.
4+
// </copyright>
5+
6+
using System;
7+
using System.Collections.Generic;
8+
using System.Diagnostics.CodeAnalysis;
9+
using System.Linq;
10+
using System.Net.Http;
11+
using System.Text;
12+
using System.Threading.Tasks;
13+
using Serilog;
14+
15+
public static class MetricHelper
16+
{
17+
public static Task SendReportableErrorMetrics(ILogger log, Dictionary<string, int> errors)
18+
{
19+
if (errors.Count == 0)
20+
{
21+
return Task.CompletedTask;
22+
}
23+
24+
const string metricName = "dd_trace_dotnet.ci.smoke_tests.reportable_errors";
25+
26+
return SendMetric(log, metricName: metricName, errors.Select(kvp => CreatePoint(kvp.Key, kvp.Value)));
27+
28+
static string CreatePoint(string errorReason, int count)
29+
{
30+
var tags = $$"""
31+
"ci.stage:{{SanitizeTagValue(Environment.GetEnvironmentVariable("DD_LOGGER_SYSTEM_STAGEDISPLAYNAME"))}}",
32+
"ci.job:{{SanitizeTagValue(Environment.GetEnvironmentVariable("DD_LOGGER_SYSTEM_JOBDISPLAYNAME"))}}",
33+
"git.branch:{{SanitizeTagValue(Environment.GetEnvironmentVariable("DD_LOGGER_BUILD_SOURCEBRANCH"))}}",
34+
"error_reason:{{SanitizeTagValue(errorReason)}}"
35+
""";
36+
37+
return $$"""
38+
{
39+
"metric": "{{metricName}}",
40+
"type": 1,
41+
"points": [{
42+
"timestamp": {{((DateTimeOffset)DateTime.UtcNow).ToUnixTimeSeconds()}},
43+
"value": {{count}}
44+
}],
45+
"tags": [
46+
{{tags}}
47+
]
48+
}
49+
""";
50+
}
51+
}
52+
53+
private static async Task SendMetric(ILogger log, string metricName, IEnumerable<string> metrics)
54+
{
55+
var envKey = Environment.GetEnvironmentVariable("DD_LOGGER_DD_API_KEY");
56+
if (string.IsNullOrEmpty(envKey))
57+
{
58+
// We're probably not in CI
59+
log.Debug("No CI API Key found, skipping {MetricName} metric submission", metricName);
60+
return;
61+
}
62+
63+
var payload = $$"""{ "series": [{{string.Join(",", metrics)}}] }""";
64+
65+
try
66+
{
67+
using var client = new HttpClient();
68+
client.DefaultRequestHeaders.Add("DD-API-KEY", envKey);
69+
70+
var content = new StringContent(payload, Encoding.UTF8, "application/json");
71+
var response = await client.PostAsync("https://api.datadoghq.com/api/v2/series", content);
72+
var responseContent = await response.Content.ReadAsStringAsync();
73+
74+
var result = response.IsSuccessStatusCode
75+
? "Successfully submitted metric"
76+
: "Failed to submit metric";
77+
log.Warning("{Result} {MetricName}. Response was: Code: {ResponseStatusCode}. Response: {ResponseContent}. Payload sent was: \"{Payload}\"", result, metricName, response.StatusCode, responseContent, payload);
78+
}
79+
catch (Exception ex)
80+
{
81+
log.Error(ex, "Error sending {MetricName} metric to backend with payload \"{Payload}\"", metricName, payload);
82+
}
83+
}
84+
85+
private static string SanitizeTagValue(string tag)
86+
{
87+
// Copied from
88+
// SpanTagHelper.TryNormalizeTagName(tag, normalizeSpaces: true, out var normalizedTag);
89+
return TryNormalizeTagName(tag, normalizeSpaces: true, out var normalizedTag) ? normalizedTag : tag;
90+
91+
static bool TryNormalizeTagName(
92+
string value,
93+
bool normalizeSpaces,
94+
[NotNullWhen(returnValue: true)] out string? normalizedTagName)
95+
{
96+
normalizedTagName = null;
97+
98+
if (!IsValidTagName(value, out var trimmedValue))
99+
{
100+
return false;
101+
}
102+
103+
var sb = new StringBuilder(trimmedValue.Length);
104+
sb.Append(trimmedValue.ToLowerInvariant());
105+
106+
for (var x = 0; x < sb.Length; x++)
107+
{
108+
switch (sb[x])
109+
{
110+
case (>= 'a' and <= 'z') or (>= '0' and <= '9') or '_' or ':' or '/' or '-':
111+
continue;
112+
case ' ' when !normalizeSpaces:
113+
continue;
114+
default:
115+
sb[x] = '_';
116+
break;
117+
}
118+
}
119+
120+
normalizedTagName = sb.ToString();
121+
return true;
122+
}
123+
124+
static bool IsValidTagName(
125+
string value,
126+
[NotNullWhen(returnValue: true)] out string? trimmedValue)
127+
{
128+
trimmedValue = null;
129+
130+
if (string.IsNullOrWhiteSpace(value))
131+
{
132+
return false;
133+
}
134+
135+
var trimmedTemp = value.Trim();
136+
137+
if (!char.IsLetter(trimmedTemp[0]) || trimmedTemp.Length > 200)
138+
{
139+
return false;
140+
}
141+
142+
trimmedValue = trimmedTemp;
143+
return true;
144+
}
145+
}
146+
}

0 commit comments

Comments
 (0)