-
Notifications
You must be signed in to change notification settings - Fork 0
no compatible agents alerts only if waiting 30+ minutes #122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2a00eaa
108c9af
513d064
88a3330
34ce660
2d110d6
85ddc31
fa67b5c
4cc6653
677e888
3e7093c
81b4268
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,9 @@ | ||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.Linq; | ||
| using System.Net.Http; | ||
| using System.Net.Http.Json; | ||
| using System.Text.Json; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Extensions.Configuration; | ||
|
|
@@ -15,6 +18,7 @@ class TeamCityCompatibleAgentsScraper : BackgroundService | |
| readonly IMetricFactory metricFactory; | ||
| readonly IConfiguration configuration; | ||
| readonly HashSet<(string buildTypeId, string buildId, string queuedDateTime)> seenBuildsNoAgents = new(); | ||
| readonly HttpClient httpClient = new(); | ||
|
|
||
| public TeamCityCompatibleAgentsScraper(IMetricFactory metricFactory, IConfiguration configuration, ILogger logger) | ||
| : base(logger.ForContext("Scraper", nameof(TeamCityCompatibleAgentsScraper))) | ||
|
|
@@ -25,47 +29,111 @@ public TeamCityCompatibleAgentsScraper(IMetricFactory metricFactory, IConfigurat | |
|
|
||
| protected override TimeSpan DelayBetweenScrapes => TimeSpan.FromSeconds(60); | ||
|
|
||
| protected override async Task Scrape(CancellationToken stoppingToken) | ||
| async Task<QueuedWaitReasonsResponse> GetQueuedWaitReasons(string teamCityUrl, string teamCityToken, string buildId, bool useSSL) | ||
| { | ||
| await Task.CompletedTask; | ||
| var protocol = useSSL ? "https" : "http"; | ||
| var url = $"{protocol}://{teamCityUrl}/app/rest/buildQueue/id:{buildId}?fields=queuedWaitReasons(property(name,value))"; | ||
|
|
||
| var request = new HttpRequestMessage(HttpMethod.Get, url); | ||
| request.Headers.Add("Authorization", $"Bearer {teamCityToken}"); | ||
| request.Headers.Add("Accept", "application/json"); | ||
|
|
||
| var response = await httpClient.SendAsync(request); | ||
| response.EnsureSuccessStatusCode(); | ||
|
|
||
| return await response.Content.ReadFromJsonAsync<QueuedWaitReasonsResponse>(new JsonSerializerOptions | ||
| { | ||
| PropertyNameCaseInsensitive = true | ||
| }); | ||
| } | ||
|
|
||
| protected override async Task Scrape(CancellationToken stoppingToken) | ||
| { | ||
| var teamCityToken = configuration.GetValue<string>("TEAMCITY_TOKEN"); | ||
| var teamCityUrl = configuration.GetValue<string>("BUILD_SERVER_URL"); | ||
| var useSSL = configuration.GetValue<bool>("USE_SSL"); | ||
| var teamCityClient = new TeamCityClient(teamCityUrl, useSSL); | ||
|
|
||
| teamCityClient.ConnectWithAccessToken(teamCityToken); | ||
|
|
||
| // only look at builds that have been queued for 30 minutes | ||
| var thirtyMinutesAgo = DateTime.UtcNow.AddMinutes(-30); | ||
| var queuedBuilds = teamCityClient.BuildQueue | ||
| .GetFields("count,build(id,waitReason,buildTypeId,queuedDate,compatibleAgents(count,agent(id)))") | ||
| .All() | ||
| // exclude builds with no wait reason - these are the ones that are 'starting shortly' | ||
| .Where(qb => qb.WaitReason != null) | ||
| .ToArray(); | ||
|
|
||
| // Track builds with no compatible agents | ||
| var buildsNoCompatibleAgents = queuedBuilds | ||
| .Where(qb => qb.WaitReason == "There are no idle compatible agents which can run this build") | ||
| .Where(qb => qb.CompatibleAgents?.Agent == null || qb.CompatibleAgents.Agent.Count == 0) | ||
| .Where(qb => qb.QueuedDate <= thirtyMinutesAgo) | ||
| .ToArray(); | ||
|
|
||
| var noAgentsGauge = metricFactory.CreateGauge("queued_builds_no_compatible_agents", "Queued builds waiting with no compatible agents available", "buildTypeId", "buildId", "queuedDateTime"); | ||
|
|
||
| foreach (var build in buildsNoCompatibleAgents) | ||
| var buildsNoCompatibleAgents = new List<(string buildTypeId, string buildId, string queuedDateTime)>(); | ||
|
|
||
| foreach (var build in queuedBuilds) | ||
| { | ||
| noAgentsGauge.WithLabels(build.BuildTypeId, build.Id, build.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ")).Set(1); | ||
| Logger.Debug("Build Type {BuildTypeId}, build ID {BuildId} has no compatible agents, queued at {QueuedDateTime}", build.BuildTypeId, build.Id, build.QueuedDate); | ||
| // Fetch queuedWaitReasons from TeamCity API | ||
| QueuedWaitReasonsResponse waitReasonsResponse = null; | ||
| try | ||
| { | ||
| waitReasonsResponse = await GetQueuedWaitReasons(teamCityUrl, teamCityToken, build.Id, useSSL); | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| Logger.Warning(ex, "Failed to fetch queuedWaitReasons for build {BuildId}", build.Id); | ||
| } | ||
|
|
||
| // Check if this build has the "no compatible agents" wait reason | ||
| var noAgentsWaitReason = waitReasonsResponse?.QueuedWaitReasons?.Property | ||
| ?.FirstOrDefault(p => p.Name == "There are no idle compatible agents which can run this build"); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Usually we would extract this kind of 'magic string' out to a named constant. |
||
|
|
||
| if (noAgentsWaitReason != null && !string.IsNullOrEmpty(noAgentsWaitReason.Value)) | ||
| { | ||
| // Parse the wait time in milliseconds and convert to minutes | ||
| if (long.TryParse(noAgentsWaitReason.Value, out var milliseconds)) | ||
| { | ||
| var waitTimeMinutes = Math.Round(milliseconds / (60.0 * 1000.0)); | ||
|
|
||
| // Only track builds that have been waiting for more than 30 minutes | ||
| if (waitTimeMinutes > 30) | ||
|
Comment on lines
+90
to
+98
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suspect we could refactor this as LINQ, but for now keep it if it makes the logic easier to understand. |
||
| { | ||
| buildsNoCompatibleAgents.Add((build.BuildTypeId, build.Id, build.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ"))); | ||
|
|
||
| noAgentsGauge.WithLabels(build.BuildTypeId, build.Id, build.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ")).Set(1); | ||
| Logger.Information("ALERT: Build Type {BuildTypeId}, build ID {BuildId} has been waiting with no compatible agents for {WaitTimeMinutes} minutes (threshold exceeded)", | ||
| build.BuildTypeId, build.Id, waitTimeMinutes); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var currentBuildsNoAgents = buildsNoCompatibleAgents.Select(b => (b.BuildTypeId, b.Id, b.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ"))).ToArray(); | ||
| seenBuildsNoAgents.UnionWith(currentBuildsNoAgents); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a reason we aren't unioning any more? I think the absent builds won't work without it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, just saw it lower down. Is that intentional?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It doesn't affect the logic whether it happens above or below. It just made sense to me for it to happen at the end of the function. |
||
| var absentBuildsNoAgents = seenBuildsNoAgents.Except(currentBuildsNoAgents); | ||
| var currentBuildsNoAgents = buildsNoCompatibleAgents.ToArray(); | ||
| var absentBuildsNoAgents = seenBuildsNoAgents.Except(currentBuildsNoAgents).ToArray(); | ||
|
|
||
| foreach (var (buildTypeId, buildId, queuedDateTime) in absentBuildsNoAgents) | ||
| { | ||
| noAgentsGauge.RemoveLabelled(buildTypeId, buildId, queuedDateTime); | ||
| Logger.Debug("Build Type {BuildTypeId}, build ID {BuildId} queued at {QueuedDateTime} no longer waiting with no compatible agents", buildTypeId, buildId, queuedDateTime); | ||
| Logger.Information("RESOLVED: Build Type {BuildTypeId}, build ID {BuildId} queued at {QueuedDateTime} no longer waiting with no compatible agents", buildTypeId, buildId, queuedDateTime); | ||
| seenBuildsNoAgents.Remove((buildTypeId, buildId, queuedDateTime)); | ||
| } | ||
|
|
||
| seenBuildsNoAgents.UnionWith(currentBuildsNoAgents); | ||
| } | ||
| } | ||
|
|
||
| class QueuedWaitReasonsResponse | ||
| { | ||
| public QueuedWaitReasons QueuedWaitReasons { get; set; } | ||
| } | ||
|
|
||
| class QueuedWaitReasons | ||
| { | ||
| public List<WaitReasonProperty> Property { get; set; } | ||
| } | ||
|
|
||
| class WaitReasonProperty | ||
| { | ||
| public string Name { get; set; } | ||
| public string Value { get; set; } | ||
| } | ||
|
Comment on lines
+124
to
+138
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These might be able to be records. |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As we discussed, sometimes this isn't good, but for now let's leave it here and not instantiate one per invocation (because I honestly don't remember which way would be better given the request pattern here!)