Skip to content
98 changes: 83 additions & 15 deletions source/Scrapers/TeamCityCompatibleAgentsScraper.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Net.Http.Json;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Configuration;
Expand All @@ -15,6 +18,7 @@ class TeamCityCompatibleAgentsScraper : BackgroundService
readonly IMetricFactory metricFactory;
readonly IConfiguration configuration;
readonly HashSet<(string buildTypeId, string buildId, string queuedDateTime)> seenBuildsNoAgents = new();
readonly HttpClient httpClient = new();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As we discussed: sharing a single long-lived HttpClient like this isn't always the right call, but for now let's leave it here rather than instantiating one per invocation (I honestly don't remember which way would be better given the request pattern here!)


public TeamCityCompatibleAgentsScraper(IMetricFactory metricFactory, IConfiguration configuration, ILogger logger)
: base(logger.ForContext("Scraper", nameof(TeamCityCompatibleAgentsScraper)))
Expand All @@ -25,47 +29,111 @@ public TeamCityCompatibleAgentsScraper(IMetricFactory metricFactory, IConfigurat

protected override TimeSpan DelayBetweenScrapes => TimeSpan.FromSeconds(60);

protected override async Task Scrape(CancellationToken stoppingToken)
// Serializer options shared by every wait-reasons request. System.Text.Json caches type
// metadata per options instance, so reusing one avoids rebuilding that cache on each call.
static readonly JsonSerializerOptions waitReasonsJsonOptions = new()
{
    PropertyNameCaseInsensitive = true
};

/// <summary>
/// Requests the <c>queuedWaitReasons</c> properties (name/value pairs) of one queued build
/// from the TeamCity REST endpoint <c>/app/rest/buildQueue/id:{buildId}</c>.
/// </summary>
/// <param name="teamCityUrl">Server address without a scheme; the scheme is chosen by <paramref name="useSSL"/>.</param>
/// <param name="teamCityToken">Access token sent as a <c>Bearer</c> Authorization header.</param>
/// <param name="buildId">Id of the queued build to look up.</param>
/// <param name="useSSL"><c>true</c> to use https, <c>false</c> to use http.</param>
/// <param name="cancellationToken">Optional token that cancels the HTTP call and the body read.</param>
/// <returns>The deserialized JSON response body.</returns>
/// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
async Task<QueuedWaitReasonsResponse> GetQueuedWaitReasons(string teamCityUrl, string teamCityToken, string buildId, bool useSSL, CancellationToken cancellationToken = default)
{
    var protocol = useSSL ? "https" : "http";

    // The build id becomes part of the URL path, so escape it; for plain alphanumeric ids this is a no-op.
    var escapedBuildId = Uri.EscapeDataString(buildId ?? string.Empty);
    var url = $"{protocol}://{teamCityUrl}/app/rest/buildQueue/id:{escapedBuildId}?fields=queuedWaitReasons(property(name,value))";

    // Both the request and the response are IDisposable; release them once the body has been read.
    using var request = new HttpRequestMessage(HttpMethod.Get, url);
    request.Headers.Add("Authorization", $"Bearer {teamCityToken}");
    request.Headers.Add("Accept", "application/json");

    using var response = await httpClient.SendAsync(request, cancellationToken);
    response.EnsureSuccessStatusCode();

    return await response.Content.ReadFromJsonAsync<QueuedWaitReasonsResponse>(waitReasonsJsonOptions, cancellationToken);
}

// Runs one scrape pass: reads TEAMCITY_TOKEN, BUILD_SERVER_URL and USE_SSL from configuration,
// lists the TeamCity build queue, and maintains the "queued_builds_no_compatible_agents" gauge
// for queued builds that have waited too long with no compatible agent.
// NOTE(review): this span is a rendered diff without +/- markers, so removed and added lines are
// interleaved below together with review-thread text — confirm against the real file which
// statements survive.
// NOTE(review): stoppingToken is not referenced anywhere in the visible body.
protected override async Task Scrape(CancellationToken stoppingToken)
{
var teamCityToken = configuration.GetValue<string>("TEAMCITY_TOKEN");
var teamCityUrl = configuration.GetValue<string>("BUILD_SERVER_URL");
var useSSL = configuration.GetValue<bool>("USE_SSL");
var teamCityClient = new TeamCityClient(teamCityUrl, useSSL);

teamCityClient.ConnectWithAccessToken(teamCityToken);

// only look at builds that have been queued for 30 minutes
var thirtyMinutesAgo = DateTime.UtcNow.AddMinutes(-30);
var queuedBuilds = teamCityClient.BuildQueue
.GetFields("count,build(id,waitReason,buildTypeId,queuedDate,compatibleAgents(count,agent(id)))")
.All()
// exclude builds with no wait reason - these are the ones that are 'starting shortly'
.Where(qb => qb.WaitReason != null)
.ToArray();

// Track builds with no compatible agents
// NOTE(review): this LINQ filter declares buildsNoCompatibleAgents, which is declared again as a
// List further down — presumably this is the removed (pre-change) version; confirm.
var buildsNoCompatibleAgents = queuedBuilds
.Where(qb => qb.WaitReason == "There are no idle compatible agents which can run this build")
.Where(qb => qb.CompatibleAgents?.Agent == null || qb.CompatibleAgents.Agent.Count == 0)
.Where(qb => qb.QueuedDate <= thirtyMinutesAgo)
.ToArray();

var noAgentsGauge = metricFactory.CreateGauge("queued_builds_no_compatible_agents", "Queued builds waiting with no compatible agents available", "buildTypeId", "buildId", "queuedDateTime");

// NOTE(review): presumably the pre-change loop header, superseded by the foreach over queuedBuilds below — confirm.
foreach (var build in buildsNoCompatibleAgents)
// Label tuples (buildTypeId, buildId, queuedDateTime) of the builds that exceed the threshold in this pass.
var buildsNoCompatibleAgents = new List<(string buildTypeId, string buildId, string queuedDateTime)>();

foreach (var build in queuedBuilds)
{
noAgentsGauge.WithLabels(build.BuildTypeId, build.Id, build.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ")).Set(1);
Logger.Debug("Build Type {BuildTypeId}, build ID {BuildId} has no compatible agents, queued at {QueuedDateTime}", build.BuildTypeId, build.Id, build.QueuedDate);
// Fetch queuedWaitReasons from TeamCity API
QueuedWaitReasonsResponse waitReasonsResponse = null;
try
{
waitReasonsResponse = await GetQueuedWaitReasons(teamCityUrl, teamCityToken, build.Id, useSSL);
}
// Best-effort lookup: a failure is logged and leaves waitReasonsResponse null, so the
// null-conditional chain below yields null and this build is not tracked in this pass.
catch (Exception ex)
{
Logger.Warning(ex, "Failed to fetch queuedWaitReasons for build {BuildId}", build.Id);
}

// Check if this build has the "no compatible agents" wait reason
var noAgentsWaitReason = waitReasonsResponse?.QueuedWaitReasons?.Property
?.FirstOrDefault(p => p.Name == "There are no idle compatible agents which can run this build");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Usually we would extract this kind of 'magic string' out to a const with a useful name like 'NoCompatibleAgentWaitReason' - not blocker, just FYI


if (noAgentsWaitReason != null && !string.IsNullOrEmpty(noAgentsWaitReason.Value))
{
// Parse the wait time in milliseconds and convert to minutes
if (long.TryParse(noAgentsWaitReason.Value, out var milliseconds))
{
// 60.0 * 1000.0 is the number of milliseconds per minute; the result is rounded to a whole
// number of minutes, so the "> 30" comparison below operates on whole minutes.
var waitTimeMinutes = Math.Round(milliseconds / (60.0 * 1000.0));

// Only track builds that have been waiting for more than 30 minutes
if (waitTimeMinutes > 30)
Comment on lines +90 to +98
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect we could refactor this as LINQ, but for now keep it if it makes the logic easier to understand.

{
buildsNoCompatibleAgents.Add((build.BuildTypeId, build.Id, build.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ")));

noAgentsGauge.WithLabels(build.BuildTypeId, build.Id, build.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ")).Set(1);
Logger.Information("ALERT: Build Type {BuildTypeId}, build ID {BuildId} has been waiting with no compatible agents for {WaitTimeMinutes} minutes (threshold exceeded)",
build.BuildTypeId, build.Id, waitTimeMinutes);
}
}
}
}

// NOTE(review): the Select below reads b.BuildTypeId / b.Id / b.QueuedDate, which the tuple list
// declared above does not expose — presumably removed lines from the old side of the diff; confirm.
var currentBuildsNoAgents = buildsNoCompatibleAgents.Select(b => (b.BuildTypeId, b.Id, b.QueuedDate.ToString("yyyy-MM-ddTHH:mm:ssZ"))).ToArray();
seenBuildsNoAgents.UnionWith(currentBuildsNoAgents);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason we aren't unioning any more? I think the absent builds won't work without it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, just saw it lower down. Is that intentional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't impact the logic if it happens above or below. It just made sense to me for it to happen at the end of the function.

var absentBuildsNoAgents = seenBuildsNoAgents.Except(currentBuildsNoAgents);
// Builds remembered in seenBuildsNoAgents that are not in this pass's list are treated as resolved.
// The ToArray() snapshot matters: the loop below removes entries from seenBuildsNoAgents.
var currentBuildsNoAgents = buildsNoCompatibleAgents.ToArray();
var absentBuildsNoAgents = seenBuildsNoAgents.Except(currentBuildsNoAgents).ToArray();

foreach (var (buildTypeId, buildId, queuedDateTime) in absentBuildsNoAgents)
{
// Presumably drops the gauge series for this label set — confirm against the metrics library.
noAgentsGauge.RemoveLabelled(buildTypeId, buildId, queuedDateTime);
Logger.Debug("Build Type {BuildTypeId}, build ID {BuildId} queued at {QueuedDateTime} no longer waiting with no compatible agents", buildTypeId, buildId, queuedDateTime);
Logger.Information("RESOLVED: Build Type {BuildTypeId}, build ID {BuildId} queued at {QueuedDateTime} no longer waiting with no compatible agents", buildTypeId, buildId, queuedDateTime);
seenBuildsNoAgents.Remove((buildTypeId, buildId, queuedDateTime));
}

// Remember this pass's builds so a later pass can detect when they are no longer reported.
seenBuildsNoAgents.UnionWith(currentBuildsNoAgents);
}
}

/// <summary>
/// Top-level shape of the JSON body deserialized by the wait-reasons request
/// (<c>fields=queuedWaitReasons(property(name,value))</c>).
/// </summary>
class QueuedWaitReasonsResponse
{
    /// <summary>Wrapper around the wait-reason entries; stays null unless it is assigned.</summary>
    public QueuedWaitReasons QueuedWaitReasons
    {
        get;
        set;
    }
}

/// <summary>
/// The <c>queuedWaitReasons</c> element of a build-queue response: a wrapper around a list of
/// name/value properties.
/// </summary>
class QueuedWaitReasons
{
    /// <summary>The individual wait-reason entries; stays null unless it is assigned.</summary>
    public List<WaitReasonProperty> Property
    {
        get;
        set;
    }
}

/// <summary>
/// One name/value pair from the <c>queuedWaitReasons</c> property list.
/// </summary>
class WaitReasonProperty
{
    /// <summary>The property name; the scraper compares it against the wait-reason text.</summary>
    public string Name
    {
        get;
        set;
    }

    /// <summary>The property value as a string; the scraper parses it as a millisecond count.</summary>
    public string Value
    {
        get;
        set;
    }
}
Comment on lines +124 to +138
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These might be able to be records instead, which would make all the getter/setter stuff simpler, but that's a later problem (if ever).

}