# Node Liveness and Sweeper (Hub)

This document explains how the Hub tracks worker node liveness, the configuration knobs, and what happens when a node becomes stale or disappears. It also outlines how orphaned sessions are reclaimed to avoid capacity leaks.

## Overview
- Workers periodically emit heartbeats to Redis, updating:
  - `node:{nodeId}` hash fields: `LastSeen` (ISO-8601 UTC), `Labels` (JSON), `Capacity`
  - membership in the `nodes` set (`nodeId`)
  - the `node_alive:{nodeId}` key with a TTL (default 90s in the Worker)
- The Hub runs a background NodeSweeperService that periodically scans for stale nodes and prunes associated capacity entries. It complements the Worker heartbeats by performing garbage collection when nodes are dead or unreachable.
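The heartbeat writes described above can be sketched as follows. This is a minimal illustration: the key names and TTL match the document, while the `FakeRedis` helper, the `heartbeat` function name, and the field values are assumptions for the sketch, not the actual Worker implementation.

```python
import json
from datetime import datetime, timezone

class FakeRedis:
    """Minimal in-memory stand-in for the Redis commands a heartbeat uses."""
    def __init__(self):
        self.hashes, self.sets, self.ttl_keys = {}, {}, {}
    def hset(self, key, mapping):
        self.hashes.setdefault(key, {}).update(mapping)
    def sadd(self, key, member):
        self.sets.setdefault(key, set()).add(member)
    def set(self, key, value, ex=None):
        self.ttl_keys[key] = (value, ex)

def heartbeat(redis, node_id, labels, capacity, alive_ttl=90):
    """One heartbeat pass: refresh the node hash, set membership, and the liveness TTL."""
    now = datetime.now(timezone.utc).isoformat()  # ISO-8601 UTC LastSeen
    redis.hset(f"node:{node_id}", {"LastSeen": now,
                                   "Labels": json.dumps(labels),
                                   "Capacity": capacity})
    redis.sadd("nodes", node_id)                           # keep nodeId in the "nodes" set
    redis.set(f"node_alive:{node_id}", "1", ex=alive_ttl)  # default 90s TTL
```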

## Configuration (environment variables)
- `HUB_NODE_TIMEOUT`: seconds of inactivity before a node is considered stale. Default: 60.
- `HUB_SWEEPER_EXPIRE`: if true, the sweeper actually expires nodes and prunes their data. If false, it refreshes a short TTL on `node_alive:{nodeId}` and logs what would happen. Default: false (dry run).
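Parsing these two knobs with their documented defaults can be sketched as below; the `read_sweeper_config` helper name is an assumption, only the variable names and defaults come from this document.

```python
import os

def read_sweeper_config(env=os.environ):
    """Parse the sweeper's environment knobs, falling back to the documented defaults."""
    timeout_s = int(env.get("HUB_NODE_TIMEOUT", "60"))                 # stale after 60s
    expire = env.get("HUB_SWEEPER_EXPIRE", "false").lower() == "true"  # dry run by default
    return timeout_s, expire
```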

## How the sweeper works
1) Tick interval: the service performs a pass roughly every 20 seconds.
2) For each nodeId in the Redis set `nodes`:
   - If `node_alive:{nodeId}` exists → the node is healthy; skip it.
   - Otherwise, parse the `node:{nodeId}` `LastSeen` field (strict ISO-8601 round-trip format). If it is missing, invalid, or older than HUB_NODE_TIMEOUT → the node is a candidate for expiration.
   - Clock-skew tolerance: if LastSeen is more than 5s in the future, do not expire.
   - Double check: if `node_alive:{nodeId}` reappears during the pass, skip the node to avoid racing a fresh heartbeat.
   - If available:* entries still reference this node, treat it as alive and refresh the `node_alive` TTL to 30s, skipping expiration for this tick. This avoids evicting a node that is actively serving capacity but briefly missed a heartbeat.
3) When expiring (HUB_SWEEPER_EXPIRE=true):
   - Remove nodeId from the `nodes` set and delete the `node:{nodeId}` hash.
   - Prune available:* lists: remove entries referencing this nodeId.
   - Prune inuse:* lists (new): remove entries referencing this nodeId and, best-effort, delete the lightweight `browser_run:{browserId}` and `browser_test:{browserId}` mappings when a browserId is present. This reclaims capacity that would otherwise stay stuck.
4) Logs include per-tick stats: scanned, expired, errors, and tick duration.
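The per-node decision logic in step 2 can be condensed into a single classifier, sketched below. This is a simplified model, not the Hub's code: `classify_node` and its in-memory arguments are assumptions; the timeout, the 5-second skew tolerance, and the skip/refresh/expire outcomes follow the steps above.

```python
from datetime import datetime, timedelta, timezone

CLOCK_SKEW_TOLERANCE = timedelta(seconds=5)

def classify_node(node_id, alive_keys, last_seen_iso, available_refs,
                  timeout_s, now=None):
    """Decide the sweeper's action for one node: 'skip', 'refresh', or 'expire'."""
    now = now or datetime.now(timezone.utc)
    if f"node_alive:{node_id}" in alive_keys:
        return "skip"                                  # healthy heartbeat TTL present
    try:
        last_seen = datetime.fromisoformat(last_seen_iso)
    except (TypeError, ValueError):
        last_seen = None                               # missing/invalid LastSeen → candidate
    if last_seen is not None:
        if last_seen - now > CLOCK_SKEW_TOLERANCE:
            return "skip"                              # future timestamp: clock skew
        if now - last_seen <= timedelta(seconds=timeout_s):
            return "skip"                              # seen recently enough
    if node_id in available_refs:
        return "refresh"                               # still serving capacity: refresh TTL
    return "expire"
```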

## Why prune inuse:* too?
Previously, only available:* lists were pruned. If a node died while a browser was borrowed (inuse:*), that capacity remained stuck. The sweeper now removes those orphaned records and clears the run/test mappings so new borrows are not blocked by phantom in-use entries.

## Related components
- Worker HeartbeatService: updates LastSeen and sets the `node_alive` TTL so healthy nodes are never swept.
- RunCleanupService: a separate Hub background service that can auto-return outstanding browsers when runs become inactive or exceed their maximum duration. It operates at the run level, whereas NodeSweeperService operates at the node level.

## Operational tips
- To observe sweeper behavior quickly when testing locally:
  - Set HUB_NODE_TIMEOUT=5 and HUB_SWEEPER_EXPIRE=true on the Hub.
  - Stop a worker to simulate a dead node.
  - Watch the Hub logs for "[Sweeper] Expiring node=..." and pruning messages.
- In CI or during cautious rollouts, set HUB_SWEEPER_EXPIRE=false to dry-run. The sweeper will log what it would do and refresh a short `node_alive` TTL instead of deleting anything.

## Metrics
- The sweeper itself does not currently expose Prometheus metrics, but the overall pool gauges (available counts per label) are updated elsewhere. Consider adding sweeper-specific counters if ops visibility requires them.

## Security considerations
- The sweeper only reads and writes keys used by the grid. The keys it deletes are specific to the expired node or to browserId mappings captured from the in-use entries.

## Version
- Orphaned in-use pruning was introduced in this repository session (2025-08-31).

## Interpreting Sweeper logs
- The service logs a summary at the end of each pass, e.g.: [Sweeper] Tick done: scanned=3 expired=0 errors=0 took=2ms
  - scanned=N: the number of nodeIds in the Redis set `nodes` evaluated this tick.
  - expired=N: how many nodes were actually expired (removed and pruned) this tick. This stays 0 when:
    - nodes are healthy (a `node_alive:{nodeId}` TTL is present), or
    - LastSeen is within HUB_NODE_TIMEOUT, or
    - HUB_SWEEPER_EXPIRE=false (dry-run mode), or
    - the sweeper found active available:* entries for the node and refreshed a short TTL instead of expiring it.
  - errors=N: the number of caught exceptions during processing (per-node or loop-level). A non-zero value suggests Redis or parsing issues.
  - took=Xms: how long the entire sweep iteration took, in milliseconds.
- If you consistently see scanned>0 with expired=0, heartbeats are typically healthy and no nodes are stale.
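For dashboards or log alerts, the summary line above can be parsed mechanically. A minimal sketch, assuming the exact format shown in this document; `parse_tick_summary` is a hypothetical helper, not part of the Hub.

```python
import re

# Matches the per-tick summary line documented above.
SUMMARY_RE = re.compile(
    r"\[Sweeper\] Tick done: scanned=(\d+) expired=(\d+) errors=(\d+) took=(\d+)ms")

def parse_tick_summary(line):
    """Extract per-tick stats from a sweeper summary log line, or None if it doesn't match."""
    m = SUMMARY_RE.search(line)
    if not m:
        return None
    scanned, expired, errors, took_ms = map(int, m.groups())
    return {"scanned": scanned, "expired": expired,
            "errors": errors, "took_ms": took_ms}
```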