Skip to content

Commit d441af7

Browse files
committed
Add status endpoint to separate webserver
The /internal/v4/status endpoint gives more fine-grained insight into the health state of the CC. When at least one Puma worker is idle, it returns OK. If all workers are occupied, it returns BUSY. If all workers are busy and no requests have been processed for 60s, it returns UNHEALTHY. This endpoint will be used to prevent restarts while the CC is still healthy and merely working off peak loads.
1 parent 7c22bc9 commit d441af7

File tree

1 file changed

+53
-2
lines changed

1 file changed

+53
-2
lines changed

lib/cloud_controller/runner.rb

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,17 @@ def setup_metrics
136136
setup_metrics_webserver
137137
end
138138

139-
# The webserver runs in the main process and serves only the metrics endpoint.
140-
# This makes it possible to retrieve metrics even if all Puma workers of the main app are busy.
139+
# The webserver runs in the main process and serves only the metrics and status endpoints.
140+
# This makes it possible to retrieve both even if all Puma workers of the main app are busy.
141141
def setup_metrics_webserver
142+
readiness_status_proc = method(:status)
142143
metrics_app = Rack::Builder.new do
143144
use Prometheus::Middleware::Exporter, path: '/internal/v4/metrics'
144145

146+
map '/internal/v4/status' do
147+
run ->(_env) { readiness_status_proc.call }
148+
end
149+
145150
map '/' do
146151
run lambda { |_env|
147152
# Return 404 for any other request
@@ -163,6 +168,52 @@ def setup_metrics_webserver
163168
end
164169
end
165170

171+
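# State kept between calls to the status endpoint: the last observed sum of the
# workers' request counters and the moment that sum was last seen to increase.
# NOTE(review): these assignments appear to sit outside any method body, so they
# would set ivars on the enclosing class/module rather than on the object that
# runs #status (where unset ivars read as nil anyway) - confirm this is intended.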
172+
@previous_requests_count_sum = nil
173+
@last_requests_count_increase_time = nil
174+
175+
# Rack handler body for the status endpoint. Derives a coarse health state
# from Puma's worker statistics and returns it as a Rack response triplet.
#
# State carried between calls (instance variables):
#   @previous_requests_count_sum       - request-count sum seen on the previous call
#   @last_requests_count_increase_time - monotonic timestamp (seconds) of the last
#                                        call on which that sum had increased
#
# @return [Array(Integer, Hash, Array<String>)]
#   200 'OK'        - at least one worker is not fully busy
#   429 'BUSY'      - all workers are busy, but the request count increased
#                     within the last 60 seconds
#   503 'UNHEALTHY' - all workers are busy and the request count has not
#                     increased for more than 60 seconds
#   500             - any StandardError raised while evaluating the stats
def status
  worker_statuses = Puma.stats_hash[:worker_status]
  all_busy = all_workers_busy?(worker_statuses)
  current_requests_count_sum = worker_requests_count_sum(worker_statuses)

  # Use the monotonic clock for the stall window: wall-clock time can jump
  # (NTP step, manual adjustment), which would distort the elapsed-time check.
  now = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # Remember when the request counter last moved forward. The very first
  # call (no previous sum yet) starts the window as well.
  previous_sum = @previous_requests_count_sum
  @last_requests_count_increase_time = now if previous_sum.nil? || current_requests_count_sum > previous_sum
  @previous_requests_count_sum = current_requests_count_sum

  headers = { 'Content-Type' => 'text/plain' }
  return [200, headers, ['OK']] unless all_busy

  last_increase = @last_requests_count_increase_time
  stalled = last_increase && (now - last_increase) > 60

  if stalled
    [503, headers, ['UNHEALTHY']]
  else
    [429, headers, ['BUSY']]
  end
rescue StandardError => e
  [500, { 'Content-Type' => 'text/plain' }, ["Readiness check error: #{e}"]]
end
204+
205+
# Tells whether every worker reports as many busy threads as running threads.
#
# @param worker_statuses [Enumerable<Hash>] entries carrying a :last_status
#   hash with :busy_threads and :running
# @return [Boolean] true when no worker has a busy-thread count that differs
#   from its running-thread count. An empty collection therefore yields true,
#   and so does a worker whose two values are both missing (nil == nil).
def all_workers_busy?(worker_statuses)
  worker_statuses.none? do |entry|
    last_status = entry[:last_status]
    last_status[:busy_threads] != last_status[:running]
  end
end
210+
211+
# Adds up the :requests_count reported in each worker's :last_status.
#
# @param worker_statuses [Enumerable<Hash>] entries carrying a :last_status hash
# @return [Integer] the total; a worker without a :requests_count contributes 0
def worker_requests_count_sum(worker_statuses)
  per_worker_counts = worker_statuses.map do |entry|
    reported = entry[:last_status][:requests_count]
    reported || 0
  end
  per_worker_counts.sum
end
216+
166217
def setup_logging
167218
return if @setup_logging
168219

0 commit comments

Comments
 (0)