Skip to content

Commit 3635a6b

Browse files
authored
Add status endpoint to seperate webserver (#4441)
* Add status endpoint to seperate webserver The /internal/v4/status endpoint gives more fine-granular insights in the health state of the CC. When at leats one Puma worker is idling it returns OK. If all workers are occupied, it returns BUSY. If all workers are busy and since 60s no requests have been processed, it returns UNHEALTHY. This endpoint will be used to prevent restarts even if CC is still healthy and working off peak-loads. * Refactor runner and move seperate webserver into own class
1 parent deb5107 commit 3635a6b

File tree

5 files changed

+216
-47
lines changed

5 files changed

+216
-47
lines changed
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
require 'rack'
2+
require 'prometheus/middleware/exporter'
3+
4+
module VCAP
5+
module CloudController
6+
class MetricsWebserver
7+
attr_reader :app
8+
9+
def initialize
10+
@app = build_app
11+
end
12+
13+
def start(config)
14+
server = Puma::Server.new(@app)
15+
16+
if config.get(:nginx, :metrics_socket).nil? || config.get(:nginx, :metrics_socket).empty?
17+
server.add_tcp_listener('127.0.0.1', 9395)
18+
else
19+
server.add_unix_listener(config.get(:nginx, :metrics_socket))
20+
end
21+
22+
server.run
23+
end
24+
25+
private
26+
27+
def build_app
28+
status_proc = method(:status)
29+
Rack::Builder.new do
30+
use Prometheus::Middleware::Exporter, path: '/internal/v4/metrics'
31+
32+
map '/internal/v4/status' do
33+
run ->(_env) { status_proc.call }
34+
end
35+
36+
map '/' do
37+
run lambda { |_env|
38+
# Return 404 for any other request
39+
['404', { 'Content-Type' => 'text/plain' }, ['Not Found']]
40+
}
41+
end
42+
end
43+
end
44+
45+
def status
46+
stats = Puma.stats_hash
47+
worker_statuses = stats[:worker_status]
48+
49+
all_busy = all_workers_busy?(worker_statuses)
50+
current_requests_count_sum = worker_requests_count_sum(worker_statuses)
51+
52+
track_request_count_increase(current_requests_count_sum)
53+
54+
unhealthy = determine_unhealthy_state(all_busy)
55+
56+
build_status_response(all_busy, unhealthy)
57+
rescue StandardError => e
58+
[500, { 'Content-Type' => 'text/plain' }, ["Readiness check error: #{e}"]]
59+
end
60+
61+
def track_request_count_increase(current_requests_count_sum)
62+
now = Time.now
63+
prev = @previous_requests_count_sum
64+
65+
@last_requests_count_increase_time = now if prev.nil? || current_requests_count_sum > prev
66+
@previous_requests_count_sum = current_requests_count_sum
67+
end
68+
69+
def determine_unhealthy_state(all_busy)
70+
return false unless all_busy && @last_requests_count_increase_time
71+
72+
(Time.now - @last_requests_count_increase_time) > 60
73+
end
74+
75+
def build_status_response(all_busy, unhealthy)
76+
if all_busy && unhealthy
77+
[503, { 'Content-Type' => 'text/plain' }, ['UNHEALTHY']]
78+
elsif all_busy
79+
[429, { 'Content-Type' => 'text/plain' }, ['BUSY']]
80+
else
81+
[200, { 'Content-Type' => 'text/plain' }, ['OK']]
82+
end
83+
end
84+
85+
def all_workers_busy?(worker_statuses)
86+
worker_statuses.all? do |worker|
87+
worker[:last_status][:busy_threads] == worker[:last_status][:running]
88+
end
89+
end
90+
91+
def worker_requests_count_sum(worker_statuses)
92+
worker_statuses.sum do |worker|
93+
worker[:last_status][:requests_count] || 0
94+
end
95+
end
96+
end
97+
end
98+
end

lib/cloud_controller/runner.rb

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
require 'cloud_controller/secrets_fetcher'
1414
require 'cloud_controller/runners/thin_runner'
1515
require 'cloud_controller/runners/puma_runner'
16+
require 'cloud_controller/metrics_webserver'
1617
require 'prometheus/client/data_stores/direct_file_store'
1718
require 'prometheus/middleware/exporter'
1819

@@ -133,33 +134,8 @@ def setup_metrics
133134

134135
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir)
135136

136-
setup_metrics_webserver
137-
end
138-
139-
# The webserver runs in the main process and serves only the metrics endpoint.
140-
# This makes it possible to retrieve metrics even if all Puma workers of the main app are busy.
141-
def setup_metrics_webserver
142-
metrics_app = Rack::Builder.new do
143-
use Prometheus::Middleware::Exporter, path: '/internal/v4/metrics'
144-
145-
map '/' do
146-
run lambda { |_env|
147-
# Return 404 for any other request
148-
['404', { 'Content-Type' => 'text/plain' }, ['Not Found']]
149-
}
150-
end
151-
end
152-
153137
Thread.new do
154-
server = Puma::Server.new(metrics_app)
155-
156-
if config.get(:nginx, :metrics_socket).nil? || config.get(:nginx, :metrics_socket).empty?
157-
server.add_tcp_listener('127.0.0.1', 9395)
158-
else
159-
server.add_unix_listener(@config.get(:nginx, :metrics_socket))
160-
end
161-
162-
server.run
138+
VCAP::CloudController::MetricsWebserver.new.start(@config)
163139
end
164140
end
165141

spec/request/status_spec.rb

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
require 'spec_helper'
2+
3+
module VCAP::CloudController
4+
RSpec.describe 'Status Endpoint' do
5+
include Rack::Test::Methods
6+
7+
let(:metrics_webserver) { MetricsWebserver.new }
8+
9+
delegate :app, to: :metrics_webserver
10+
11+
describe 'GET /internal/v4/status' do
12+
context 'when all workers are busy and unhealthy' do
13+
before do
14+
allow(Puma).to receive(:stats_hash).and_return(
15+
worker_status: [
16+
{ last_status: { busy_threads: 2, running: 2, requests_count: 5 } },
17+
{ last_status: { busy_threads: 1, running: 1, requests_count: 3 } }
18+
]
19+
)
20+
allow(metrics_webserver).to receive(:determine_unhealthy_state).and_return(true)
21+
end
22+
23+
it 'returns 503 UNHEALTHY' do
24+
get '/internal/v4/status'
25+
26+
expect(last_response.status).to eq(503)
27+
expect(last_response.body).to eq('UNHEALTHY')
28+
end
29+
end
30+
31+
context 'when all workers are busy but not unhealthy' do
32+
before do
33+
allow(Puma).to receive(:stats_hash).and_return(
34+
worker_status: [
35+
{ last_status: { busy_threads: 2, running: 2, requests_count: 5 } },
36+
{ last_status: { busy_threads: 1, running: 1, requests_count: 3 } }
37+
]
38+
)
39+
allow(metrics_webserver).to receive(:determine_unhealthy_state).and_return(false)
40+
end
41+
42+
it 'returns 429 BUSY' do
43+
get '/internal/v4/status'
44+
45+
expect(last_response.status).to eq(429)
46+
expect(last_response.body).to eq('BUSY')
47+
end
48+
end
49+
50+
context 'when not all workers are busy' do
51+
before do
52+
allow(Puma).to receive(:stats_hash).and_return(
53+
worker_status: [
54+
{ last_status: { busy_threads: 1, running: 2, requests_count: 5 } },
55+
{ last_status: { busy_threads: 0, running: 1, requests_count: 3 } }
56+
]
57+
)
58+
end
59+
60+
it 'returns 200 OK' do
61+
get '/internal/v4/status'
62+
63+
expect(last_response.status).to eq(200)
64+
expect(last_response.body).to eq('OK')
65+
end
66+
end
67+
end
68+
end
69+
end
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
require 'spec_helper'
2+
3+
module VCAP::CloudController
4+
RSpec.describe MetricsWebserver do
5+
let(:metrics_webserver) { described_class.new }
6+
let(:config) { double('config', get: nil) }
7+
8+
describe '#start' do
9+
it 'configures and starts a Puma server' do
10+
allow(Puma::Server).to receive(:new).and_call_original
11+
expect_any_instance_of(Puma::Server).to receive(:run)
12+
13+
metrics_webserver.start(config)
14+
end
15+
16+
context 'when no socket is specified' do
17+
before do
18+
allow(config).to receive(:get).with(:nginx, :metrics_socket).and_return(nil)
19+
end
20+
21+
it 'uses a TCP listener' do
22+
expect_any_instance_of(Puma::Server).to receive(:add_tcp_listener).with('127.0.0.1', 9395)
23+
24+
metrics_webserver.start(config)
25+
end
26+
end
27+
28+
context 'when a socket is specified' do
29+
before do
30+
allow(config).to receive(:get).with(:nginx, :metrics_socket).and_return('/tmp/metrics.sock')
31+
end
32+
33+
it 'uses a Unix socket listener' do
34+
expect_any_instance_of(Puma::Server).to receive(:add_unix_listener).with('/tmp/metrics.sock')
35+
36+
metrics_webserver.start(config)
37+
end
38+
end
39+
end
40+
end
41+
end

spec/unit/lib/cloud_controller/runner_spec.rb

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -290,36 +290,21 @@ module VCAP::CloudController
290290
context 'when the webserver is puma' do
291291
let(:test_config_overrides) { super().merge(webserver: 'puma') }
292292

293-
it 'sets up a separate webserver for metrics' do
294-
subject
295-
296-
expect(Puma::Server).to have_received(:new).once
297-
end
293+
it 'starts the MetricsWebserver' do
294+
expect(MetricsWebserver).to receive(:new).and_call_original
295+
expect_any_instance_of(MetricsWebserver).to receive(:start)
298296

299-
it 'listens on the specified metrics port' do
300297
subject
301-
302-
expect(puma_server_double).to have_received(:add_tcp_listener).with('127.0.0.1', 9395)
303-
end
304-
305-
context 'metrics_socket is configured' do
306-
let(:test_config_overrides) { super().merge(nginx: { metrics_socket: '/tmp/metrics_socket.sock' }) }
307-
308-
it 'listens on the specified metrics socket' do
309-
subject
310-
311-
expect(puma_server_double).to have_received(:add_unix_listener).with('/tmp/metrics_socket.sock')
312-
end
313298
end
314299
end
315300

316301
context 'when the webserver is not puma' do
317302
let(:test_config_overrides) { super().merge(webserver: 'thin') }
318303

319-
it 'does not set up the webserver' do
320-
subject
304+
it 'does not start the MetricsWebserver' do
305+
expect(MetricsWebserver).not_to receive(:new)
321306

322-
expect(Puma::Server).not_to have_received(:new)
307+
subject
323308
end
324309
end
325310
end

0 commit comments

Comments
 (0)