Skip to content

Commit 3b4e2d8

Browse files
committed
Publish cc-worker metrics
Implemented a small Puma web server that is started in a separate thread by the first worker process. Using the exporter middleware provided by the Prometheus client, the metrics stored in the registry are published under `/metrics`. All worker processes share the same DirectFileStore-backed registry, so their metrics can be served by a single web server.
1 parent 4c6c470 commit 3b4e2d8

File tree

7 files changed

+78
-13
lines changed

7 files changed

+78
-13
lines changed

lib/cloud_controller/config_schemas/base/worker_schema.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ class WorkerSchema < VCAP::Config
3737

3838
log_audit_events: bool,
3939

40+
directories: {
41+
tmpdir: String,
42+
},
43+
4044
stacks_file: String,
4145
newrelic_enabled: bool,
4246

lib/cloud_controller/db.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def self.add_connection_expiration_extension(db, opts)
7272

7373
def self.add_connection_metrics_extension(db)
7474
# only add the metrics for api processes. Otherwise e.g. rake db:migrate would also initialize metric updaters, which need additional config
75-
return if Object.const_defined?(:RakeConfig)
75+
return if Object.const_defined?(:RakeConfig) && RakeConfig.context != :worker
7676

7777
db.extension(:connection_metrics)
7878
# so that we gather connection metrics from the beginning

lib/cloud_controller/dependency_locator.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ def prometheus_updater
7474
@dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
7575
end
7676

77+
# Returns the PrometheusUpdater used by cc-worker processes.
#
# The instance is looked up in @dependencies first; only when nothing is
# stored under :cc_worker_prometheus_updater is a new updater built with
# `cc_worker: true` and handed to `register`.
def cc_worker_prometheus_updater
  @dependencies[:cc_worker_prometheus_updater] ||
    register(:cc_worker_prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new(cc_worker: true))
end
80+
7781
def statsd_updater
7882
@dependencies[:statsd_updater] || register(:statsd_updater, VCAP::CloudController::Metrics::StatsdUpdater.new(statsd_client))
7983
end

lib/cloud_controller/metrics/prometheus_updater.rb

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,7 @@ def self.allow_pid_label
3636
{ type: :gauge, name: :cc_running_tasks_total, docstring: 'Total running tasks', aggregation: :most_recent },
3737
{ type: :gauge, name: :cc_running_tasks_memory_bytes, docstring: 'Total memory consumed by running tasks', aggregation: :most_recent },
3838
{ type: :gauge, name: :cc_users_total, docstring: 'Number of users', aggregation: :most_recent },
39-
{ type: :gauge, name: :cc_deployments_in_progress_total, docstring: 'Number of in progress deployments', aggregation: :most_recent },
40-
{ type: :gauge, name: :cc_acquired_db_connections_total, labels: %i[process_type], docstring: 'Number of acquired DB connections' },
41-
{ type: :histogram, name: :cc_db_connection_hold_duration_seconds, docstring: 'The time threads were holding DB connections', buckets: CONNECTION_DURATION_BUCKETS },
42-
# cc_connection_pool_timeouts_total must be a gauge metric, because otherwise we cannot match them with processes
43-
{ type: :gauge, name: :cc_db_connection_pool_timeouts_total, labels: %i[process_type],
44-
docstring: 'Number of threads which failed to acquire a free DB connection from the pool within the timeout' },
45-
{ type: :gauge, name: :cc_open_db_connections_total, labels: %i[process_type], docstring: 'Number of open DB connections (acquired + available)' },
46-
{ type: :histogram, name: :cc_db_connection_wait_duration_seconds, docstring: 'The time threads were waiting for an available DB connection',
47-
buckets: CONNECTION_DURATION_BUCKETS }
39+
{ type: :gauge, name: :cc_deployments_in_progress_total, docstring: 'Number of in progress deployments', aggregation: :most_recent }
4840
].freeze
4941

5042
THIN_METRICS = [
@@ -63,17 +55,36 @@ def self.allow_pid_label
6355
{ type: :gauge, name: :cc_puma_worker_backlog, docstring: 'Puma worker: backlog', labels: %i[index pid], aggregation: :most_recent }
6456
].freeze
6557

66-
def initialize(registry=Prometheus::Client.registry)
58+
# Metric definitions describing the DB connection pool. Kept separate from
# METRICS so they can be registered on their own.
DB_CONNECTION_POOL_METRICS = [
  { type: :gauge, name: :cc_acquired_db_connections_total, labels: %i[process_type], docstring: 'Number of acquired DB connections' },
  { type: :histogram, name: :cc_db_connection_hold_duration_seconds, docstring: 'The time threads were holding DB connections', buckets: CONNECTION_DURATION_BUCKETS },
  # cc_db_connection_pool_timeouts_total must be a gauge metric, because otherwise we cannot match them with processes
  { type: :gauge, name: :cc_db_connection_pool_timeouts_total, labels: %i[process_type],
    docstring: 'Number of threads which failed to acquire a free DB connection from the pool within the timeout' },
  { type: :gauge, name: :cc_open_db_connections_total, labels: %i[process_type], docstring: 'Number of open DB connections (acquired + available)' },
  { type: :histogram, name: :cc_db_connection_wait_duration_seconds, docstring: 'The time threads were waiting for an available DB connection',
    buckets: CONNECTION_DURATION_BUCKETS }
].freeze
68+
69+
# Stores the registry and registers the metric definitions up front so that
# they are discoverable before the first update.
#
# @param registry the Prometheus registry metrics are registered in
#   (defaults to Prometheus::Client.registry)
# @param cc_worker [Boolean] when true, only DB_CONNECTION_POOL_METRICS are
#   registered; METRICS and the webserver-specific metrics are skipped
def initialize(registry: Prometheus::Client.registry, cc_worker: false)
  self.class.allow_pid_label

  @registry = registry

  # Register all metrics, to initialize them for discoverability
  DB_CONNECTION_POOL_METRICS.each { |metric| register(metric) }

  return if cc_worker

  METRICS.each { |metric| register(metric) }

  # Look the configured webserver up once; config may be nil, hence `&.`
  webserver = VCAP::CloudController::Config.config&.get(:webserver)
  THIN_METRICS.each { |metric| register(metric) } if webserver == 'thin'
  PUMA_METRICS.each { |metric| register(metric) } if webserver == 'puma'
end
7683

84+
# Returns the registry this updater was initialized with.
def registry
  @registry
end
87+
7788
def update_gauge_metric(metric, value, labels: {})
7889
@registry.get(metric).set(value, labels:)
7990
end

lib/delayed_job/delayed_worker.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
require 'delayed_job/threaded_worker'
2+
require 'rack'
3+
require 'puma'
4+
require 'prometheus/middleware/exporter'
25

36
class CloudController::DelayedWorker
47
def initialize(options)
@@ -9,6 +12,8 @@ def initialize(options)
912
worker_name: options[:name],
1013
quiet: true
1114
}
15+
16+
@publish_metrics = options.fetch(:publish_metrics, false)
1217
return unless options[:num_threads] && options[:num_threads].to_i > 0
1318

1419
@queue_options[:num_threads] = options[:num_threads].to_i
@@ -17,6 +22,7 @@ def initialize(options)
1722

1823
def start_working
1924
config = RakeConfig.config
25+
setup_metrics_endpoint if @publish_metrics
2026
BackgroundJobEnvironment.new(config).setup_environment(readiness_port)
2127

2228
logger = Steno.logger('cc-worker')
@@ -92,4 +98,34 @@ def readiness_port
9298
def is_first_generic_worker_on_machine?
9399
RakeConfig.context != :api && ENV['INDEX']&.to_i == 1
94100
end
101+
102+
# Configures the Prometheus client to use a DirectFileStore located in
# `<directories.tmpdir>/prometheus`. Only when
# `is_first_generic_worker_on_machine?` is true does it additionally wipe
# existing `*.bin` files from that directory and start a Puma server in a
# new thread, listening on 0.0.0.0:9394 and serving the exporter middleware
# under `/metrics`.
def setup_metrics_endpoint
  prometheus_dir = File.join(RakeConfig.config.get(:directories, :tmpdir), 'prometheus')
  Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir)
  return unless is_first_generic_worker_on_machine?

  FileUtils.mkdir_p(prometheus_dir)

  # Resetting metrics on startup
  Dir["#{prometheus_dir}/*.bin"].each { |file_path| File.unlink(file_path) }

  # Rack::Builder.app builds the middleware stack once; handing a bare
  # Rack::Builder instance to the server would rebuild it on every request.
  metrics_app = Rack::Builder.app do
    use Prometheus::Middleware::Exporter, path: '/metrics'

    map '/' do
      # Return 404 for any other request. The status must be an Integer.
      run ->(_env) { [404, { 'Content-Type' => 'text/plain' }, ['Not Found']] }
    end
  end

  Thread.new do
    server = Puma::Server.new(metrics_app)
    server.add_tcp_listener '0.0.0.0', 9394
    server.run
  end
end
95131
end

lib/sequel/extensions/connection_metrics.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@ def self.extended(pool)
2323

2424
pool.instance_exec do
2525
sync do
26-
@prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater
26+
@prometheus_updater = if process_type == 'cc-worker'
27+
CloudController::DependencyLocator.instance.cc_worker_prometheus_updater
28+
else
29+
CloudController::DependencyLocator.instance.prometheus_updater
30+
end
2731
@connection_info = {}
2832
end
2933
end

lib/tasks/jobs.rake

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ namespace :jobs do
6464
'prune_excess_app_revisions'
6565
]
6666

67-
CloudController::DelayedWorker.new(queues: queues, name: args.name, num_threads: args.num_threads, thread_grace_period_seconds: args.thread_grace_period_seconds).start_working
67+
ENV['PROCESS_TYPE'] = 'cc-worker'
68+
69+
CloudController::DelayedWorker.new(queues: queues,
70+
name: args.name,
71+
num_threads: args.num_threads,
72+
thread_grace_period_seconds: args.thread_grace_period_seconds,
73+
publish_metrics: true).start_working
6874
end
6975
end

0 commit comments

Comments
 (0)