Skip to content

Commit eb733d3

Browse files
committed
Add vital and db connection metrics for clock and deployment updater
1 parent 7b27f15 commit eb733d3

34 files changed

+571
-157
lines changed

lib/cloud_controller/metrics_webserver.rb renamed to lib/cloud_controller/api_metrics_webserver.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
module VCAP
55
module CloudController
6-
class MetricsWebserver
6+
class ApiMetricsWebserver
77
attr_reader :app
88

99
def initialize

lib/cloud_controller/clock/scheduler.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
require 'clockwork'
22
require 'cloud_controller/clock/clock'
33
require 'cloud_controller/clock/job_timeout_calculator'
4+
require 'cloud_controller/standalone_metrics_webserver'
45

56
module VCAP::CloudController
67
class Scheduler
@@ -35,6 +36,12 @@ def initialize(config)
3536
end
3637

3738
def start
39+
if @config.get(:publish_metrics) || false
40+
StandaloneMetricsWebserver.start_for_bosh_job(@config.get(:prometheus_port) || 9394)
41+
periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater
42+
periodic_updater.setup_updates
43+
end
44+
3845
start_daily_jobs
3946
start_frequent_jobs
4047
start_inline_jobs

lib/cloud_controller/config_schemas/clock_schema.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ class ClockSchema < VCAP::Config
165165
optional(:port) => Integer
166166
},
167167

168+
optional(:publish_metrics) => bool,
169+
optional(:prometheus_port) => Integer,
170+
168171
skip_cert_verify: bool,
169172

170173
optional(:routing_api) => {

lib/cloud_controller/config_schemas/deployment_updater_schema.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ class DeploymentUpdaterSchema < VCAP::Config
148148

149149
stacks_file: String,
150150

151+
optional(:publish_metrics) => bool,
152+
optional(:prometheus_port) => Integer,
153+
151154
skip_cert_verify: bool,
152155

153156
optional(:credhub_api) => {

lib/cloud_controller/db.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
require 'cloud_controller/db_migrator'
33
require 'cloud_controller/db_connection/options_factory'
44
require 'cloud_controller/db_connection/finalizer'
5+
require 'cloud_controller/execution_context'
56
require 'sequel/extensions/query_length_logging'
67
require 'sequel/extensions/request_query_metrics'
78

@@ -73,8 +74,9 @@ def self.add_connection_expiration_extension(db, opts)
7374
end
7475

7576
def self.add_connection_metrics_extension(db)
76-
# only add the metrics for api and cc-worker processes. Otherwise e.g. rake db:migrate would also initialize metric updaters, which need additional config
77-
return if Object.const_defined?(:RakeConfig) && RakeConfig.context != :worker
77+
# only add the metrics for api, cc-worker, clock & deployment_updater processes.
78+
# Otherwise, e.g. rake db:migrate would also initialize metric updaters, which need additional config
79+
return if ExecutionContext.from_process_type_env.nil?
7880

7981
db.extension(:connection_metrics)
8082
# so that we gather connection metrics from the beginning

lib/cloud_controller/dependency_locator.rb

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
require 'cloud_controller/packager/local_bits_packer'
2525
require 'credhub/client'
2626
require 'cloud_controller/metrics/prometheus_updater'
27+
require 'cloud_controller/execution_context'
2728

2829
module CloudController
2930
class DependencyLocator
@@ -70,12 +71,21 @@ def periodic_updater
7071
))
7172
end
7273

73-
def prometheus_updater
74-
@dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
74+
# Returns the PeriodicUpdater that only runs the vitals task.
# The instance is built and registered on first use and reused afterwards.
def vitals_periodic_updater
  cached = @dependencies[:vitals_periodic_updater]
  return cached if cached

  updater_class = VCAP::CloudController::Metrics::PeriodicUpdater
  updater = updater_class.new(
    Time.now.utc,
    log_counter,
    Steno.logger('cc.vitals'),
    statsd_updater,
    prometheus_updater,
    task_list: [updater_class::VITALS_TASK]
  )
  register(:vitals_periodic_updater, updater)
end
7686

77-
def cc_worker_prometheus_updater
78-
@dependencies[:cc_worker_prometheus_updater] || register(:cc_worker_prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new(cc_worker: true))
87+
# Returns the registered PrometheusUpdater, creating and registering one on first use.
def prometheus_updater
  existing = @dependencies[:prometheus_updater]
  return existing if existing

  register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
end
8090

8191
def statsd_updater
@@ -362,6 +372,8 @@ def statsd_client
362372
else
363373
register(:statsd_client, NullStatsdClient.new)
364374
end
375+
rescue VCAP::CloudController::Config::InvalidConfigPath
376+
register(:statsd_client, NullStatsdClient.new)
365377
end
366378

367379
private

lib/cloud_controller/deployment_updater/scheduler.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
require 'cloud_controller/deployment_updater/dispatcher'
22
require 'locket/lock_worker'
33
require 'locket/lock_runner'
4+
require 'cloud_controller/standalone_metrics_webserver'
45

56
module VCAP::CloudController
67
module DeploymentUpdater
@@ -9,6 +10,11 @@ class << self
910
def start
1011
with_error_logging('cc.deployment_updater') do
1112
config = CloudController::DependencyLocator.instance.config
13+
if config.get(:publish_metrics) || false
14+
VCAP::CloudController::StandaloneMetricsWebserver.start_for_bosh_job(config.get(:prometheus_port) || 9395)
15+
periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater
16+
periodic_updater.setup_updates
17+
end
1218
statsd_client = CloudController::DependencyLocator.instance.statsd_client
1319

1420
update_step = proc {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Nested module definitions (instead of `module VCAP::CloudController`) so this
# file can be required before anything else has defined the VCAP namespace.
module VCAP
  module CloudController
    # Catalogue of the known Cloud Controller process kinds and a lookup that
    # resolves the current process from the PROCESS_TYPE environment variable.
    class ExecutionContext
      # Describes one process kind: the value stored in PROCESS_TYPE, the
      # associated CAPI job name and, optionally, the RakeConfig context.
      ExecutionInfo = Struct.new(:process_type, :capi_job_name, :rake_context, keyword_init: true) do
        # Overridden only to make process_type and capi_job_name mandatory
        # keywords; rake_context stays optional and defaults to nil.
        def initialize(process_type:, capi_job_name:, rake_context: nil)
          super
        end

        # Publishes this process type via ENV['PROCESS_TYPE'].
        def set_process_type_env
          ENV['PROCESS_TYPE'] = process_type
        end

        # Assigns this entry's rake_context to RakeConfig.context.
        # Raises RuntimeError when no rake_context is configured for this
        # entry or when the RakeConfig constant is not defined.
        def set_rake_context
          raise 'RakeConfig is not defined or rake_context argument is nil' if rake_context.nil? || !Object.const_defined?(:RakeConfig)

          RakeConfig.context = rake_context
        end
      end

      API_PUMA_MAIN = ExecutionInfo.new(process_type: 'main', capi_job_name: 'cloud_controller_ng')
      API_PUMA_WORKER = ExecutionInfo.new(process_type: 'puma_worker', capi_job_name: 'cloud_controller_ng')
      CC_WORKER = ExecutionInfo.new(process_type: 'cc-worker', capi_job_name: 'cloud_controller_worker', rake_context: :worker)
      CLOCK = ExecutionInfo.new(process_type: 'clock', capi_job_name: 'cloud_controller_clock', rake_context: :clock)
      DEPLOYMENT_UPDATER = ExecutionInfo.new(process_type: 'deployment_updater', capi_job_name: 'cc_deployment_updater', rake_context: :deployment_updater)

      ALL_EXECUTION_CONTEXTS = [API_PUMA_MAIN, API_PUMA_WORKER, CC_WORKER, CLOCK, DEPLOYMENT_UPDATER].freeze

      class << self
        # Returns the ExecutionInfo whose process_type equals ENV['PROCESS_TYPE'].
        # Returns nil when the variable is unset or names no known process type,
        # except when CC_TEST is 'true', in which case API_PUMA_MAIN is returned.
        def from_process_type_env
          process_type = ENV.fetch('PROCESS_TYPE', nil)
          exec_ctx = ALL_EXECUTION_CONTEXTS.find { |p| p.process_type == process_type }

          # For test environments where PROCESS_TYPE may not be set, default to API_PUMA_MAIN
          exec_ctx = API_PUMA_MAIN if exec_ctx.nil? && ENV.fetch('CC_TEST', nil) == 'true'

          exec_ctx
        end
      end
    end
  end
end

lib/cloud_controller/metrics/periodic_updater.rb

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,35 @@
33

44
module VCAP::CloudController::Metrics
55
class PeriodicUpdater
6-
def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater)
6+
UPDATE_TASK = Struct.new(:method_name, :interval)
7+
8+
USER_COUNT_TASK = UPDATE_TASK.new(:update_user_count, 600).freeze
9+
JOB_QUEUE_LENGTH_TASK = UPDATE_TASK.new(:update_job_queue_length, 30).freeze
10+
JOB_QUEUE_LOAD_TASK = UPDATE_TASK.new(:update_job_queue_load, 30).freeze
11+
FAILED_JOB_COUNT_TASK = UPDATE_TASK.new(:update_failed_job_count, 30).freeze
12+
VITALS_TASK = UPDATE_TASK.new(:update_vitals, 30).freeze
13+
LOG_COUNTS_TASK = UPDATE_TASK.new(:update_log_counts, 30).freeze
14+
TASK_STATS_TASK = UPDATE_TASK.new(:update_task_stats, 30).freeze
15+
DEPLOYING_COUNT_TASK = UPDATE_TASK.new(:update_deploying_count, 30).freeze
16+
WEBSERVER_STATS_TASK = UPDATE_TASK.new(:update_webserver_stats, 30).freeze
17+
18+
ALL_TASKS = [USER_COUNT_TASK, JOB_QUEUE_LENGTH_TASK, JOB_QUEUE_LOAD_TASK, FAILED_JOB_COUNT_TASK, VITALS_TASK, LOG_COUNTS_TASK, TASK_STATS_TASK, DEPLOYING_COUNT_TASK,
19+
WEBSERVER_STATS_TASK].freeze
20+
21+
def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater, task_list: ALL_TASKS)
722
@start_time = start_time
823
@statsd_updater = statsd_updater
924
@prometheus_updater = prometheus_updater
1025
@log_counter = log_counter
1126
@logger = logger
12-
@known_job_queues = {
13-
VCAP::CloudController::Jobs::Queues.local(VCAP::CloudController::Config.config).to_sym => 0
14-
}
27+
@known_job_queues = { VCAP::CloudController::Jobs::Queues.local(VCAP::CloudController::Config.config).to_sym => 0 }
28+
@task_list = task_list
1529
end
1630

1731
def setup_updates
18-
update!
32+
@task_list.each { |task| update!(task) }
1933
@update_tasks = []
20-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 600) { catch_error { update_user_count } }
21-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_job_queue_length } }
22-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_job_queue_load } }
23-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_failed_job_count } }
24-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_vitals } }
25-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_log_counts } }
26-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_task_stats } }
27-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_deploying_count } }
28-
@update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_webserver_stats } }
34+
@task_list.each { |task| @update_tasks << Concurrent::TimerTask.new(execution_interval: task.interval) { catch_error { update!(task) } } }
2935
@update_tasks.each(&:execute)
3036
end
3137

@@ -35,16 +41,8 @@ def stop_updates
3541
@update_tasks.each(&:shutdown)
3642
end
3743

38-
def update!
39-
update_user_count
40-
update_job_queue_length
41-
update_job_queue_load
42-
update_failed_job_count
43-
update_vitals
44-
update_log_counts
45-
update_task_stats
46-
update_deploying_count
47-
update_webserver_stats
44+
# Runs a single update task by invoking the updater method it names.
def update!(task)
  updater_method = task.method_name
  send(updater_method)
end
4947

5048
def catch_error

lib/cloud_controller/metrics/prometheus_updater.rb

Lines changed: 57 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
require 'prometheus/client'
22
require 'prometheus/client/data_stores/direct_file_store'
3+
require 'cloud_controller/execution_context'
34

45
module VCAP::CloudController::Metrics
56
class PrometheusUpdater
@@ -29,12 +30,6 @@ def self.allow_pid_label
2930
{ type: :histogram, name: :cc_staging_failed_duration_seconds, docstring: 'Durations of failed staging events', buckets: DURATION_BUCKETS },
3031
{ type: :gauge, name: :cc_requests_outstanding_total, docstring: 'Requests outstanding', aggregation: :sum },
3132
{ type: :counter, name: :cc_requests_completed_total, docstring: 'Requests completed' },
32-
{ type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at', aggregation: :most_recent },
33-
{ type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes', aggregation: :most_recent },
34-
{ type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg', aggregation: :most_recent },
35-
{ type: :gauge, name: :cc_vitals_mem_used_bytes, docstring: 'CloudController Vitals: mem_used_bytes', aggregation: :most_recent },
36-
{ type: :gauge, name: :cc_vitals_mem_free_bytes, docstring: 'CloudController Vitals: mem_free_bytes', aggregation: :most_recent },
37-
{ type: :gauge, name: :cc_vitals_num_cores, docstring: 'CloudController Vitals: num_cores', aggregation: :most_recent },
3833
{ type: :gauge, name: :cc_running_tasks_total, docstring: 'Total running tasks', aggregation: :most_recent },
3934
{ type: :gauge, name: :cc_running_tasks_memory_bytes, docstring: 'Total memory consumed by running tasks', aggregation: :most_recent },
4035
{ type: :gauge, name: :cc_users_total, docstring: 'Number of users', aggregation: :most_recent },
@@ -67,19 +62,68 @@ def self.allow_pid_label
6762
{ type: :histogram, name: :cc_job_duration_seconds, docstring: 'Job processing time (start to finish)', labels: %i[queue worker], buckets: DELAYED_JOB_METRIC_BUCKETS }
6863
].freeze
6964

70-
def initialize(registry: Prometheus::Client.registry, cc_worker: false)
65+
# Gauge definitions for the process vitals, one per vital name.
# Each entry is named cc_vitals_<vital> and uses :most_recent aggregation.
VITAL_METRICS = %i[started_at mem_bytes cpu_load_avg mem_used_bytes mem_free_bytes num_cores].map do |vital|
  { type: :gauge, name: :"cc_vitals_#{vital}", docstring: "CloudController Vitals: #{vital}", aggregation: :most_recent }
end.freeze
73+
74+
def initialize(registry: Prometheus::Client.registry)
7175
self.class.allow_pid_label
7276

7377
@registry = registry
78+
execution_context = VCAP::CloudController::ExecutionContext.from_process_type_env
7479

75-
# Register all metrics, to initialize them for discoverability
76-
DB_CONNECTION_POOL_METRICS.each { |metric| register(metric) }
77-
DELAYED_JOB_METRICS.each { |metric| register(metric) }
80+
register_metrics_for_process(execution_context)
81+
initialize_cc_db_connection_pool_timeouts_total(execution_context)
82+
end
83+
84+
private
85+
86+
# rubocop:disable Metrics/CyclomaticComplexity
87+
# Registers the metric definitions that apply to the given execution context.
# Raises RuntimeError for any context that is not one of the known ones (including nil).
def register_metrics_for_process(execution_context)
  contexts = VCAP::CloudController::ExecutionContext
  api_process = false

  metric_groups =
    case execution_context
    when contexts::CC_WORKER
      [DB_CONNECTION_POOL_METRICS, DELAYED_JOB_METRICS, VITAL_METRICS]
    when contexts::CLOCK, contexts::DEPLOYMENT_UPDATER
      [DB_CONNECTION_POOL_METRICS, VITAL_METRICS]
    when contexts::API_PUMA_MAIN, contexts::API_PUMA_WORKER
      api_process = true
      [DB_CONNECTION_POOL_METRICS, DELAYED_JOB_METRICS, VITAL_METRICS, METRICS]
    else
      raise 'Could not register Prometheus metrics: Unknown execution context'
    end

  metric_groups.each { |group| group.each { |metric| register(metric) } }

  # Puma metrics are only relevant for API processes, and only when puma is the configured webserver.
  PUMA_METRICS.each { |metric| register(metric) } if api_process && is_puma_webserver?
end
106+
# rubocop:enable Metrics/CyclomaticComplexity
107+
108+
# Seeds cc_db_connection_pool_timeouts_total with 0 for the current process type
# so the metric is discoverable even though healthy systems rarely update it.
# Does nothing when no execution context is given or the metric is not registered.
def initialize_cc_db_connection_pool_timeouts_total(execution_context)
  metric = :cc_db_connection_pool_timeouts_total
  return if execution_context.nil? || !@registry.exist?(metric)

  contexts = VCAP::CloudController::ExecutionContext
  process_types = [execution_context.process_type]
  # The puma main process also seeds the value on behalf of the puma workers.
  process_types << contexts::API_PUMA_WORKER.process_type if execution_context == contexts::API_PUMA_MAIN

  process_types.each { |process_type| update_gauge_metric(metric, 0, labels: { process_type: process_type }) }
end
78120

79-
return if cc_worker
121+
public
80122

81-
METRICS.each { |metric| register(metric) }
82-
PUMA_METRICS.each { |metric| register(metric) } if VCAP::CloudController::Config.config&.get(:webserver) == 'puma'
123+
# True when the loaded config names 'puma' as the webserver.
# Returns false when no config is loaded or the config rejects the :webserver path.
def is_puma_webserver?
  configured_webserver = VCAP::CloudController::Config.config&.get(:webserver)
  configured_webserver == 'puma'
rescue VCAP::CloudController::Config::InvalidConfigPath
  false
end
84128

85129
def update_gauge_metric(metric, value, labels: {})

0 commit comments

Comments
 (0)