Skip to content

Commit fd63b36

Browse files
committed
Use worker process index as pid for metrics store
1 parent 8d928fc commit fd63b36

File tree

3 files changed

+45
-18
lines changed

3 files changed

+45
-18
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Storing the metrics of several worker processes on cc-worker VMs in a DirectFileStore residing in a single directory
2+
# did not work because the different processes are isolated by bpm and several processes used the same pid within their container.
3+
# This pid is used for the filename and resulted in corrupted data because several processes were writing data to the same files.
4+
# When requiring this file, the process_id method of the MetricStore will be overridden to first check for `INDEX` in
5+
# env variables before returning the actual pid. The `INDEX` is provided for cc-worker processes.
6+
7+
# Supplies the process identifier used in place of the real pid by whatever
# class this module is prepended to.
module CustomProcessId
  # Returns the `INDEX` environment variable as an Integer when it holds a
  # non-negative base-10 number, otherwise the real pid of this process.
  #
  # `String#to_i` is deliberately not used: it maps an empty or non-numeric
  # `INDEX` to 0, so every misconfigured process would report id 0 and end up
  # sharing the same identifier.
  #
  # @return [Integer] the worker index, or `Process.pid` as a fallback
  def process_id
    raw_index = ENV['INDEX']
    # Base 10 is explicit so a zero-padded value such as "08" is not rejected
    # as an invalid octal literal; `exception: false` yields nil on bad input.
    index = raw_index && Integer(raw_index, 10, exception: false)
    return Process.pid if index.nil? || index.negative?

    index
  end
end
12+
13+
# Reopens Prometheus::Client::DataStores::DirectFileStore::MetricStore and
# prepends CustomProcessId to it. Because the module is prepended, its
# `process_id` is found first in method lookup and takes precedence over the
# one defined on MetricStore itself.
# NOTE(review): the nested module/class keywords reopen these constants if they
# are already defined and create them otherwise; confirm the intended load
# order relative to the prometheus client library.
module Prometheus
14+
module Client
15+
module DataStores
16+
class DirectFileStore
17+
class MetricStore
18+
prepend CustomProcessId
19+
end
20+
end
21+
end
22+
end
23+
end

lib/delayed_job/delayed_worker.rb

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -102,30 +102,33 @@ def is_first_generic_worker_on_machine?
102102
def setup_metrics_endpoint(config)
103103
prometheus_dir = File.join(config.get(:directories, :tmpdir), 'prometheus')
104104
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir)
105-
return unless is_first_generic_worker_on_machine?
106105

107-
FileUtils.mkdir_p(prometheus_dir)
106+
if is_first_generic_worker_on_machine?
107+
FileUtils.mkdir_p(prometheus_dir)
108108

109-
# Resetting metrics on startup
110-
Dir["#{prometheus_dir}/*.bin"].each do |file_path|
111-
File.unlink(file_path)
112-
end
109+
# Resetting metrics on startup
110+
Dir["#{prometheus_dir}/*.bin"].each do |file_path|
111+
File.unlink(file_path)
112+
end
113113

114-
metrics_app = Rack::Builder.new do
115-
use Prometheus::Middleware::Exporter, path: '/metrics'
114+
metrics_app = Rack::Builder.new do
115+
use Prometheus::Middleware::Exporter, path: '/metrics'
116116

117-
map '/' do
118-
run lambda { |env|
119-
# Return 404 for any other request
120-
['404', { 'Content-Type' => 'text/plain' }, ['Not Found']]
121-
}
117+
map '/' do
118+
run lambda { |env|
119+
# Return 404 for any other request
120+
['404', { 'Content-Type' => 'text/plain' }, ['Not Found']]
121+
}
122+
end
122123
end
123-
end
124124

125-
Thread.new do
126-
server = Puma::Server.new(metrics_app)
127-
server.add_tcp_listener '0.0.0.0', config.get(:prometheus_port) || 9394
128-
server.run
125+
Thread.new do
126+
server = Puma::Server.new(metrics_app)
127+
server.add_tcp_listener '0.0.0.0', config.get(:prometheus_port) || 9394
128+
server.run
129+
end
129130
end
131+
132+
CloudController::DependencyLocator.instance.cc_worker_prometheus_updater.update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: 'cc-worker' })
130133
end
131134
end

lib/tasks/jobs.rake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ namespace :jobs do
6565
]
6666

6767
ENV['PROCESS_TYPE'] = 'cc-worker'
68+
require 'cloud_controller/metrics/custom_process_id'
6869

6970
publish_metrics = RakeConfig.config.get(:publish_metrics) || false
7071

0 commit comments

Comments
 (0)