From bde0cab4261ee1c0a18eee23077ee4b051265b02 Mon Sep 17 00:00:00 2001 From: Dennis Ahaus Date: Mon, 24 Nov 2025 10:52:27 +0100 Subject: [PATCH 01/14] feat: propagate process_length from heartbeat events to Riemann plugin - Update Heartbeat.to_hash to include process_length attribute when present - Extract process_length from heartbeat payload in Riemann plugin - Attach process_length to each Riemann event (if present) - Support both symbol and string keys for payload compatibility - Add unit tests verifying process_length inclusion/omission in events - All unit tests pass (Riemann: 5/5, Heartbeat: 10/10) --- .../lib/bosh/monitor/events/heartbeat.rb | 5 ++- .../lib/bosh/monitor/plugins/riemann.rb | 14 ++++++- .../unit/bosh/monitor/plugins/riemann_spec.rb | 40 +++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb index bfa5a46032e..5619af85ca0 100644 --- a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb +++ b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb @@ -65,7 +65,7 @@ def to_s end def to_hash - { + result = { kind: 'heartbeat', id: @id, timestamp: @timestamp.to_i, @@ -79,6 +79,9 @@ def to_hash teams: @teams, metrics: @metrics.map(&:to_hash), } + # Include process_length if present in attributes + result[:process_length] = @attributes['process_length'] if @attributes.key?('process_length') + result end def to_json(*_args) diff --git a/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb b/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb index 5d4fc44cf67..0b5a0544614 100644 --- a/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb +++ b/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb @@ -33,11 +33,23 @@ def process(event) def process_heartbeat(event) payload = event.to_hash.merge(service: 'bosh.hm') payload.delete :vitals + + # Extract process_length if present (support symbol or string keys) + process_length = if payload.key?(:process_length) + payload[:process_length] + elsif payload.key?('process_length') + payload['process_length'] + end + event.metrics.each do |metric| - client << payload.merge( + data = payload.merge( name: metric.name, metric: metric.value, ) + # attach process_length as additional attribute when present + data[:process_length] = process_length unless process_length.nil? + + client << data rescue StandardError => e logger.error("Error sending riemann event: #{e}") end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb index de99685a3c0..8ffe5893c89 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb @@ -49,4 +49,44 @@ @plugin.process(heartbeat) end end + + it 'includes process_length in heartbeat events sent to Riemann' do + heartbeat = make_heartbeat(process_length: 42) + + Sync do + expect(@plugin.run).to be(true) + + allow(@client).to receive(:<<) + + # Verify that process_length is attached to events + expect(@client).to receive(:<<).at_least(1).times do |event| + expect(event[:service]).to eq('bosh.hm') + expect(event[:process_length]).to eq(42) + expect(event[:name]).not_to be_nil + expect(event[:metric]).not_to be_nil + end + + @plugin.process(heartbeat) + end + end + + it 'omits process_length when not present in heartbeat' do + heartbeat = make_heartbeat + + Sync do + expect(@plugin.run).to be(true) + + allow(@client).to receive(:<<) + + # Verify that process_length is NOT attached to events + expect(@client).to receive(:<<).at_least(1).times do |event| + expect(event[:service]).to eq('bosh.hm') + expect(event).not_to have_key(:process_length) + expect(event[:name]).not_to be_nil + expect(event[:metric]).not_to be_nil + end + + @plugin.process(heartbeat) + end + end end From a6716d9899d382c0e49c6300e51a26cf3d8ca307 Mon Sep 17 00:00:00 2001 From: Dennis Ahaus Date: Wed, 26 Nov 2025 13:39:47 +0100 Subject: [PATCH 02/14] Save work --- .../lib/bosh/director/metrics_collector.rb | 28 +++++++++- .../bosh/director/metrics_collector_spec.rb | 56 +++++++++++++------ src/bosh-monitor/lib/bosh/monitor/agent.rb | 6 +- .../lib/bosh/monitor/api_controller.rb | 8 +++ .../lib/bosh/monitor/events/heartbeat.rb | 7 ++- .../lib/bosh/monitor/instance_manager.rb | 17 +++++- .../lib/bosh/monitor/plugins/riemann.rb | 14 +---- .../spec/unit/bosh/monitor/agent_spec.rb | 11 ++++ .../unit/bosh/monitor/api_controller_spec.rb | 29 ++++++++++ .../bosh/monitor/instance_manager_spec.rb | 40 ++++++++++++- .../unit/bosh/monitor/plugins/riemann_spec.rb | 40 ------------- 11 files changed, 178 insertions(+), 78 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index 0c94c941639..def8a4e7dd2 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -40,7 +40,13 @@ def initialize(config) @unresponsive_agents = Prometheus::Client.registry.gauge( :bosh_unresponsive_agents, labels: %i[name], - docstring: 'Number of unresponsive agents per deployment', + docstring: "Number of unresponsive agents per deployment", + ) + + @unhealthy_agents = Prometheus::Client.registry.gauge( + :bosh_unhealthy_agents, + labels: %i[name], + docstring: "Number of unhealthy agents (job_state != running AND process_length == 0) per deployment", ) @scheduler = Rufus::Scheduler.new end @@ -136,6 +142,26 @@ def populate_vm_metrics removed_deployments.each do |deployment| @unresponsive_agents.set(0, labels: { name: deployment }) end + + # Fetch and populate unhealthy_agents metrics + response_unhealthy = Net::HTTP.get_response("127.0.0.1", "/unhealthy_agents", @config.health_monitor_port) + return unless response_unhealthy.is_a?(Net::HTTPSuccess) + + unhealthy_agent_counts = JSON.parse(response_unhealthy.body) + return unless unhealthy_agent_counts.is_a?(Hash) + + existing_unhealthy_deployment_names = @unhealthy_agents.values.map do |key, _| + key[:name] + end + + unhealthy_agent_counts.each do |deployment, count| + @unhealthy_agents.set(count, labels: { name: deployment }) + end + + removed_unhealthy_deployments = existing_unhealthy_deployment_names - unhealthy_agent_counts.keys + removed_unhealthy_deployments.each do |deployment| + @unhealthy_agents.set(0, labels: { name: deployment }) + end end def populate_network_metrics diff --git a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb index 35c8aaac1e0..02f92fc8029 100644 --- a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb +++ b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb @@ -34,7 +34,9 @@ def tick allow(resurrector_manager).to receive(:pause_for_all?).and_return(false, true, false) allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false) stub_request(:get, /unresponsive_agents/) - .to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0)) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) + stub_request(:get, /unhealthy_agents/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0)) end after do @@ -44,9 +46,10 @@ def tick Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total) Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total) Prometheus::Client.registry.unregister(:bosh_unresponsive_agents) + Prometheus::Client.registry.unregister(:bosh_unhealthy_agents) end - describe '#prep' do + describe "#prep" do let(:db_migrator) { instance_double(DBMigrator) } let(:db) { instance_double(Sequel::Database) } let(:logger) { double(Logging::Logger) } @@ -180,7 +183,7 @@ def tick 'networks' => [older_network_spec], 'vm_extensions' => [], 'compilation' => { 'az' => 'az-1', 'network' => manual_network_spec['name'], 'workers' => 3 }, - )) + )) FactoryBot.create(:models_config_cloud, name: 'some-cloud-config', content: YAML.dump( 'azs' => [az], @@ -189,7 +192,7 @@ def tick 'networks' => [dynamic_network_spec, manual_network_spec, vip_network_spec, weird_name_network_spec], 'vm_extensions' => [], 'compilation' => { 'az' => 'az-1', 'network' => manual_network_spec['name'], 'workers' => 3 }, - )) + )) end it 'emits the total number of dynamic IPs in the network' do @@ -219,8 +222,8 @@ def tick before do FactoryBot.create(:models_ip_address, - instance_id: instance.id, - vm_id: vm.id, + instance_id: instance.id, + vm_id: vm.id, address_str: IPAddr.new('192.168.1.5').to_i.to_s, network_name: manual_network_spec['name'], static: false, @@ -236,8 +239,8 @@ def tick context 'when deployed VMs are using static ips' do before do FactoryBot.create(:models_ip_address, - instance_id: instance.id, - vm_id: vm.id, + instance_id: instance.id, + vm_id: vm.id, address_str: IPAddr.new('192.168.1.4').to_i.to_s, network_name: manual_network_spec['name'], static: true, @@ -261,7 +264,7 @@ def tick 'disk_types' => [], 'vm_extensions' => [], 'networks' => [], - )) + )) end it 'can still get metrics without errors' do @@ -293,9 +296,18 @@ def tick expect(metric.get(labels: { name: 'good_deployment' })).to eq(0) end - context 'when the health monitor returns a non 200 response' do + it "emits the number of unhealthy agents for each deployment" do + metrics_collector.start + metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents) + expect(metric.get(labels: { name: "flaky_deployment" })).to eq(2) + expect(metric.get(labels: { name: "good_deployment" })).to eq(0) + end + + context "when the health monitor returns a non 200 response" do before do - stub_request(:get, '127.0.0.1:12345/unresponsive_agents') + stub_request(:get, "127.0.0.1:12345/unresponsive_agents") + .to_return(status: 404) + stub_request(:get, "127.0.0.1:12345/unhealthy_agents") .to_return(status: 404) end @@ -303,36 +315,48 @@ def tick metrics_collector.start metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents) expect(metric.values).to be_empty + metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents) + expect(metric.values).to be_empty end end context 'when the health monitor returns a non-json response' do before do - stub_request(:get, '127.0.0.1:12345/unresponsive_agents') - .to_return(status: 200, body: JSON.dump('bad response')) + stub_request(:get, "127.0.0.1:12345/unresponsive_agents") + .to_return(status: 200, body: JSON.dump("bad response")) + stub_request(:get, "127.0.0.1:12345/unhealthy_agents") + .to_return(status: 200, body: JSON.dump("bad response")) end it 'does not emit the vm metrics' do metrics_collector.start metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents) expect(metric.values).to be_empty + metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents) + expect(metric.values).to be_empty end end context 'when a deployment is deleted after metrics are gathered' do before do stub_request(:get, /unresponsive_agents/) - .to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0)) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) + stub_request(:get, /unhealthy_agents/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0)) metrics_collector.start stub_request(:get, /unresponsive_agents/) - .to_return(status: 200, body: JSON.dump('good_deployment' => 0)) + .to_return(status: 200, body: JSON.dump("good_deployment" => 0)) + stub_request(:get, /unhealthy_agents/) + .to_return(status: 200, body: JSON.dump("good_deployment" => 0)) scheduler.tick end it 'resets the metrics for the deleted deployment' do metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents) - expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0) + expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0) + metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents) + expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0) end end end diff --git a/src/bosh-monitor/lib/bosh/monitor/agent.rb b/src/bosh-monitor/lib/bosh/monitor/agent.rb index 8ba8b282538..0981f0e0f7a 100644 --- a/src/bosh-monitor/lib/bosh/monitor/agent.rb +++ b/src/bosh-monitor/lib/bosh/monitor/agent.rb @@ -1,8 +1,10 @@ module Bosh::Monitor class Agent - attr_reader :id - attr_reader :discovered_at + attr_reader :id + attr_reader :discovered_at attr_accessor :updated_at + attr_accessor :job_state + attr_accessor :process_length ATTRIBUTES = %i[deployment job index instance_id cid].freeze diff --git a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb index 21a27385a07..cfeffa5d8ba 100644 --- a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb +++ b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb @@ -40,5 +40,13 @@ def initialize(heartbeat_interval = 1) status(503) end end + + get "/unhealthy_agents" do + if @instance_manager.director_initial_deployment_sync_done + JSON.generate(@instance_manager.unhealthy_agents) + else + status(503) + end + end end end diff --git a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb index 5619af85ca0..888a07db19e 100644 --- a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb +++ b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb @@ -11,9 +11,9 @@ def initialize(attributes = {}) @id = @attributes['id'] @timestamp = begin Time.at(@attributes['timestamp']) - rescue StandardError + rescue StandardError @attributes['timestamp'] - end + end @deployment = @attributes['deployment'] @agent_id = @attributes['agent_id'] @@ -80,7 +80,8 @@ def to_hash metrics: @metrics.map(&:to_hash), } # Include process_length if present in attributes - result[:process_length] = @attributes['process_length'] if @attributes.key?('process_length') + result[:process_length] = @attributes["process_length"] if @attributes.key?("process_length") + result end diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 2c21f86e974..7da37afdac7 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -121,6 +121,17 @@ def unresponsive_agents agents_hash end + def unhealthy_agents + agents_hash = {} + @deployment_name_to_deployments.each do |name, deployment| + agents_hash[name] = deployment.agents.count do |agent| + agent.job_state && agent.job_state != "running" && agent.process_length == 0 + end + end + + agents_hash + end + def analyze_agents @logger.info('Analyzing agents...') started = Time.now @@ -325,7 +336,11 @@ def on_heartbeat(agent, deployment, message) message['instance_id'] = agent.instance_id message['teams'] = deployment ? deployment.teams : [] - return if message['instance_id'].nil? || message['job'].nil? || message['deployment'].nil? + # Store job_state and process_length on the agent for unhealthy detection + agent.job_state = message["job_state"] + agent.process_length = message["process_length"] + + return if message["instance_id"].nil? || message["job"].nil? || message["deployment"].nil? end @processor.process(:heartbeat, message) diff --git a/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb b/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb index 0b5a0544614..5d4fc44cf67 100644 --- a/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb +++ b/src/bosh-monitor/lib/bosh/monitor/plugins/riemann.rb @@ -33,23 +33,11 @@ def process(event) def process_heartbeat(event) payload = event.to_hash.merge(service: 'bosh.hm') payload.delete :vitals - - # Extract process_length if present (support symbol or string keys) - process_length = if payload.key?(:process_length) - payload[:process_length] - elsif payload.key?('process_length') - payload['process_length'] - end - event.metrics.each do |metric| - data = payload.merge( + client << payload.merge( name: metric.name, metric: metric.value, ) - # attach process_length as additional attribute when present - data[:process_length] = process_length unless process_length.nil? - - client << data rescue StandardError => e logger.error("Error sending riemann event: #{e}") end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb index c6b2b59c4fd..f689c7518fb 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb @@ -66,6 +66,17 @@ def make_agent(id) expect(agent.cid).to eq(instance.cid) expect(agent.instance_id).to eq(instance.id) end + + it "does not modify job_state or process_length when updating instance" do + agent = make_agent("agent_with_instance") + agent.job_state = "running" + agent.process_length = 3 + + agent.update_instance(instance) + + expect(agent.job_state).to eq("running") + expect(agent.process_length).to eq(3) + end end end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb index 11f565dc365..5cb0530a096 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb @@ -82,7 +82,36 @@ def app get '/unresponsive_agents' expect(last_response.status).to eq(503) end + end + end + + describe "/unhealthy_agents" do + let(:unhealthy_agents) do + { + "first_deployment" => 3, + "second_deployment" => 1, + } + end + before do + allow(Bosh::Monitor.instance_manager).to receive(:unhealthy_agents).and_return(unhealthy_agents) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + it "renders the unhealthy agents" do + get "/unhealthy_agents" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(unhealthy_agents)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /unhealthy_agents is requested" do + get "/unhealthy_agents" + expect(last_response.status).to eq(503) + end end end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb index 5f1e631d92b..6cd22315750 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb @@ -84,7 +84,7 @@ module Bosh::Monitor manager.sync_deployment_state({ 'name' => 'mycloud', 'teams' => ['ateam'] }, cloud1) expect(event_processor).to receive(:process).with( - :heartbeat, + :heartbeat, { 'timestamp' => an_instance_of(Integer), 'agent_id' => '007', 'deployment' => 'mycloud', @@ -273,7 +273,43 @@ module Bosh::Monitor expect(manager.unresponsive_agents).to eq('mycloud' => 0) ts = Time.now allow(Time).to receive(:now).and_return(ts + Bosh::Monitor.intervals.agent_timeout + 10) - expect(manager.unresponsive_agents).to eq('mycloud' => 3) + expect(manager.unresponsive_agents).to eq("mycloud" => 3) + end + end + + describe "#unhealthy_agents" do + it "can return number of unhealthy agents (job_state != running AND process_length == 0) for each deployment" do + instance1 = Bosh::Monitor::Instance.create("id" => "iuuid1", "agent_id" => "007", "index" => "0", "job" => "mutator") + instance2 = Bosh::Monitor::Instance.create("id" => "iuuid2", "agent_id" => "008", "index" => "0", "job" => "nats") + instance3 = Bosh::Monitor::Instance.create("id" => "iuuid3", "agent_id" => "009", "index" => "28", "job" => "mysql_node") + + manager.sync_deployments([{ "name" => "mycloud" }]) + manager.sync_agents("mycloud", [instance1, instance2, instance3]) + + # Initially all agents are healthy + expect(manager.unhealthy_agents).to eq("mycloud" => 0) + + # Set agent job_state != 'running' and process_length == 0 (unhealthy) + agent1 = manager.get_agents_for_deployment("mycloud")["007"] + agent1.job_state = "stopped" + agent1.process_length = 0 + expect(manager.unhealthy_agents).to eq("mycloud" => 1) + + # Set another agent to same state + agent2 = manager.get_agents_for_deployment("mycloud")["008"] + agent2.job_state = "failing" + agent2.process_length = 0 + expect(manager.unhealthy_agents).to eq("mycloud" => 2) + + # Agent with job_state='running' should not count as unhealthy + agent3 = manager.get_agents_for_deployment("mycloud")["009"] + agent3.job_state = "running" + agent3.process_length = 0 + expect(manager.unhealthy_agents).to eq("mycloud" => 2) + + # Agent with process_length > 0 should not count as unhealthy even if job_state != 'running' + agent1.process_length = 5 + expect(manager.unhealthy_agents).to eq("mycloud" => 1) end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb index 8ffe5893c89..de99685a3c0 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/plugins/riemann_spec.rb @@ -49,44 +49,4 @@ @plugin.process(heartbeat) end end - - it 'includes process_length in heartbeat events sent to Riemann' do - heartbeat = make_heartbeat(process_length: 42) - - Sync do - expect(@plugin.run).to be(true) - - allow(@client).to receive(:<<) - - # Verify that process_length is attached to events - expect(@client).to receive(:<<).at_least(1).times do |event| - expect(event[:service]).to eq('bosh.hm') - expect(event[:process_length]).to eq(42) - expect(event[:name]).not_to be_nil - expect(event[:metric]).not_to be_nil - end - - @plugin.process(heartbeat) - end - end - - it 'omits process_length when not present in heartbeat' do - heartbeat = make_heartbeat - - Sync do - expect(@plugin.run).to be(true) - - allow(@client).to receive(:<<) - - # Verify that process_length is NOT attached to events - expect(@client).to receive(:<<).at_least(1).times do |event| - expect(event[:service]).to eq('bosh.hm') - expect(event).not_to have_key(:process_length) - expect(event[:name]).not_to be_nil - expect(event[:metric]).not_to be_nil - end - - @plugin.process(heartbeat) - end - end end From 8c78fbf302c97e76b73d2953ab7c3d567af50655 Mon Sep 17 00:00:00 2001 From: I766702 Date: Tue, 2 Dec 2025 16:00:22 +0100 Subject: [PATCH 03/14] first prompt --- .../lib/bosh/monitor/api_controller.rb | 8 +++++ .../lib/bosh/monitor/instance_manager.rb | 13 ++++++++ .../unit/bosh/monitor/api_controller_spec.rb | 31 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb index cfeffa5d8ba..658de838eb1 100644 --- a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb +++ b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb @@ -48,5 +48,13 @@ def initialize(heartbeat_interval = 1) status(503) end end + + get '/total_available_agents' do + if @instance_manager.director_initial_deployment_sync_done + JSON.generate(@instance_manager.total_available_agents) + else + status(503) + end + end end end diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 7da37afdac7..3a0b57ddc60 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -132,6 +132,19 @@ def unhealthy_agents agents_hash end + # Returns a hash of deployment_name => count of agents that are available. + # "Available" here means agent is not timed out and is not considered rogue. + def total_available_agents + agents_hash = {} + @deployment_name_to_deployments.each do |name, deployment| + agents_hash[name] = deployment.agents.count do |agent| + !agent.timed_out? && !agent.rogue? + end + end + + agents_hash + end + def analyze_agents @logger.info('Analyzing agents...') started = Time.now diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb index 5cb0530a096..9fe82b6ab29 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb @@ -114,4 +114,35 @@ def app end end end + + describe "/total_available_agents" do + let(:available_agents) do + { + "first_deployment" => 5, + "second_deployment" => 2, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:total_available_agents).and_return(available_agents) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + + it "renders the total available agents" do + get "/total_available_agents" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(available_agents)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /total_available_agents is requested" do + get "/total_available_agents" + expect(last_response.status).to eq(503) + end + end + end end From 538ffdcdb5ba9354ffe7de7b7668a2620f38b558 Mon Sep 17 00:00:00 2001 From: I766702 Date: Tue, 2 Dec 2025 16:06:08 +0100 Subject: [PATCH 04/14] saved work --- src/bosh-monitor/lib/bosh/monitor/instance_manager.rb | 5 ++--- .../spec/unit/bosh/monitor/api_controller_spec.rb | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 3a0b57ddc60..9de818b23f0 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -137,9 +137,8 @@ def unhealthy_agents def total_available_agents agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| - agents_hash[name] = deployment.agents.count do |agent| - !agent.timed_out? && !agent.rogue? - end + # Count all agents for the deployment (no additional criteria) + agents_hash[name] = deployment.agents.count end agents_hash diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb index 9fe82b6ab29..3208147d8af 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb @@ -128,7 +128,7 @@ def app allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) end - it "renders the total available agents" do + it "renders the total available agents (all agents, no criteria)" do get "/total_available_agents" expect(last_response.status).to eq(200) expect(last_response.body).to eq(JSON.generate(available_agents)) From 1a099746ed907dc979ff0827c090b2c79d2e00cd Mon Sep 17 00:00:00 2001 From: I766702 Date: Wed, 3 Dec 2025 16:35:39 +0100 Subject: [PATCH 05/14] added failing agents endpoint --- .../lib/bosh/monitor/api_controller.rb | 8 +++++ .../lib/bosh/monitor/instance_manager.rb | 15 +++++++-- .../unit/bosh/monitor/api_controller_spec.rb | 31 +++++++++++++++++++ .../bosh/monitor/instance_manager_spec.rb | 23 ++++++++++++++ 4 files changed, 74 insertions(+), 3 deletions(-) diff --git a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb index 658de838eb1..f271ee4cb19 100644 --- a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb +++ b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb @@ -56,5 +56,13 @@ def initialize(heartbeat_interval = 1) status(503) end end + + get '/failing_agents' do + if @instance_manager.director_initial_deployment_sync_done + JSON.generate(@instance_manager.failing_agents) + else + status(503) + end + end end end diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 9de818b23f0..0509ad0303b 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -132,15 +132,24 @@ def unhealthy_agents agents_hash end - # Returns a hash of deployment_name => count of agents that are available. - # "Available" here means agent is not timed out and is not considered rogue. + def failing_agents + agents_hash = {} + @deployment_name_to_deployments.each do |name, deployment| + agents_hash[name] = deployment.agents.count { |agent| agent.job_state == 'failing' } + end + + agents_hash + end + def total_available_agents agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| - # Count all agents for the deployment (no additional criteria) + # Count all agents for the deployment agents_hash[name] = deployment.agents.count end + agents_hash['unmanaged'] = @unmanaged_agents.keys.size + agents_hash end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb index 3208147d8af..11590dfd9f4 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb @@ -145,4 +145,35 @@ def app end end end + + describe "/failing_agents" do + let(:failing_agents) do + { + "first_deployment" => 2, + "second_deployment" => 0, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:failing_agents).and_return(failing_agents) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + + it "renders failing agents" do + get "/failing_agents" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(failing_agents)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /failing_agents is requested" do + get "/failing_agents" + expect(last_response.status).to eq(503) + end + end + end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb index 6cd22315750..ebc466f3a36 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb @@ -313,6 +313,29 @@ module Bosh::Monitor end end + describe '#total_available_agents' do + it 'counts all agents for each deployment and includes unmanaged agents' do + instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator') + instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats') + + manager.sync_deployments([{ 'name' => 'mycloud' }]) + manager.sync_agents('mycloud', [instance1, instance2]) + + # Initially both agents are present + expect(manager.total_available_agents).to include('mycloud' => 2) + + # Add an unmanaged (rogue) agent via heartbeat processing + manager.process_event(:heartbeat, 'hm.agent.heartbeat.unmanaged-1') + expect(manager.total_available_agents['unmanaged']).to eq(1) + + # Simulate timed out agents -- they should still be counted as part of the deployment total + ts = Time.now + allow(Time).to receive(:now).and_return(ts + Bosh::Monitor.intervals.agent_timeout + 100) + expect(manager.unresponsive_agents['mycloud']).to eq(2) + expect(manager.total_available_agents).to include('mycloud' => 2) + end + end + describe '#analyze_agents' do let(:instance_1) { Bosh::Monitor::Instance.create('id' => 'instance-uuid-1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator') } let(:instance_2) { Bosh::Monitor::Instance.create('id' => 'instance-uuid-2', 'agent_id' => '008', 'index' => '1', 'job' => 'mutator') } From d656019eceea17dd869f45616dad08c68835e065 Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 4 Dec 2025 10:39:13 +0100 Subject: [PATCH 06/14] added stopped and unknown instances endpoints --- .../lib/bosh/director/metrics_collector.rb | 93 +++++++++++++++++++ .../lib/bosh/monitor/api_controller.rb | 20 +++- .../lib/bosh/monitor/instance_manager.rb | 24 ++++- 3 files changed, 132 insertions(+), 5 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index def8a4e7dd2..1e768142b6d 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -48,6 +48,27 @@ def initialize(config) labels: %i[name], docstring: "Number of unhealthy agents (job_state != running AND process_length == 0) per deployment", ) + @total_available_agents = Prometheus::Client.registry.gauge( + :bosh_total_available_agents, + labels: %i[name], + docstring: "Number of total available agents (all agents, no criteria) per deployment", + ) + @failing_clients = Prometheus::Client.registry.gauge( + :bosh_failing_clients, + labels: %i[name], + docstring: "Number of failing instances (job_state == 'failing') per deployment", + ) + @stopped_instances = Prometheus::Client.registry.gauge( + :bosh_stopped_instances, + labels: %i[name], + docstring: "Number of instances with job_state == 'stopped' per deployment", + ) + + @unknown_instances = Prometheus::Client.registry.gauge( + :bosh_unknown_instances, + labels: %i[name], + docstring: "Number of instances with unknown job_state per deployment", + ) @scheduler = Rufus::Scheduler.new end @@ -162,6 +183,78 @@ def populate_vm_metrics removed_unhealthy_deployments.each do |deployment| @unhealthy_agents.set(0, labels: { name: deployment }) end + + # Fetch and populate total_available_agents metrics + response_total = Net::HTTP.get_response('127.0.0.1', '/total_available_agents', @config.health_monitor_port) + if response_total.is_a?(Net::HTTPSuccess) + total_agent_counts = JSON.parse(response_total.body) rescue nil + if total_agent_counts.is_a?(Hash) + existing_total_deployment_names = @total_available_agents.values.map { |key, _| key[:name] } + + total_agent_counts.each do |deployment, count| + @total_available_agents.set(count, labels: { name: deployment }) + end + + removed_total_deployments = existing_total_deployment_names - total_agent_counts.keys + removed_total_deployments.each do |deployment| + @total_available_agents.set(0, labels: { name: deployment }) + end + end + end + + # Fetch and populate failing_instances metrics + response_failing = Net::HTTP.get_response('127.0.0.1', '/failing_instances', @config.health_monitor_port) + if response_failing.is_a?(Net::HTTPSuccess) + failing_counts = JSON.parse(response_failing.body) rescue nil + if failing_counts.is_a?(Hash) + existing_failing_deployment_names = @failing_clients.values.map { |key, _| key[:name] } + + failing_counts.each do |deployment, count| + @failing_clients.set(count, labels: { name: deployment }) + end + + removed_failing_deployments = existing_failing_deployment_names - failing_counts.keys + removed_failing_deployments.each do |deployment| + @failing_clients.set(0, labels: { name: deployment }) + end + end + end + + # Fetch and populate stopped_instances metrics + response_stopped = Net::HTTP.get_response('127.0.0.1', '/stopped_instances', @config.health_monitor_port) + if response_stopped.is_a?(Net::HTTPSuccess) + stopped_counts = JSON.parse(response_stopped.body) rescue nil + if stopped_counts.is_a?(Hash) + existing_stopped_deployment_names = @stopped_instances.values.map { |key, _| key[:name] } + + stopped_counts.each do |deployment, count| + @stopped_instances.set(count, labels: { name: deployment }) + end + + removed_stopped_deployments = existing_stopped_deployment_names - stopped_counts.keys + removed_stopped_deployments.each do |deployment| + @stopped_instances.set(0, labels: { name: deployment }) + end + end + end + + # Fetch and populate unknown_instances metrics + response_unknown = Net::HTTP.get_response('127.0.0.1', '/unknown_instances', @config.health_monitor_port) + if response_unknown.is_a?(Net::HTTPSuccess) + unknown_counts = JSON.parse(response_unknown.body) rescue nil + if unknown_counts.is_a?(Hash) + existing_unknown_deployment_names = @unknown_instances.values.map { |key, _| key[:name] } + + unknown_counts.each do |deployment, count| + @unknown_instances.set(count, labels: { name: deployment }) + end + + removed_unknown_deployments = existing_unknown_deployment_names - unknown_counts.keys + removed_unknown_deployments.each do |deployment| + @unknown_instances.set(0, labels: { name: deployment }) + end + end + end end def populate_network_metrics diff --git a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb index f271ee4cb19..f88764f3ea1 100644 --- a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb +++ b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb @@ -57,9 +57,25 @@ def initialize(heartbeat_interval = 1) end end - get '/failing_agents' do + get '/failing_instances' do if @instance_manager.director_initial_deployment_sync_done - JSON.generate(@instance_manager.failing_agents) + JSON.generate(@instance_manager.failing_instances) + else + status(503) + end + end + + get '/stopped_instances' do + if @instance_manager.director_initial_deployment_sync_done + JSON.generate(@instance_manager.stopped_instances) + else + status(503) + end + end + + get '/unknown_instances' do + if @instance_manager.director_initial_deployment_sync_done + JSON.generate(@instance_manager.unknown_instances) else status(503) end diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 0509ad0303b..31215c7aefe 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -131,8 +131,7 @@ def unhealthy_agents agents_hash end - - def failing_agents + def failing_instances agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| agents_hash[name] = deployment.agents.count { |agent| agent.job_state == 'failing' } @@ -141,13 +140,32 @@ def failing_agents agents_hash end + def stopped_instances + agents_hash = {} + @deployment_name_to_deployments.each do |name, deployment| + agents_hash[name] = deployment.agents.count { |agent| agent.job_state == 'stopped' } + end + + agents_hash + end + + def unknown_instances + agents_hash = {} + @deployment_name_to_deployments.each do |name, deployment| + agents_hash[name] = deployment.agents.count { |agent| agent.job_state.nil? } + end + + agents_hash + end + def total_available_agents agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| - # Count all agents for the deployment + # Count all agents for the deployment (no additional criteria) agents_hash[name] = deployment.agents.count end + # Include unmanaged (rogue) agents in the aggregate under 'unmanaged' agents_hash['unmanaged'] = @unmanaged_agents.keys.size agents_hash From 8bf025b1b79f644eac03a2fc5708e0d0363a485d Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 4 Dec 2025 12:46:05 +0100 Subject: [PATCH 07/14] refactored --- .../lib/bosh/director/metrics_collector.rb | 12 ++++++------ src/bosh-monitor/lib/bosh/monitor/agent.rb | 4 ++-- .../lib/bosh/monitor/events/heartbeat.rb | 4 ++-- .../lib/bosh/monitor/instance_manager.rb | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index 1e768142b6d..a41265b7ef6 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -46,15 +46,15 @@ def initialize(config) @unhealthy_agents = Prometheus::Client.registry.gauge( :bosh_unhealthy_agents, labels: %i[name], - docstring: "Number of unhealthy agents (job_state != running AND process_length == 0) per deployment", + docstring: "Number of unhealthy agents (job_state = running AND process_length == 0) per deployment", ) @total_available_agents = Prometheus::Client.registry.gauge( :bosh_total_available_agents, labels: %i[name], docstring: "Number of total available agents (all agents, no criteria) per deployment", ) - @failing_clients = Prometheus::Client.registry.gauge( - :bosh_failing_clients, + @failing_instances = Prometheus::Client.registry.gauge( + :bosh_failing_instances, labels: %i[name], docstring: "Number of failing instances (job_state == 'failing') per deployment", ) @@ -207,15 +207,15 @@ def populate_vm_metrics if response_failing.is_a?(Net::HTTPSuccess) failing_counts = JSON.parse(response_failing.body) rescue nil if failing_counts.is_a?(Hash) - existing_failing_deployment_names = @failing_clients.values.map { |key, _| key[:name] } + existing_failing_deployment_names = @failing_instances.values.map { |key, _| key[:name] } failing_counts.each do |deployment, count| - @failing_clients.set(count, labels: { name: deployment }) + @failing_instances.set(count, labels: { name: deployment }) end removed_failing_deployments = existing_failing_deployment_names - failing_counts.keys removed_failing_deployments.each do |deployment| - @failing_clients.set(0, labels: { name: deployment }) + @failing_instances.set(0, labels: { name: deployment }) end end end diff --git a/src/bosh-monitor/lib/bosh/monitor/agent.rb b/src/bosh-monitor/lib/bosh/monitor/agent.rb index 0981f0e0f7a..e5d9c4a8e5a 100644 --- a/src/bosh-monitor/lib/bosh/monitor/agent.rb +++ b/src/bosh-monitor/lib/bosh/monitor/agent.rb @@ -1,7 +1,7 @@ module Bosh::Monitor class Agent - attr_reader :id - attr_reader :discovered_at + attr_reader :id + attr_reader :discovered_at attr_accessor :updated_at attr_accessor :job_state attr_accessor :process_length diff --git a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb index 888a07db19e..4bc5b8beb64 100644 --- a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb +++ b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb @@ -11,9 +11,9 @@ def initialize(attributes = {}) @id = @attributes['id'] @timestamp = begin Time.at(@attributes['timestamp']) - rescue StandardError + rescue StandardError @attributes['timestamp'] - end + end @deployment = @attributes['deployment'] @agent_id = @attributes['agent_id'] diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 31215c7aefe..9e37c067279 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -125,7 +125,7 @@ def unhealthy_agents agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| agents_hash[name] = deployment.agents.count do |agent| - agent.job_state && agent.job_state != "running" && agent.process_length == 0 + agent.job_state && agent.job_state = "running" && agent.process_length == 0 end end From e66c2702e4f2cb947444673f9515c246e211be06 Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 4 Dec 2025 12:57:06 +0100 Subject: [PATCH 08/14] fixed api controller tests --- .../unit/bosh/monitor/api_controller_spec.rb | 78 +++++++++++++++++-- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb index 11590dfd9f4..da700a2115a 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb @@ -146,8 +146,8 @@ def app end end - describe "/failing_agents" do - let(:failing_agents) do + describe "/failing_instances" do + let(:failing_instances) do { "first_deployment" => 2, "second_deployment" => 0, @@ -155,14 +155,14 @@ def app end before do - allow(Bosh::Monitor.instance_manager).to receive(:failing_agents).and_return(failing_agents) + allow(Bosh::Monitor.instance_manager).to receive(:failing_instances).and_return(failing_instances) allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) end - it "renders failing agents" do - get "/failing_agents" + it "renders failing instances" do + get "/failing_instances" expect(last_response.status).to eq(200) - expect(last_response.body).to eq(JSON.generate(failing_agents)) + expect(last_response.body).to eq(JSON.generate(failing_instances)) end context "When director initial deployment sync has not completed" do @@ -170,8 +170,70 @@ def app allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) end - it "returns 503 when /failing_agents is requested" do - get "/failing_agents" + it "returns 503 when /failing_instances is requested" do + get "/failing_instances" + expect(last_response.status).to eq(503) + end + end + end + + describe "/stopped_instances" do + let(:stopped_instances) do + { + "first_deployment" => 1, + "second_deployment" => 0, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:stopped_instances).and_return(stopped_instances) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + + it "renders stopped instances" do + get "/stopped_instances" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(stopped_instances)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /stopped_instances is requested" do + get "/stopped_instances" + expect(last_response.status).to eq(503) + end + end + end + + describe "/unknown_instances" do + let(:unknown_instances) do + { + "first_deployment" => 0, + "second_deployment" => 1, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:unknown_instances).and_return(unknown_instances) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + + it "renders unknown instances" do + get "/unknown_instances" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(unknown_instances)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /unknown_instances is requested" do + get "/unknown_instances" expect(last_response.status).to eq(503) end end From 4227f2958c0e7645f319776fca46da8657ec51af Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 4 Dec 2025 13:09:31 +0100 Subject: [PATCH 09/14] fixed instance manager tests --- .../lib/bosh/monitor/instance_manager.rb | 2 +- .../bosh/monitor/instance_manager_spec.rb | 70 +++++++++++++++++-- 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 9e37c067279..931d92fb703 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -125,7 +125,7 @@ def unhealthy_agents agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| agents_hash[name] = deployment.agents.count do |agent| - agent.job_state && agent.job_state = "running" && agent.process_length == 0 + agent.job_state && agent.job_state == "running" && agent.process_length == 0 end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb index ebc466f3a36..984e0b03749 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb @@ -278,7 +278,7 @@ module Bosh::Monitor end describe "#unhealthy_agents" do - it "can return number of unhealthy agents (job_state != running AND process_length == 0) for each deployment" do + it "can return number of unhealthy agents (job_state == 'running' AND process_length == 0) for each deployment" do instance1 = Bosh::Monitor::Instance.create("id" => "iuuid1", "agent_id" => "007", "index" => "0", "job" => "mutator") instance2 = Bosh::Monitor::Instance.create("id" => "iuuid2", "agent_id" => "008", "index" => "0", "job" => "nats") instance3 = Bosh::Monitor::Instance.create("id" => "iuuid3", "agent_id" => "009", "index" => "28", "job" => "mysql_node") @@ -289,25 +289,25 @@ module Bosh::Monitor # Initially all agents are healthy expect(manager.unhealthy_agents).to eq("mycloud" => 0) - # Set agent job_state != 'running' and process_length == 0 (unhealthy) + # Set agent job_state == 'running' and process_length == 0 (unhealthy) agent1 = manager.get_agents_for_deployment("mycloud")["007"] - agent1.job_state = "stopped" + agent1.job_state = "running" agent1.process_length = 0 expect(manager.unhealthy_agents).to eq("mycloud" => 1) # Set another agent to same state agent2 = manager.get_agents_for_deployment("mycloud")["008"] - agent2.job_state = "failing" + agent2.job_state = "running" agent2.process_length = 0 expect(manager.unhealthy_agents).to eq("mycloud" => 2) - # Agent with job_state='running' should not count as unhealthy + # Agent with job_state != 'running' should not count as unhealthy agent3 = manager.get_agents_for_deployment("mycloud")["009"] - agent3.job_state = "running" + agent3.job_state = "stopped" agent3.process_length = 0 expect(manager.unhealthy_agents).to eq("mycloud" => 2) - # Agent with process_length > 0 should not count as unhealthy even if job_state != 'running' + # Agent with process_length > 0 should not count as unhealthy even if job_state == 'running' agent1.process_length = 5 expect(manager.unhealthy_agents).to eq("mycloud" => 1) end @@ -336,6 +336,62 @@ module Bosh::Monitor end end + describe '#failing_instances' do + it 'counts agents with job_state == failing for each deployment' do + instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator') + instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats') + + manager.sync_deployments([{ 'name' => 'mycloud' }]) + manager.sync_agents('mycloud', [instance1, instance2]) + + # Initially none are failing + expect(manager.failing_instances).to eq('mycloud' => 0) + + agent1 = manager.get_agents_for_deployment('mycloud')['007'] + agent1.job_state = 'failing' + + expect(manager.failing_instances).to eq('mycloud' => 1) + end + end + + describe '#stopped_instances' do + it 'counts agents with job_state == stopped for each deployment' do + instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator') + instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats') + + manager.sync_deployments([{ 'name' => 'mycloud' }]) + manager.sync_agents('mycloud', [instance1, instance2]) + + # Initially none are stopped + expect(manager.stopped_instances).to eq('mycloud' => 0) + + agent2 = manager.get_agents_for_deployment('mycloud')['008'] + agent2.job_state = 'stopped' + + expect(manager.stopped_instances).to eq('mycloud' => 1) + end + end + + describe '#unknown_instances' do + it 'counts agents with unknown (nil) job_state for each deployment' do + instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator') + instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats') + + manager.sync_deployments([{ 'name' => 'mycloud' }]) + manager.sync_agents('mycloud', [instance1, instance2]) + + # Ensure both have a defined state first + manager.get_agents_for_deployment('mycloud').each_value { |a| a.job_state = 'running' } + expect(manager.unknown_instances).to eq('mycloud' => 0) + + # Set one to unknown + agent1 = manager.get_agents_for_deployment('mycloud')['007'] + agent1.job_state = nil + + expect(manager.unknown_instances).to eq('mycloud' => 1) + end + end + describe '#analyze_agents' do let(:instance_1) { Bosh::Monitor::Instance.create('id' => 'instance-uuid-1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator') } let(:instance_2) { Bosh::Monitor::Instance.create('id' => 'instance-uuid-2', 'agent_id' => '008', 'index' => '1', 'job' => 'mutator') } From 6364ec4646e9ebe1a4cd384f91244d2dd28679e2 Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 4 Dec 2025 13:18:04 +0100 Subject: [PATCH 10/14] fixed metrics collector tests --- .../lib/bosh/director/metrics_collector.rb | 12 +++---- .../bosh/director/metrics_collector_spec.rb | 34 +++++++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index a41265b7ef6..9fcad92f38d 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -40,34 +40,34 @@ def initialize(config) @unresponsive_agents = Prometheus::Client.registry.gauge( :bosh_unresponsive_agents, labels: %i[name], - docstring: "Number of unresponsive agents per deployment", + docstring: 'Number of unresponsive agents per deployment', ) @unhealthy_agents = Prometheus::Client.registry.gauge( :bosh_unhealthy_agents, labels: %i[name], - docstring: "Number of unhealthy agents (job_state = running AND process_length == 0) per deployment", + docstring: 'Number of unhealthy agents (job_state = running AND process_length == 0) per deployment', ) @total_available_agents = Prometheus::Client.registry.gauge( :bosh_total_available_agents, labels: %i[name], - docstring: "Number of total available agents (all agents, no criteria) per deployment", + docstring: 'Number of total available agents (all agents, no criteria) per deployment', ) @failing_instances = Prometheus::Client.registry.gauge( :bosh_failing_instances, labels: %i[name], - docstring: "Number of failing instances (job_state == 'failing') per deployment", + docstring: 'Number of failing instances (job_state == "failing") per deployment', ) @stopped_instances = Prometheus::Client.registry.gauge( :bosh_stopped_instances, labels: %i[name], - docstring: "Number of instances with job_state == 'stopped' per deployment", + docstring: 'Number of instances with job_state == "stopped" per deployment', ) @unknown_instances = Prometheus::Client.registry.gauge( :bosh_unknown_instances, labels: %i[name], - docstring: "Number of instances with unknown job_state per deployment", + docstring: 'Number of instances with unknown job_state per deployment', ) @scheduler = Rufus::Scheduler.new end diff --git a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb index 02f92fc8029..52d99796ec0 100644 --- a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb +++ b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb @@ -37,6 +37,14 @@ def tick .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) stub_request(:get, /unhealthy_agents/) .to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0)) + stub_request(:get, /total_available_agents/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 3, "good_deployment" => 2)) + stub_request(:get, /failing_instances/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) + stub_request(:get, /stopped_instances/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 0)) + stub_request(:get, /unknown_instances/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 1)) end after do @@ -47,6 +55,10 @@ def tick Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total) Prometheus::Client.registry.unregister(:bosh_unresponsive_agents) Prometheus::Client.registry.unregister(:bosh_unhealthy_agents) + Prometheus::Client.registry.unregister(:bosh_total_available_agents) + Prometheus::Client.registry.unregister(:bosh_failing_instances) + Prometheus::Client.registry.unregister(:bosh_stopped_instances) + Prometheus::Client.registry.unregister(:bosh_unknown_instances) end describe "#prep" do @@ -303,6 +315,28 @@ def tick expect(metric.get(labels: { name: "good_deployment" })).to eq(0) end + it 'emits the number of total available agents for each deployment' do + metrics_collector.start + metric = Prometheus::Client.registry.get(:bosh_total_available_agents) + expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(3) + expect(metric.get(labels: { name: 'good_deployment' })).to eq(2) + end + + it 'emits the failing/stopped/unknown instance metrics for each deployment' do + metrics_collector.start + metric = Prometheus::Client.registry.get(:bosh_failing_instances) + expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1) + expect(metric.get(labels: { name: 'good_deployment' })).to eq(0) + + metric = Prometheus::Client.registry.get(:bosh_stopped_instances) + expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0) + expect(metric.get(labels: { name: 'good_deployment' })).to eq(0) + + metric = Prometheus::Client.registry.get(:bosh_unknown_instances) + expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0) + expect(metric.get(labels: { name: 'good_deployment' })).to eq(1) + end + context "when the health monitor returns a non 200 response" do before do stub_request(:get, "127.0.0.1:12345/unresponsive_agents") From 31cd750c2670ea1c29d73b32e9a8ba7232458411 Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 4 Dec 2025 15:23:27 +0100 Subject: [PATCH 11/14] formatted docstring --- .../lib/bosh/director/metrics_collector.rb | 4 ++-- .../unit/bosh/director/metrics_collector_spec.rb | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index 9fcad92f38d..a0911986bd9 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -46,7 +46,7 @@ def initialize(config) @unhealthy_agents = Prometheus::Client.registry.gauge( :bosh_unhealthy_agents, labels: %i[name], - docstring: 'Number of unhealthy agents (job_state = running AND process_length == 0) per deployment', + docstring: 'Number of unhealthy agents (job_state == running AND process_length == 0) per deployment', ) @total_available_agents = Prometheus::Client.registry.gauge( :bosh_total_available_agents, @@ -61,7 +61,7 @@ def initialize(config) @stopped_instances = Prometheus::Client.registry.gauge( :bosh_stopped_instances, labels: %i[name], - docstring: 'Number of instances with job_state == "stopped" per deployment', + docstring: 'Number of instances (job_state == "stopped") per deployment', ) @unknown_instances = Prometheus::Client.registry.gauge( diff --git a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb index 52d99796ec0..526898a2fd6 100644 --- a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb +++ b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb @@ -61,7 +61,7 @@ def tick Prometheus::Client.registry.unregister(:bosh_unknown_instances) end - describe "#prep" do + describe '#prep' do let(:db_migrator) { instance_double(DBMigrator) } let(:db) { instance_double(Sequel::Database) } let(:logger) { double(Logging::Logger) } @@ -195,7 +195,7 @@ def tick 'networks' => [older_network_spec], 'vm_extensions' => [], 'compilation' => { 'az' => 'az-1', 'network' => manual_network_spec['name'], 'workers' => 3 }, - )) + )) FactoryBot.create(:models_config_cloud, name: 'some-cloud-config', content: YAML.dump( 'azs' => [az], @@ -204,7 +204,7 @@ def tick 'networks' => [dynamic_network_spec, manual_network_spec, vip_network_spec, weird_name_network_spec], 'vm_extensions' => [], 'compilation' => { 'az' => 'az-1', 'network' => manual_network_spec['name'], 'workers' => 3 }, - )) + )) end it 'emits the total number of dynamic IPs in the network' do @@ -234,8 +234,8 @@ def tick before do FactoryBot.create(:models_ip_address, - instance_id: instance.id, - vm_id: vm.id, + instance_id: instance.id, + vm_id: vm.id, address_str: IPAddr.new('192.168.1.5').to_i.to_s, network_name: manual_network_spec['name'], static: false, @@ -251,8 +251,8 @@ def tick context 'when deployed VMs are using static ips' do before do FactoryBot.create(:models_ip_address, - instance_id: instance.id, - vm_id: vm.id, + instance_id: instance.id, + vm_id: vm.id, address_str: IPAddr.new('192.168.1.4').to_i.to_s, network_name: manual_network_spec['name'], static: true, @@ -276,7 +276,7 @@ def tick 'disk_types' => [], 'vm_extensions' => [], 'networks' => [], - )) + )) end it 'can still get metrics without errors' do From ddb67f01df03a53d64232d7f899593c43f6f3a5b Mon Sep 17 00:00:00 2001 From: I766702 Date: Thu, 11 Dec 2025 15:40:37 +0100 Subject: [PATCH 12/14] renamed process_length into number_of_processes --- .../lib/bosh/director/metrics_collector.rb | 2 +- src/bosh-monitor/lib/bosh/monitor/agent.rb | 2 +- .../lib/bosh/monitor/events/heartbeat.rb | 4 ++-- .../lib/bosh/monitor/instance_manager.rb | 6 +++--- .../spec/unit/bosh/monitor/agent_spec.rb | 6 +++--- .../unit/bosh/monitor/instance_manager_spec.rb | 14 +++++++------- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index a0911986bd9..cad704afa8c 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -46,7 +46,7 @@ def initialize(config) @unhealthy_agents = Prometheus::Client.registry.gauge( :bosh_unhealthy_agents, labels: %i[name], - docstring: 'Number of unhealthy agents (job_state == running AND process_length == 0) per deployment', + docstring: 'Number of unhealthy agents (job_state == running AND number_of_processes == 0) per deployment', ) @total_available_agents = Prometheus::Client.registry.gauge( :bosh_total_available_agents, diff --git a/src/bosh-monitor/lib/bosh/monitor/agent.rb b/src/bosh-monitor/lib/bosh/monitor/agent.rb index e5d9c4a8e5a..ec6dd773e31 100644 --- a/src/bosh-monitor/lib/bosh/monitor/agent.rb +++ b/src/bosh-monitor/lib/bosh/monitor/agent.rb @@ -4,7 +4,7 @@ class Agent attr_reader :discovered_at attr_accessor :updated_at attr_accessor :job_state - attr_accessor :process_length + attr_accessor :number_of_processes ATTRIBUTES = %i[deployment job index instance_id cid].freeze diff --git a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb index 4bc5b8beb64..6a298f746b9 100644 --- a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb +++ b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb @@ -79,8 +79,8 @@ def to_hash teams: @teams, metrics: @metrics.map(&:to_hash), } - # Include process_length if present in attributes - result[:process_length] = @attributes["process_length"] if @attributes.key?("process_length") + # Include number_of_processes if present in attributes + result[:number_of_processes] = @attributes["number_of_processes"] if @attributes.key?("number_of_processes") result end diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 931d92fb703..7774de7cab8 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -125,7 +125,7 @@ def unhealthy_agents agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| agents_hash[name] = deployment.agents.count do |agent| - agent.job_state && agent.job_state == "running" && agent.process_length == 0 + agent.job_state && agent.job_state == "running" && agent.number_of_processes == 0 end end @@ -375,9 +375,9 @@ def on_heartbeat(agent, deployment, message) message['instance_id'] = agent.instance_id message['teams'] = deployment ? deployment.teams : [] - # Store job_state and process_length on the agent for unhealthy detection + # Store job_state and number_of_processes on the agent for unhealthy detection agent.job_state = message["job_state"] - agent.process_length = message["process_length"] + agent.number_of_processes = message["number_of_processes"] return if message["instance_id"].nil? || message["job"].nil? || message["deployment"].nil? end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb index f689c7518fb..baf3cbc4950 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb @@ -67,15 +67,15 @@ def make_agent(id) expect(agent.instance_id).to eq(instance.id) end - it "does not modify job_state or process_length when updating instance" do + it "does not modify job_state or number_of_processes when updating instance" do agent = make_agent("agent_with_instance") agent.job_state = "running" - agent.process_length = 3 + agent.number_of_processes = 3 agent.update_instance(instance) expect(agent.job_state).to eq("running") - expect(agent.process_length).to eq(3) + expect(agent.number_of_processes).to eq(3) end end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb index 984e0b03749..f004e6b2651 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb +++ b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb @@ -278,7 +278,7 @@ module Bosh::Monitor end describe "#unhealthy_agents" do - it "can return number of unhealthy agents (job_state == 'running' AND process_length == 0) for each deployment" do + it "can return number of unhealthy agents (job_state == 'running' AND number_of_processes == 0) for each deployment" do instance1 = Bosh::Monitor::Instance.create("id" => "iuuid1", "agent_id" => "007", "index" => "0", "job" => "mutator") instance2 = Bosh::Monitor::Instance.create("id" => "iuuid2", "agent_id" => "008", "index" => "0", "job" => "nats") instance3 = Bosh::Monitor::Instance.create("id" => "iuuid3", "agent_id" => "009", "index" => "28", "job" => "mysql_node") @@ -289,26 +289,26 @@ module Bosh::Monitor # Initially all agents are healthy expect(manager.unhealthy_agents).to eq("mycloud" => 0) - # Set agent job_state == 'running' and process_length == 0 (unhealthy) + # Set agent job_state == 'running' and number_of_processes == 0 (unhealthy) agent1 = manager.get_agents_for_deployment("mycloud")["007"] agent1.job_state = "running" - agent1.process_length = 0 + agent1.number_of_processes = 0 expect(manager.unhealthy_agents).to eq("mycloud" => 1) # Set another agent to same state agent2 = manager.get_agents_for_deployment("mycloud")["008"] agent2.job_state = "running" - agent2.process_length = 0 + agent2.number_of_processes = 0 expect(manager.unhealthy_agents).to eq("mycloud" => 2) # Agent with job_state != 'running' should not count as unhealthy agent3 = manager.get_agents_for_deployment("mycloud")["009"] agent3.job_state = "stopped" - agent3.process_length = 0 + agent3.number_of_processes = 0 expect(manager.unhealthy_agents).to eq("mycloud" => 2) - # Agent with process_length > 0 should not count as unhealthy even if job_state == 'running' - agent1.process_length = 5 + # Agent with number_of_processes > 0 should not count as unhealthy even if job_state == 'running' + agent1.number_of_processes = 5 expect(manager.unhealthy_agents).to eq("mycloud" => 1) end end From 186820e8572e127ec59e9c5fba97d93edc522205 Mon Sep 17 00:00:00 2001 From: I766702 Date: Wed, 7 Jan 2026 15:09:12 +0100 Subject: [PATCH 13/14] refactored repeating patterns --- .../lib/bosh/director/metrics_collector.rb | 117 +++--------------- .../bosh/director/metrics_collector_spec.rb | 16 +-- .../lib/bosh/monitor/api_controller.rb | 2 +- .../lib/bosh/monitor/instance_manager.rb | 4 +- 4 files changed, 28 insertions(+), 111 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index cad704afa8c..63ff9c3182b 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -142,118 +142,35 @@ def populate_metrics end def populate_vm_metrics - response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port) + fetch_and_update_gauge('/unresponsive_agents', @unresponsive_agents) + fetch_and_update_gauge('/unhealthy_agents', @unhealthy_agents) + fetch_and_update_gauge('/total_available_agents', @total_available_agents) + fetch_and_update_gauge('/failing_instances', @failing_instances) + fetch_and_update_gauge('/stopped_instances', @stopped_instances) + fetch_and_update_gauge('/unknown_instances', @unknown_instances) + end + + def fetch_and_update_gauge(endpoint, gauge) + response = Net::HTTP.get_response('127.0.0.1', endpoint, @config.health_monitor_port) return unless response.is_a?(Net::HTTPSuccess) - unresponsive_agent_counts = JSON.parse(response.body) - return unless unresponsive_agent_counts.is_a?(Hash) + deployment_counts = JSON.parse(response.body) rescue nil + return unless deployment_counts.is_a?(Hash) - existing_deployment_names = @unresponsive_agents.values.map do |key, _| + existing_deployment_names = gauge.values.map do |key, _| # The keys within the Prometheus::Client::Metric#values method are actually hashes. So the # data returned from values looks like: # { { name: "deployment_a"} => 10, { name: "deployment_b "} => 0, ... } key[:name] end - unresponsive_agent_counts.each do |deployment, count| - @unresponsive_agents.set(count, labels: { name: deployment }) + deployment_counts.each do |deployment, count| + gauge.set(count, labels: { name: deployment }) end - removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys + removed_deployments = existing_deployment_names - deployment_counts.keys removed_deployments.each do |deployment| - @unresponsive_agents.set(0, labels: { name: deployment }) - end - - # Fetch and populate unhealthy_agents metrics - response_unhealthy = Net::HTTP.get_response("127.0.0.1", "/unhealthy_agents", @config.health_monitor_port) - return unless response_unhealthy.is_a?(Net::HTTPSuccess) - - unhealthy_agent_counts = JSON.parse(response_unhealthy.body) - return unless unhealthy_agent_counts.is_a?(Hash) - - existing_unhealthy_deployment_names = @unhealthy_agents.values.map do |key, _| - key[:name] - end - - unhealthy_agent_counts.each do |deployment, count| - @unhealthy_agents.set(count, labels: { name: deployment }) - end - - removed_unhealthy_deployments = existing_unhealthy_deployment_names - unhealthy_agent_counts.keys - removed_unhealthy_deployments.each do |deployment| - @unhealthy_agents.set(0, labels: { name: deployment }) - end - - # Fetch and populate total_available_agents metrics - response_total = Net::HTTP.get_response('127.0.0.1', '/total_available_agents', @config.health_monitor_port) - if response_total.is_a?(Net::HTTPSuccess) - total_agent_counts = JSON.parse(response_total.body) rescue nil - if total_agent_counts.is_a?(Hash) - existing_total_deployment_names = @total_available_agents.values.map { |key, _| key[:name] } - - total_agent_counts.each do |deployment, count| - @total_available_agents.set(count, labels: { name: deployment }) - end - - removed_total_deployments = existing_total_deployment_names - total_agent_counts.keys - removed_total_deployments.each do |deployment| - @total_available_agents.set(0, labels: { name: deployment }) - end - end - end - - # Fetch and populate failing_instances metrics - response_failing = Net::HTTP.get_response('127.0.0.1', '/failing_instances', @config.health_monitor_port) - if response_failing.is_a?(Net::HTTPSuccess) - failing_counts = JSON.parse(response_failing.body) rescue nil - if failing_counts.is_a?(Hash) - existing_failing_deployment_names = @failing_instances.values.map { |key, _| key[:name] } - - failing_counts.each do |deployment, count| - @failing_instances.set(count, labels: { name: deployment }) - end - - removed_failing_deployments = existing_failing_deployment_names - failing_counts.keys - removed_failing_deployments.each do |deployment| - @failing_instances.set(0, labels: { name: deployment }) - end - end - end - - # Fetch and populate stopped_instances metrics - response_stopped = Net::HTTP.get_response('127.0.0.1', '/stopped_instances', @config.health_monitor_port) - if response_stopped.is_a?(Net::HTTPSuccess) - stopped_counts = JSON.parse(response_stopped.body) rescue nil - if stopped_counts.is_a?(Hash) - existing_stopped_deployment_names = @stopped_instances.values.map { |key, _| key[:name] } - - stopped_counts.each do |deployment, count| - @stopped_instances.set(count, labels: { name: deployment }) - end - - removed_stopped_deployments = existing_stopped_deployment_names - stopped_counts.keys - removed_stopped_deployments.each do |deployment| - @stopped_instances.set(0, labels: { name: deployment }) - end - end - end - - # Fetch and populate unknown_instances metrics - response_unknown = Net::HTTP.get_response('127.0.0.1', '/unknown_instances', @config.health_monitor_port) - if response_unknown.is_a?(Net::HTTPSuccess) - unknown_counts = JSON.parse(response_unknown.body) rescue nil - if unknown_counts.is_a?(Hash) - existing_unknown_deployment_names = @unknown_instances.values.map { |key, _| key[:name] } - - unknown_counts.each do |deployment, count| - @unknown_instances.set(count, labels: { name: deployment }) - end - - removed_unknown_deployments = existing_unknown_deployment_names - unknown_counts.keys - removed_unknown_deployments.each do |deployment| - @unknown_instances.set(0, labels: { name: deployment }) - end - end + gauge.set(0, labels: { name: deployment }) end end diff --git a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb index 526898a2fd6..288d300dca0 100644 --- a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb +++ b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb @@ -37,14 +37,14 @@ def tick .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) stub_request(:get, /unhealthy_agents/) .to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0)) - stub_request(:get, /total_available_agents/) - .to_return(status: 200, body: JSON.dump("flaky_deployment" => 3, "good_deployment" => 2)) - stub_request(:get, /failing_instances/) - .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) - stub_request(:get, /stopped_instances/) - .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 0)) - stub_request(:get, /unknown_instances/) - .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 1)) + stub_request(:get, /total_available_agents/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 3, "good_deployment" => 2)) + stub_request(:get, /failing_instances/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0)) + stub_request(:get, /stopped_instances/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 0)) + stub_request(:get, /unknown_instances/) + .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 1)) end after do diff --git a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb index f88764f3ea1..e946b5a177b 100644 --- a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb +++ b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb @@ -41,7 +41,7 @@ def initialize(heartbeat_interval = 1) end end - get "/unhealthy_agents" do + get '/unhealthy_agents' do if @instance_manager.director_initial_deployment_sync_done JSON.generate(@instance_manager.unhealthy_agents) else diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb index 7774de7cab8..e876ad2929f 100644 --- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb +++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb @@ -134,7 +134,7 @@ def unhealthy_agents def failing_instances agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| - agents_hash[name] = deployment.agents.count { |agent| agent.job_state == 'failing' } + agents_hash[name] = deployment.agents.count { |agent| agent.job_state == "failing" } end agents_hash @@ -143,7 +143,7 @@ def failing_instances def stopped_instances agents_hash = {} @deployment_name_to_deployments.each do |name, deployment| - agents_hash[name] = deployment.agents.count { |agent| agent.job_state == 'stopped' } + agents_hash[name] = deployment.agents.count { |agent| agent.job_state == "stopped" } end agents_hash From d323d2d1e5e8e69f3345d749dd129848d159d0fc Mon Sep 17 00:00:00 2001 From: I766702 Date: Wed, 7 Jan 2026 15:49:23 +0100 Subject: [PATCH 14/14] Use specific JSON::ParserError rescue instead of catch-all --- .../lib/bosh/director/metrics_collector.rb | 7 ++++++- .../unit/bosh/director/metrics_collector_spec.rb | 13 ++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb index 63ff9c3182b..966fbf47cb6 100644 --- a/src/bosh-director/lib/bosh/director/metrics_collector.rb +++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb @@ -154,7 +154,12 @@ def fetch_and_update_gauge(endpoint, gauge) response = Net::HTTP.get_response('127.0.0.1', endpoint, @config.health_monitor_port) return unless response.is_a?(Net::HTTPSuccess) - deployment_counts = JSON.parse(response.body) rescue nil + begin + deployment_counts = JSON.parse(response.body) + rescue JSON::ParserError => e + @logger.warn("Failed to parse JSON response from #{endpoint}: #{e.message}") + return + end return unless deployment_counts.is_a?(Hash) existing_deployment_names = gauge.values.map do |key, _| diff --git a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb index 288d300dca0..7ff2c4aaabe 100644 --- a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb +++ b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb @@ -355,14 +355,21 @@ def tick end context 'when the health monitor returns a non-json response' do + let(:logger) { double(Logging::Logger) } + before do + allow(config).to receive(:metrics_server_logger).and_return(logger) + allow(logger).to receive(:info) stub_request(:get, "127.0.0.1:12345/unresponsive_agents") - .to_return(status: 200, body: JSON.dump("bad response")) + .to_return(status: 200, body: "not valid json {") stub_request(:get, "127.0.0.1:12345/unhealthy_agents") - .to_return(status: 200, body: JSON.dump("bad response")) + .to_return(status: 200, body: "not valid json {") end - it 'does not emit the vm metrics' do + it 'does not emit the vm metrics and logs a warning' do + expect(logger).to receive(:warn).with(/Failed to parse JSON response from \/unresponsive_agents/) + expect(logger).to receive(:warn).with(/Failed to parse JSON response from \/unhealthy_agents/) + metrics_collector.start metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents) expect(metric.values).to be_empty