diff --git a/src/bosh-director/lib/bosh/director/metrics_collector.rb b/src/bosh-director/lib/bosh/director/metrics_collector.rb
index 0c94c941639..966fbf47cb6 100644
--- a/src/bosh-director/lib/bosh/director/metrics_collector.rb
+++ b/src/bosh-director/lib/bosh/director/metrics_collector.rb
@@ -42,6 +42,33 @@ def initialize(config)
         labels: %i[name],
         docstring: 'Number of unresponsive agents per deployment',
       )
+
+      @unhealthy_agents = Prometheus::Client.registry.gauge(
+        :bosh_unhealthy_agents,
+        labels: %i[name],
+        docstring: 'Number of unhealthy agents (job_state == running AND number_of_processes == 0) per deployment',
+      )
+      @total_available_agents = Prometheus::Client.registry.gauge(
+        :bosh_total_available_agents,
+        labels: %i[name],
+        docstring: 'Number of total available agents (all agents, no criteria) per deployment',
+      )
+      @failing_instances = Prometheus::Client.registry.gauge(
+        :bosh_failing_instances,
+        labels: %i[name],
+        docstring: 'Number of failing instances (job_state == "failing") per deployment',
+      )
+      @stopped_instances = Prometheus::Client.registry.gauge(
+        :bosh_stopped_instances,
+        labels: %i[name],
+        docstring: 'Number of stopped instances (job_state == "stopped") per deployment',
+      )
+
+      @unknown_instances = Prometheus::Client.registry.gauge(
+        :bosh_unknown_instances,
+        labels: %i[name],
+        docstring: 'Number of instances with unknown job_state per deployment',
+      )
       @scheduler = Rufus::Scheduler.new
     end
 
@@ -115,26 +142,40 @@ def populate_metrics
     end
 
     def populate_vm_metrics
-      response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port)
+      fetch_and_update_gauge('/unresponsive_agents', @unresponsive_agents)
+      fetch_and_update_gauge('/unhealthy_agents', @unhealthy_agents)
+      fetch_and_update_gauge('/total_available_agents', @total_available_agents)
+      fetch_and_update_gauge('/failing_instances', @failing_instances)
+      fetch_and_update_gauge('/stopped_instances', @stopped_instances)
+      fetch_and_update_gauge('/unknown_instances', @unknown_instances)
+    end
+
+    def fetch_and_update_gauge(endpoint, gauge)
+      response = Net::HTTP.get_response('127.0.0.1', endpoint, @config.health_monitor_port)
       return unless response.is_a?(Net::HTTPSuccess)
 
-      unresponsive_agent_counts = JSON.parse(response.body)
-      return unless unresponsive_agent_counts.is_a?(Hash)
+      begin
+        deployment_counts = JSON.parse(response.body)
+      rescue JSON::ParserError => e
+        @logger.warn("Failed to parse JSON response from #{endpoint}: #{e.message}")
+        return
+      end
+      return unless deployment_counts.is_a?(Hash)
 
-      existing_deployment_names = @unresponsive_agents.values.map do |key, _|
+      existing_deployment_names = gauge.values.map do |key, _|
         # The keys within the Prometheus::Client::Metric#values method are actually hashes. So the
         # data returned from values looks like:
         # { { name: "deployment_a"} => 10, { name: "deployment_b "} => 0, ... }
         key[:name]
       end
 
-      unresponsive_agent_counts.each do |deployment, count|
-        @unresponsive_agents.set(count, labels: { name: deployment })
+      deployment_counts.each do |deployment, count|
+        gauge.set(count, labels: { name: deployment })
       end
 
-      removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys
+      removed_deployments = existing_deployment_names - deployment_counts.keys
 
       removed_deployments.each do |deployment|
-        @unresponsive_agents.set(0, labels: { name: deployment })
+        gauge.set(0, labels: { name: deployment })
       end
     end
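For reviewers, a rough, illustrative-only sketch of what the director ends up exposing once populate_vm_metrics has run. The deployment name and counts are invented, the exposition lines in the comment are abbreviated, and it assumes the stock prometheus-client text formatter that ships with the gem used above:

    require 'prometheus/client'
    require 'prometheus/client/formats/text'

    # Render the shared registry the way a Prometheus scrape would see it.
    # After a collection cycle the output contains lines roughly like:
    #
    #   bosh_unhealthy_agents{name="cf"} 2.0
    #   bosh_total_available_agents{name="cf"} 10.0
    #   bosh_failing_instances{name="cf"} 1.0
    #   bosh_stopped_instances{name="cf"} 0.0
    #   bosh_unknown_instances{name="cf"} 0.0
    puts Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)

Stale deployments are not unregistered; fetch_and_update_gauge only resets their series to 0, so a deleted deployment keeps reporting a zero value until the director restarts.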
diff --git a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb
index 35c8aaac1e0..7ff2c4aaabe 100644
--- a/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb
+++ b/src/bosh-director/spec/unit/bosh/director/metrics_collector_spec.rb
@@ -34,7 +34,17 @@ def tick
     allow(resurrector_manager).to receive(:pause_for_all?).and_return(false, true, false)
     allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
     stub_request(:get, /unresponsive_agents/)
-      .to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
+      .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
+    stub_request(:get, /unhealthy_agents/)
+      .to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0))
+    stub_request(:get, /total_available_agents/)
+      .to_return(status: 200, body: JSON.dump("flaky_deployment" => 3, "good_deployment" => 2))
+    stub_request(:get, /failing_instances/)
+      .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
+    stub_request(:get, /stopped_instances/)
+      .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 0))
+    stub_request(:get, /unknown_instances/)
+      .to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 1))
   end
 
   after do
@@ -44,6 +54,11 @@
     Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
     Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
     Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
+    Prometheus::Client.registry.unregister(:bosh_unhealthy_agents)
+    Prometheus::Client.registry.unregister(:bosh_total_available_agents)
+    Prometheus::Client.registry.unregister(:bosh_failing_instances)
+    Prometheus::Client.registry.unregister(:bosh_stopped_instances)
+    Prometheus::Client.registry.unregister(:bosh_unknown_instances)
   end
 
   describe '#prep' do
@@ -293,9 +308,40 @@ def tick
        expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
      end
 
-      context 'when the health monitor returns a non 200 response' do
+      it "emits the number of unhealthy agents for each deployment" do
+        metrics_collector.start
+        metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
+        expect(metric.get(labels: { name: "flaky_deployment" })).to eq(2)
+        expect(metric.get(labels: { name: "good_deployment" })).to eq(0)
+      end
+
+      it 'emits the number of total available agents for each deployment' do
+        metrics_collector.start
+        metric = Prometheus::Client.registry.get(:bosh_total_available_agents)
+        expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(3)
+        expect(metric.get(labels: { name: 'good_deployment' })).to eq(2)
+      end
+
+      it 'emits the failing/stopped/unknown instance metrics for each deployment' do
+        metrics_collector.start
+        metric = Prometheus::Client.registry.get(:bosh_failing_instances)
+        expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
+        expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
+
+        metric = Prometheus::Client.registry.get(:bosh_stopped_instances)
+        expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
+        expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
+
+        metric = Prometheus::Client.registry.get(:bosh_unknown_instances)
+        expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
+        expect(metric.get(labels: { name: 'good_deployment' })).to eq(1)
+      end
+
+      context "when the health monitor returns a non 200 response" do
        before do
-          stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
+          stub_request(:get, "127.0.0.1:12345/unresponsive_agents")
+            .to_return(status: 404)
+          stub_request(:get, "127.0.0.1:12345/unhealthy_agents")
            .to_return(status: 404)
        end
 
@@ -303,36 +349,55 @@ def tick
          metrics_collector.start
          metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
          expect(metric.values).to be_empty
+          metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
+          expect(metric.values).to be_empty
        end
      end
 
      context 'when the health monitor returns a non-json response' do
+        let(:logger) { double(Logging::Logger) }
+
        before do
-          stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
-            .to_return(status: 200, body: JSON.dump('bad response'))
+          allow(config).to receive(:metrics_server_logger).and_return(logger)
+          allow(logger).to receive(:info)
+          stub_request(:get, "127.0.0.1:12345/unresponsive_agents")
+            .to_return(status: 200, body: "not valid json {")
+          stub_request(:get, "127.0.0.1:12345/unhealthy_agents")
+            .to_return(status: 200, body: "not valid json {")
        end
 
-        it 'does not emit the vm metrics' do
+        it 'does not emit the vm metrics and logs a warning' do
+          expect(logger).to receive(:warn).with(/Failed to parse JSON response from \/unresponsive_agents/)
+          expect(logger).to receive(:warn).with(/Failed to parse JSON response from \/unhealthy_agents/)
+
          metrics_collector.start
          metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
          expect(metric.values).to be_empty
+          metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
+          expect(metric.values).to be_empty
        end
      end
 
      context 'when a deployment is deleted after metrics are gathered' do
        before do
          stub_request(:get, /unresponsive_agents/)
-            .to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
+            .to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
+          stub_request(:get, /unhealthy_agents/)
+            .to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0))
 
          metrics_collector.start
 
          stub_request(:get, /unresponsive_agents/)
-            .to_return(status: 200, body: JSON.dump('good_deployment' => 0))
+            .to_return(status: 200, body: JSON.dump("good_deployment" => 0))
+          stub_request(:get, /unhealthy_agents/)
+            .to_return(status: 200, body: JSON.dump("good_deployment" => 0))
 
          scheduler.tick
        end
 
        it 'resets the metrics for the deleted deployment' do
          metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
-          expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
+          expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0)
+          metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
+          expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0)
        end
      end
    end
diff --git a/src/bosh-monitor/lib/bosh/monitor/agent.rb b/src/bosh-monitor/lib/bosh/monitor/agent.rb
index 8ba8b282538..ec6dd773e31 100644
--- a/src/bosh-monitor/lib/bosh/monitor/agent.rb
+++ b/src/bosh-monitor/lib/bosh/monitor/agent.rb
@@ -3,6 +3,8 @@ class Agent
     attr_reader :id
     attr_reader :discovered_at
     attr_accessor :updated_at
+    attr_accessor :job_state
+    attr_accessor :number_of_processes
 
     ATTRIBUTES = %i[deployment job index instance_id cid].freeze
 
diff --git a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb
index 21a27385a07..e946b5a177b 100644
--- a/src/bosh-monitor/lib/bosh/monitor/api_controller.rb
+++ b/src/bosh-monitor/lib/bosh/monitor/api_controller.rb
@@ -40,5 +40,45 @@ def initialize(heartbeat_interval = 1)
        status(503)
      end
    end
+
+    get '/unhealthy_agents' do
+      if @instance_manager.director_initial_deployment_sync_done
+        JSON.generate(@instance_manager.unhealthy_agents)
+      else
+        status(503)
+      end
+    end
+
+    get '/total_available_agents' do
+      if @instance_manager.director_initial_deployment_sync_done
+        JSON.generate(@instance_manager.total_available_agents)
+      else
+        status(503)
+      end
+    end
+
+    get '/failing_instances' do
+      if @instance_manager.director_initial_deployment_sync_done
+        JSON.generate(@instance_manager.failing_instances)
+      else
+        status(503)
+      end
+    end
+
+    get '/stopped_instances' do
+      if @instance_manager.director_initial_deployment_sync_done
+        JSON.generate(@instance_manager.stopped_instances)
+      else
+        status(503)
+      end
+    end
+
+    get '/unknown_instances' do
+      if @instance_manager.director_initial_deployment_sync_done
+        JSON.generate(@instance_manager.unknown_instances)
+      else
+        status(503)
+      end
+    end
  end
 end
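The new routes mirror the existing /unresponsive_agents endpoint: each returns a plain JSON hash of deployment name to count, and 503 until the initial deployment sync with the director has finished. A minimal sketch of how a client (the director's fetch_and_update_gauge above, or a curious operator) would read one of them; the port value is an assumption for illustration, in the real setup it comes from configuration (@config.health_monitor_port on the director side):

    require 'json'
    require 'net/http'

    hm_port = 25_923 # assumed health monitor API port; taken from config in practice
    response = Net::HTTP.get_response('127.0.0.1', '/unhealthy_agents', hm_port)

    if response.is_a?(Net::HTTPSuccess)
      counts = JSON.parse(response.body) # e.g. {"cf"=>2, "concourse"=>0}
      counts.each { |deployment, count| puts "#{deployment}: #{count} unhealthy agent(s)" }
    else
      # 503 until director_initial_deployment_sync_done is true
      puts "health monitor not ready yet (HTTP #{response.code})"
    end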
diff --git a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb
index bfa5a46032e..6a298f746b9 100644
--- a/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb
+++ b/src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb
@@ -65,7 +65,7 @@ def to_s
    end
 
    def to_hash
-      {
+      result = {
        kind: 'heartbeat',
        id: @id,
        timestamp: @timestamp.to_i,
@@ -79,6 +79,10 @@ def to_hash
        teams: @teams,
        metrics: @metrics.map(&:to_hash),
      }
+      # Include number_of_processes only if the heartbeat reported it
+      result[:number_of_processes] = @attributes["number_of_processes"] if @attributes.key?("number_of_processes")
+
+      result
    end
 
    def to_json(*_args)
diff --git a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb
index 2c21f86e974..e876ad2929f 100644
--- a/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb
+++ b/src/bosh-monitor/lib/bosh/monitor/instance_manager.rb
@@ -121,6 +121,57 @@ def unresponsive_agents
      agents_hash
    end
 
+    def unhealthy_agents
+      agents_hash = {}
+      @deployment_name_to_deployments.each do |name, deployment|
+        agents_hash[name] = deployment.agents.count do |agent|
+          agent.job_state == "running" && agent.number_of_processes == 0
+        end
+      end
+
+      agents_hash
+    end
+
+    def failing_instances
+      agents_hash = {}
+      @deployment_name_to_deployments.each do |name, deployment|
+        agents_hash[name] = deployment.agents.count { |agent| agent.job_state == "failing" }
+      end
+
+      agents_hash
+    end
+
+    def stopped_instances
+      agents_hash = {}
+      @deployment_name_to_deployments.each do |name, deployment|
+        agents_hash[name] = deployment.agents.count { |agent| agent.job_state == "stopped" }
+      end
+
+      agents_hash
+    end
+
+    def unknown_instances
+      agents_hash = {}
+      @deployment_name_to_deployments.each do |name, deployment|
+        agents_hash[name] = deployment.agents.count { |agent| agent.job_state.nil? }
+      end
+
+      agents_hash
+    end
+
+    def total_available_agents
+      agents_hash = {}
+      @deployment_name_to_deployments.each do |name, deployment|
+        # Count all agents for the deployment (no additional criteria)
+        agents_hash[name] = deployment.agents.count
+      end
+
+      # Include unmanaged (rogue) agents in the aggregate under 'unmanaged'
+      agents_hash['unmanaged'] = @unmanaged_agents.keys.size
+
+      agents_hash
+    end
+
    def analyze_agents
      @logger.info('Analyzing agents...')
      started = Time.now
@@ -325,7 +375,11 @@ def on_heartbeat(agent, deployment, message)
        message['instance_id'] = agent.instance_id
        message['teams'] = deployment ? deployment.teams : []
 
-        return if message['instance_id'].nil? || message['job'].nil? || message['deployment'].nil?
+        # Store job_state and number_of_processes on the agent for unhealthy detection
+        agent.job_state = message["job_state"]
+        agent.number_of_processes = message["number_of_processes"]
+
+        return if message["instance_id"].nil? || message["job"].nil? || message["deployment"].nil?
      end
 
      @processor.process(:heartbeat, message)
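To make the classification above concrete, here is a small self-contained sketch (a plain Struct standing in for Bosh::Monitor::Agent, with made-up heartbeat values) of how the two fields stored in on_heartbeat feed the new per-deployment counters:

    # Stand-in for Bosh::Monitor::Agent; only the two new attributes matter here.
    FakeAgent = Struct.new(:job_state, :number_of_processes)

    heartbeat = {
      'job_state' => 'running',   # state reported by the BOSH agent
      'number_of_processes' => 0, # process count reported in the heartbeat
    }

    agent = FakeAgent.new
    agent.job_state = heartbeat['job_state']
    agent.number_of_processes = heartbeat['number_of_processes']

    unhealthy = agent.job_state == 'running' && agent.number_of_processes == 0
    failing   = agent.job_state == 'failing'
    stopped   = agent.job_state == 'stopped'
    unknown   = agent.job_state.nil?

    puts unhealthy # => true: "running" yet reporting zero processes

An agent that never reports these fields keeps job_state nil, so it lands in unknown_instances rather than in any of the other buckets.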
diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb
index c6b2b59c4fd..baf3cbc4950 100644
--- a/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb
+++ b/src/bosh-monitor/spec/unit/bosh/monitor/agent_spec.rb
@@ -66,6 +66,17 @@ def make_agent(id)
        expect(agent.cid).to eq(instance.cid)
        expect(agent.instance_id).to eq(instance.id)
      end
+
+      it "does not modify job_state or number_of_processes when updating instance" do
+        agent = make_agent("agent_with_instance")
+        agent.job_state = "running"
+        agent.number_of_processes = 3
+
+        agent.update_instance(instance)
+
+        expect(agent.job_state).to eq("running")
+        expect(agent.number_of_processes).to eq(3)
+      end
    end
  end
 end
diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb
index 11f565dc365..da700a2115a 100644
--- a/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb
+++ b/src/bosh-monitor/spec/unit/bosh/monitor/api_controller_spec.rb
@@ -82,7 +82,160 @@ def app
        get '/unresponsive_agents'
        expect(last_response.status).to eq(503)
      end
+    end
+  end
+
+  describe "/unhealthy_agents" do
+    let(:unhealthy_agents) do
+      {
+        "first_deployment" => 3,
+        "second_deployment" => 1,
+      }
+    end
+    before do
+      allow(Bosh::Monitor.instance_manager).to receive(:unhealthy_agents).and_return(unhealthy_agents)
+      allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true)
+    end
+
+    it "renders the unhealthy agents" do
+      get "/unhealthy_agents"
+      expect(last_response.status).to eq(200)
+      expect(last_response.body).to eq(JSON.generate(unhealthy_agents))
+    end
+
+    context "When director initial deployment sync has not completed" do
+      before do
+        allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false)
+      end
+
+      it "returns 503 when /unhealthy_agents is requested" do
+        get "/unhealthy_agents"
+        expect(last_response.status).to eq(503)
+      end
+    end
+  end
+
+  describe "/total_available_agents" do
+    let(:available_agents) do
+      {
+        "first_deployment" => 5,
+        "second_deployment" => 2,
+      }
+    end
+
+    before do
+      allow(Bosh::Monitor.instance_manager).to receive(:total_available_agents).and_return(available_agents)
+      allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true)
+    end
+
+    it "renders the total available agents (all agents, no criteria)" do
+      get "/total_available_agents"
+      expect(last_response.status).to eq(200)
+      expect(last_response.body).to eq(JSON.generate(available_agents))
+    end
+
+    context "When director initial deployment sync has not completed" do
+      before do
+        allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false)
+      end
+
+      it "returns 503 when /total_available_agents is requested" do
+        get "/total_available_agents"
+        expect(last_response.status).to eq(503)
+      end
+    end
+  end
+
+  describe "/failing_instances" do
+    let(:failing_instances) do
+      {
+        "first_deployment" => 2,
+        "second_deployment" => 0,
+      }
+    end
+
+    before do
+      allow(Bosh::Monitor.instance_manager).to receive(:failing_instances).and_return(failing_instances)
+      allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true)
+    end
+    it "renders failing instances" do
+      get "/failing_instances"
+      expect(last_response.status).to eq(200)
+      expect(last_response.body).to eq(JSON.generate(failing_instances))
+    end
+
+    context "When director initial deployment sync has not completed" do
+      before do
+        allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false)
+      end
+
+      it "returns 503 when /failing_instances is requested" do
+        get "/failing_instances"
+        expect(last_response.status).to eq(503)
+      end
+    end
+  end
+
+  describe "/stopped_instances" do
+    let(:stopped_instances) do
+      {
+        "first_deployment" => 1,
+        "second_deployment" => 0,
+      }
+    end
+
+    before do
+      allow(Bosh::Monitor.instance_manager).to receive(:stopped_instances).and_return(stopped_instances)
+      allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true)
+    end
+
+    it "renders stopped instances" do
+      get "/stopped_instances"
+      expect(last_response.status).to eq(200)
+      expect(last_response.body).to eq(JSON.generate(stopped_instances))
+    end
+
+    context "When director initial deployment sync has not completed" do
+      before do
+        allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false)
+      end
+
+      it "returns 503 when /stopped_instances is requested" do
+        get "/stopped_instances"
+        expect(last_response.status).to eq(503)
+      end
+    end
+  end
+
+  describe "/unknown_instances" do
+    let(:unknown_instances) do
+      {
+        "first_deployment" => 0,
+        "second_deployment" => 1,
+      }
+    end
+
+    before do
+      allow(Bosh::Monitor.instance_manager).to receive(:unknown_instances).and_return(unknown_instances)
+      allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true)
+    end
+
+    it "renders unknown instances" do
+      get "/unknown_instances"
+      expect(last_response.status).to eq(200)
+      expect(last_response.body).to eq(JSON.generate(unknown_instances))
+    end
+
+    context "When director initial deployment sync has not completed" do
+      before do
+        allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false)
+      end
+
+      it "returns 503 when /unknown_instances is requested" do
+        get "/unknown_instances"
+        expect(last_response.status).to eq(503)
+      end
    end
  end
 end
"renders the total available agents (all agents, no criteria)" do + get "/total_available_agents" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(available_agents)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /total_available_agents is requested" do + get "/total_available_agents" + expect(last_response.status).to eq(503) + end + end + end + + describe "/failing_instances" do + let(:failing_instances) do + { + "first_deployment" => 2, + "second_deployment" => 0, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:failing_instances).and_return(failing_instances) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + it "renders failing instances" do + get "/failing_instances" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(failing_instances)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /failing_instances is requested" do + get "/failing_instances" + expect(last_response.status).to eq(503) + end + end + end + + describe "/stopped_instances" do + let(:stopped_instances) do + { + "first_deployment" => 1, + "second_deployment" => 0, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:stopped_instances).and_return(stopped_instances) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + + it "renders stopped instances" do + get "/stopped_instances" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(stopped_instances)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /stopped_instances is requested" do + get "/stopped_instances" + expect(last_response.status).to eq(503) + end + end + end + + describe "/unknown_instances" do + let(:unknown_instances) do + { + "first_deployment" => 0, + "second_deployment" => 1, + } + end + + before do + allow(Bosh::Monitor.instance_manager).to receive(:unknown_instances).and_return(unknown_instances) + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(true) + end + + it "renders unknown instances" do + get "/unknown_instances" + expect(last_response.status).to eq(200) + expect(last_response.body).to eq(JSON.generate(unknown_instances)) + end + + context "When director initial deployment sync has not completed" do + before do + allow(Bosh::Monitor.instance_manager).to receive(:director_initial_deployment_sync_done).and_return(false) + end + + it "returns 503 when /unknown_instances is requested" do + get "/unknown_instances" + expect(last_response.status).to eq(503) + end end end end diff --git a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb b/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb index 5f1e631d92b..f004e6b2651 100644 --- a/src/bosh-monitor/spec/unit/bosh/monitor/instance_manager_spec.rb +++ 
@@ -273,7 +273,122 @@ module Bosh::Monitor
        expect(manager.unresponsive_agents).to eq('mycloud' => 0)
 
        ts = Time.now
        allow(Time).to receive(:now).and_return(ts + Bosh::Monitor.intervals.agent_timeout + 10)
-        expect(manager.unresponsive_agents).to eq('mycloud' => 3)
+        expect(manager.unresponsive_agents).to eq("mycloud" => 3)
+      end
+    end
+
+    describe "#unhealthy_agents" do
+      it "can return number of unhealthy agents (job_state == 'running' AND number_of_processes == 0) for each deployment" do
+        instance1 = Bosh::Monitor::Instance.create("id" => "iuuid1", "agent_id" => "007", "index" => "0", "job" => "mutator")
+        instance2 = Bosh::Monitor::Instance.create("id" => "iuuid2", "agent_id" => "008", "index" => "0", "job" => "nats")
+        instance3 = Bosh::Monitor::Instance.create("id" => "iuuid3", "agent_id" => "009", "index" => "28", "job" => "mysql_node")
+
+        manager.sync_deployments([{ "name" => "mycloud" }])
+        manager.sync_agents("mycloud", [instance1, instance2, instance3])
+
+        # Initially all agents are healthy
+        expect(manager.unhealthy_agents).to eq("mycloud" => 0)
+
+        # Set agent job_state == 'running' and number_of_processes == 0 (unhealthy)
+        agent1 = manager.get_agents_for_deployment("mycloud")["007"]
+        agent1.job_state = "running"
+        agent1.number_of_processes = 0
+        expect(manager.unhealthy_agents).to eq("mycloud" => 1)
+
+        # Set another agent to same state
+        agent2 = manager.get_agents_for_deployment("mycloud")["008"]
+        agent2.job_state = "running"
+        agent2.number_of_processes = 0
+        expect(manager.unhealthy_agents).to eq("mycloud" => 2)
+
+        # Agent with job_state != 'running' should not count as unhealthy
+        agent3 = manager.get_agents_for_deployment("mycloud")["009"]
+        agent3.job_state = "stopped"
+        agent3.number_of_processes = 0
+        expect(manager.unhealthy_agents).to eq("mycloud" => 2)
+
+        # Agent with number_of_processes > 0 should not count as unhealthy even if job_state == 'running'
+        agent1.number_of_processes = 5
+        expect(manager.unhealthy_agents).to eq("mycloud" => 1)
+      end
+    end
+
+    describe '#total_available_agents' do
+      it 'counts all agents for each deployment and includes unmanaged agents' do
+        instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator')
+        instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats')
+
+        manager.sync_deployments([{ 'name' => 'mycloud' }])
+        manager.sync_agents('mycloud', [instance1, instance2])
+
+        # Initially both agents are present
+        expect(manager.total_available_agents).to include('mycloud' => 2)
+
+        # Add an unmanaged (rogue) agent via heartbeat processing
+        manager.process_event(:heartbeat, 'hm.agent.heartbeat.unmanaged-1')
+        expect(manager.total_available_agents['unmanaged']).to eq(1)
+
+        # Simulate timed out agents -- they should still be counted as part of the deployment total
+        ts = Time.now
+        allow(Time).to receive(:now).and_return(ts + Bosh::Monitor.intervals.agent_timeout + 100)
+        expect(manager.unresponsive_agents['mycloud']).to eq(2)
+        expect(manager.total_available_agents).to include('mycloud' => 2)
+      end
+    end
+
+    describe '#failing_instances' do
+      it 'counts agents with job_state == failing for each deployment' do
+        instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator')
+        instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats')
+
+        manager.sync_deployments([{ 'name' => 'mycloud' }])
+        manager.sync_agents('mycloud', [instance1, instance2])
+
+        # Initially none are failing
+        expect(manager.failing_instances).to eq('mycloud' => 0)
+
+        agent1 = manager.get_agents_for_deployment('mycloud')['007']
+        agent1.job_state = 'failing'
+
+        expect(manager.failing_instances).to eq('mycloud' => 1)
+      end
+    end
+
+    describe '#stopped_instances' do
+      it 'counts agents with job_state == stopped for each deployment' do
+        instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator')
+        instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats')
+
+        manager.sync_deployments([{ 'name' => 'mycloud' }])
+        manager.sync_agents('mycloud', [instance1, instance2])
+
+        # Initially none are stopped
+        expect(manager.stopped_instances).to eq('mycloud' => 0)
+
+        agent2 = manager.get_agents_for_deployment('mycloud')['008']
+        agent2.job_state = 'stopped'
+
+        expect(manager.stopped_instances).to eq('mycloud' => 1)
+      end
+    end
+
+    describe '#unknown_instances' do
+      it 'counts agents with unknown (nil) job_state for each deployment' do
+        instance1 = Bosh::Monitor::Instance.create('id' => 'iuuid1', 'agent_id' => '007', 'index' => '0', 'job' => 'mutator')
+        instance2 = Bosh::Monitor::Instance.create('id' => 'iuuid2', 'agent_id' => '008', 'index' => '0', 'job' => 'nats')
+
+        manager.sync_deployments([{ 'name' => 'mycloud' }])
+        manager.sync_agents('mycloud', [instance1, instance2])
+
+        # Ensure both have a defined state first
+        manager.get_agents_for_deployment('mycloud').each_value { |a| a.job_state = 'running' }
+        expect(manager.unknown_instances).to eq('mycloud' => 0)
+
+        # Set one to unknown
+        agent1 = manager.get_agents_for_deployment('mycloud')['007']
+        agent1.job_state = nil
+
+        expect(manager.unknown_instances).to eq('mycloud' => 1)
      end
    end