Skip to content

Commit ac49825

Browse files
authored
Health check responder (#2639)
* Revert "Revert "Health check responder (#2636)" (#2638)" This reverts commit ae4867a. * Revert "Include the health check reponser into heartbeat" This reverts commit 3834b08. * Include the health check reponser into run_heartbeat * Allow outside network connections to get in. * Run responder server in its own thread * fix http import * map docker port to host port * import with full path
1 parent 9fb41a9 commit ac49825

File tree

7 files changed

+156
-0
lines changed

7 files changed

+156
-0
lines changed

configs/test/gce/linux-init-ml-with-gpu.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ bootcmd:
3131
- echo never > /sys/kernel/mm/transparent_hugepage/defrag
3232
- echo core > /proc/sys/kernel/core_pattern # for AFL
3333
- swapon -a
34+
- iptables -w -A INPUT -p tcp --dport 7123 -j ACCEPT # health check port
3435

3536
# Note that NVIDIA_DRIVER_VERSION can be used with a particular CUDA version
3637
# only, e.g. CUDA 9.0 needs drivers version 384.130. CUDA and TensorFlow are

configs/test/gce/linux-init.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ bootcmd:
3131
- echo never > /sys/kernel/mm/transparent_hugepage/defrag
3232
- echo core > /proc/sys/kernel/core_pattern # for AFL
3333
- swapon -a
34+
- iptables -w -A INPUT -p tcp --dport 7123 -j ACCEPT # health check port
3435

3536
write_files:
3637
- path: /etc/systemd/system/clusterfuzz.service

local/run_docker.bash

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,5 @@ sudo docker run -e COMMAND_OVERRIDE="$COMMAND_OVERRIDE" -e SETUP_NFS= -e HOST_UI
4646
-e LOCAL_SRC=$LOCAL_SRC \
4747
-e CONFIG_DIR_OVERRIDE=$CONFIG_DIR_OVERRIDE \
4848
--hostname test-bot-$USER \
49+
-p 7123:7123 \
4950
-ti --privileged --cap-add=all $IMAGE "$@"

src/clusterfuzz/_internal/system/process_handler.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,3 +649,16 @@ def terminate_processes_matching_cmd_line(match_strings,
649649
if any(x in process_path for x in match_strings):
650650
if not any([x in process_path for x in exclude_strings]):
651651
terminate_process(process_info['pid'], kill)
652+
653+
654+
def scripts_are_running(expected_scripts):
  """Check if all target scripts are running as expected.

  A script counts as running when its name equals the basename of any
  command line argument of any process returned by psutil.process_iter().

  Args:
    expected_scripts: Iterable of script file names (e.g. 'run.py') that
      must all be found. Duplicate names are treated as a single entry.

  Returns:
    True if every expected script was found, or if no scripts are expected;
    False otherwise.
  """
  # Work on a private set: the caller's collection is never mutated, and we
  # never remove items from a container while iterating over it.
  scripts_left = set(expected_scripts)
  if not scripts_left:
    # Nothing is expected, so nothing can be missing.
    return True

  for process in psutil.process_iter():
    try:
      cmdline = process.cmdline()
    except (psutil.Error, OSError):
      # The process exited, is a zombie, or is not accessible to us. It
      # cannot be matched, but it must not abort the whole health check.
      continue

    scripts_left.difference_update(
        os.path.basename(argument) for argument in cmdline)
    if not scripts_left:
      return True

  return False
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Health check responder tests."""
15+
16+
from http.server import HTTPServer
17+
import threading
18+
import unittest
19+
20+
import mock
21+
import requests
22+
23+
from python.bot.startup.health_check_responder import EXPECTED_SCRIPTS
24+
from python.bot.startup.health_check_responder import RequestHandler
25+
from python.bot.startup.health_check_responder import RESPONDER_IP
26+
from python.bot.startup.health_check_responder import RESPONDER_PORT
27+
28+
RESPONDER_ADDR = f'http://{RESPONDER_IP}:{RESPONDER_PORT}'
29+
30+
31+
class HealthCheckResponderTest(unittest.TestCase):
  """Test health check responder.

  Every test starts an HTTPServer using RequestHandler on
  (RESPONDER_IP, RESPONDER_PORT), patches the psutil module used by
  process_handler, and checks the status code returned for a GET request.
  """

  # Upper bound (seconds) for a single request, so that a wedged server fails
  # the test instead of hanging the whole test run.
  REQUEST_TIMEOUT = 10

  def setUp(self):
    """Prepare mock processes and start the responder server thread."""
    self.mock_run_process = mock.MagicMock()
    self.mock_run_process.cmdline.return_value = ['./' + EXPECTED_SCRIPTS[0]]
    self.mock_run_bot_process = mock.MagicMock()
    self.mock_run_bot_process.cmdline.return_value = [
        './' + EXPECTED_SCRIPTS[1]
    ]

    self.health_check_responder_server = HTTPServer(
        (RESPONDER_IP, RESPONDER_PORT), RequestHandler)
    # Keep a handle on the thread so tearDown can wait for it. It is a daemon
    # thread so that a failure before tearDown cannot keep the test runner
    # alive.
    self.server_thread = threading.Thread(
        target=self.health_check_responder_server.serve_forever, daemon=True)
    self.server_thread.start()

  def tearDown(self):
    """Stop the responder server and wait for its thread to finish."""
    self.health_check_responder_server.shutdown()
    self.health_check_responder_server.server_close()
    self.server_thread.join()

  def _get_status_code(self):
    """Send a GET request to the responder and return its status code."""
    response = requests.get(RESPONDER_ADDR, timeout=self.REQUEST_TIMEOUT)
    return response.status_code

  @mock.patch(
      'python.bot.startup.health_check_responder.process_handler.psutil')
  def test_healthy(self, mock_psutil):
    """Testcase for both scripts are running."""
    mock_psutil.process_iter.return_value = [
        self.mock_run_process, self.mock_run_bot_process
    ]

    self.assertEqual(200, self._get_status_code())

  @mock.patch(
      'python.bot.startup.health_check_responder.process_handler.psutil')
  def test_run_terminated(self, mock_psutil):
    """Testcase for only the run script is running."""
    mock_psutil.process_iter.return_value = [self.mock_run_process]

    self.assertEqual(500, self._get_status_code())

  @mock.patch(
      'python.bot.startup.health_check_responder.process_handler.psutil')
  def test_run_bot_terminated(self, mock_psutil):
    """Testcase for only the run_bot script is running."""
    mock_psutil.process_iter.return_value = [self.mock_run_bot_process]

    self.assertEqual(500, self._get_status_code())

  @mock.patch(
      'python.bot.startup.health_check_responder.process_handler.psutil')
  def test_both_terminated(self, mock_psutil):
    """Testcase for neither script is running."""
    mock_psutil.process_iter.return_value = []

    self.assertEqual(500, self._get_status_code())
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Health check responder that checks if all scripts are running as expected
15+
and responds to health checks."""
16+
17+
from http import HTTPStatus
18+
from http.server import BaseHTTPRequestHandler
19+
from http.server import HTTPServer
20+
import threading
21+
22+
from clusterfuzz._internal.system import process_handler
23+
24+
RESPONDER_IP = '0.0.0.0'
25+
RESPONDER_PORT = 7123
26+
EXPECTED_SCRIPTS = ['run.py', 'run_bot.py']
27+
28+
29+
class RequestHandler(BaseHTTPRequestHandler):
  """Handler for GET requests from the health checker."""

  def do_GET(self):  # pylint: disable=invalid-name
    """Reply OK when all expected scripts are running, else a server error.

    The response consists of the status line and headers only.
    """
    # Note: run_bot.py is expected to go down during source updates
    # (which can take a few minutes).
    # Health checks should be resilient to this
    # and set a threshold / check interval to account for this.
    all_running = process_handler.scripts_are_running(EXPECTED_SCRIPTS)
    status = (
        HTTPStatus.OK if all_running else HTTPStatus.INTERNAL_SERVER_ERROR)
    self.send_response(status)
    self.end_headers()
44+
45+
46+
def run_server():
  """Start the health check HTTP server on its own thread.

  Binds an HTTPServer using RequestHandler to (RESPONDER_IP, RESPONDER_PORT)
  and runs its serve_forever loop in a newly started thread, so this call
  does not block on request handling.
  """
  address = (RESPONDER_IP, RESPONDER_PORT)
  server = HTTPServer(address, RequestHandler)
  threading.Thread(target=server.serve_forever).start()

src/python/bot/startup/run_heartbeat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
from clusterfuzz._internal.metrics import logs
3030
from clusterfuzz._internal.system import environment
3131
from clusterfuzz._internal.system import shell
32+
from python.bot.startup.health_check_responder import \
33+
run_server as run_health_responser_server
3234

3335
BEAT_SCRIPT = 'heartbeat.py'
3436

@@ -51,6 +53,8 @@ def main():
5153
beat_interpreter = shell.get_interpreter(beat_script_path)
5254
assert beat_interpreter
5355

56+
run_health_responser_server()
57+
5458
while True:
5559
beat_command = [
5660
beat_interpreter, beat_script_path,

0 commit comments

Comments
 (0)