async def get_gpu_data(self):
    """Fetch the Redfish Processors collection used to discover GPUs.

    Returns the parsed JSON payload of ``<system_resource>/Processors``.

    Raises:
        BadfishException: when the endpoint returns 404 or the body
            cannot be decoded/parsed as JSON.
    """
    _url = "%s%s/Processors" % (self.host_uri, self.system_resource)
    _response = await self.get_request(_url)

    if _response.status == 404:
        raise BadfishException("GPU endpoint not available on host.")

    try:
        raw = await _response.text("utf-8", "ignore")
        data = json.loads(raw.strip())
    except (ValueError, AttributeError):
        raise BadfishException("There was something wrong getting GPU data")
    return data


async def get_gpu_responses(self, data):
    """Return the parsed payload of every GPU-like processor member.

    Only members whose ``@odata.id`` contains ``"Video"`` or
    ``"ProcAccelerator"`` are treated as GPUs (Redfish exposes GPUs
    alongside CPUs in the same Processors collection).

    Raises:
        BadfishException: when a member payload cannot be fetched/parsed.
    """
    gpu_responses = []
    gpu_endpoints = []
    try:
        if data.get("Members"):
            for member in data["Members"]:
                if "Video" in member["@odata.id"] or "ProcAccelerator" in member["@odata.id"]:
                    gpu_endpoints.append(member["@odata.id"])

        for gpu in gpu_endpoints:
            gpu_url = "%s%s" % (self.host_uri, gpu)
            gpu_response = await self.get_request(gpu_url)
            gpu_raw = await gpu_response.text("utf-8", "ignore")
            gpu_data = json.loads(gpu_raw.strip())
            gpu_responses.append(gpu_data)

    # KeyError added: a member without "@odata.id" previously escaped as a
    # raw KeyError instead of the wrapped BadfishException.
    except (ValueError, AttributeError, KeyError):  # pragma: no cover
        raise BadfishException("There was something wrong getting host GPU details")

    return gpu_responses


async def get_gpu_summary(self, gpu_responses):
    """Return a mapping of GPU model name -> count of installed units.

    Raises:
        BadfishException: when a GPU payload has no "Model" key or is
            otherwise malformed.
    """
    gpu_summary = {}
    try:
        for gpu_data in gpu_responses:
            gpu_model = gpu_data["Model"]
            # Idiomatic counting; replaces the verbose
            # `if not gpu_summary.get(model): ... else: ...` pattern.
            gpu_summary[gpu_model] = gpu_summary.get(gpu_model, 0) + 1

    except (ValueError, AttributeError, KeyError):
        raise BadfishException("There was something wrong getting GPU summary values.")
    return gpu_summary


async def get_gpu_details(self, gpu_responses):
    """Return ``{gpu_id: {field: value}}`` for each GPU payload.

    Only "Model", "Manufacturer" and "ProcessorType" are reported, and
    only when present and non-empty in the payload.
    """
    gpu_details = {}
    try:
        for gpu_data in gpu_responses:
            gpu_name = gpu_data.get("Id")
            values = {}
            for field in ("Model", "Manufacturer", "ProcessorType"):
                value = gpu_data.get(field)
                if value:  # skip absent/empty fields
                    values[field] = value

            gpu_details[gpu_name] = values

    except (ValueError, AttributeError):  # pragma: no cover
        raise BadfishException("There was something wrong getting host GPU details values.")

    return gpu_details


async def list_gpu(self):
    """Log a per-model GPU summary followed by per-GPU details.

    Returns:
        bool: always True on success (exceptions propagate otherwise).
    """
    data = await self.get_gpu_data()
    gpu_responses = await self.get_gpu_responses(data)

    summary = await self.get_gpu_summary(gpu_responses)

    self.logger.info("GPU Summary:")
    for _key, _value in summary.items():
        self.logger.info(f" Model: {_key} (Count: {_value})")

    self.logger.info("Current GPU's on host:")

    gpu_data = await self.get_gpu_details(gpu_responses)

    for _gpu, _properties in gpu_data.items():
        self.logger.info(f" {_gpu}:")
        for _key, _value in _properties.items():
            self.logger.info(f" {_key}: {_value}")

    return True
# --- GPU fixtures for tests/test_ls_gpu.py ---

# Valid summary payload.
# FIX: original string was not valid JSON ('{"GPUSummary":"AMD Instinct MI300X": 2,}').
GPU_SUMMARY_RESP = '{"GPUSummary": {"AMD Instinct MI300X": 2}}'
GPU_SUMMARY_RESP_FAULTY = '{"GPUSummary":"Unknown: 1"}'
GPU_MEMBERS_RESP = (
    '{"Members": ['
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.21-1"},'
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.22-1"}]}'
)
# Member ids that match neither the "Video" nor "ProcAccelerator" filter.
GPU_MEMBERS_RESP_FAULTY = (
    '{"Members": ['
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.21-1"},'
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.22-1"}]}'
)
GPU_DATA_RESP1 = (
    '{"Model": "AMD Instinct MI300X",'
    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
    '"ProcessorType": "Accelerator",'
    '"Id": "ProcAccelerator.Slot.21-1"}'
)
GPU_DATA_RESP2 = (
    '{"Model": "AMD Instinct MI300X",'
    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
    '"ProcessorType": "Accelerator",'
    '"Id": "ProcAccelerator.Slot.22-1"}'
)
# Payload with no "Model" key - trips the get_gpu_summary KeyError path.
GPU_DATA_RESP_FAULTY = '{"GPU":"" }'
RESPONSE_LS_GPU = (
    "- INFO - GPU Summary:\n"
    "- INFO - Model: AMD Instinct MI300X (Count: 2)\n"
    "- INFO - Current GPU's on host:\n"
    "- INFO - ProcAccelerator.Slot.21-1:\n"
    "- INFO - Model: AMD Instinct MI300X\n"
    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
    "- INFO - ProcessorType: Accelerator\n"
    "- INFO - ProcAccelerator.Slot.22-1:\n"
    "- INFO - Model: AMD Instinct MI300X\n"
    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
    "- INFO - ProcessorType: Accelerator\n"
)

RESPONSE_LS_GPU_SUMMARY_DATA_ERROR = "- ERROR - GPU endpoint not available on host.\n"
RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR = "- ERROR - There was something wrong getting GPU summary values.\n"
RESPONSE_LS_GPU_SUMMARY_BAD_JSON = "- ERROR - There was something wrong getting GPU data\n"
RESPONSE_LS_GPU_DETAILS_NOT_FOUND = "- ERROR - There was something wrong getting host GPU details\n"
RESPONSE_LS_GPU_DETAILS_VALUE_ERROR = (
    "- INFO - GPU Summary:\n"
    # FIX: list_gpu always appends "(Count: N)" to the model line.
    # NOTE(review): a count of 1 is assumed - confirm against the mocked data
    # of the test that consumes this fixture.
    "- INFO - Model: AMD Instinct MI300X OAM (Count: 1)\n"
    "- INFO - Current GPU's on host:\n"
    # FIX: the exception text in main.py says "details values", not "detailed values".
    "- ERROR - There was something wrong getting host GPU details values.\n"
)
from tests.test_base import TestBase


class TestLsGpu(TestBase):
    """Exercises the --ls-gpu option end to end against mocked Redfish responses."""

    option_arg = "--ls-gpu"

    @patch("aiohttp.ClientSession.delete")
    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu(self, mock_get, mock_post, mock_delete):
        # Happy path: members listing followed by one payload per accelerator slot.
        gpu_responses = [GPU_MEMBERS_RESP, GPU_DATA_RESP1, GPU_DATA_RESP2]
        self.set_mock_response(mock_delete, 200, "OK")
        self.set_mock_response(mock_post, 200, "OK")
        self.set_mock_response(mock_get, 200, INIT_RESP + gpu_responses)
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU

    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_data_not_available(self, mock_get, mock_post):
        # A 404 from the Processors endpoint aborts before any GPU parsing.
        self.set_mock_response(mock_post, 200, "OK")
        self.set_mock_response(
            mock_get, [200, 200, 200, 200, 200, 404], INIT_RESP + [GPU_SUMMARY_RESP_FAULTY]
        )
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR

    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_summary_data_error(self, mock_get, mock_post):
        # The 404 in the status sequence hits the GPU endpoint lookup.
        gpu_responses = [GPU_MEMBERS_RESP, GPU_DATA_RESP1, GPU_DATA_RESP_FAULTY]
        self.set_mock_response(
            mock_get, [200, 200, 200, 200, 200, 404, 200, 200], INIT_RESP + gpu_responses
        )
        self.set_mock_response(mock_post, 200, "OK")
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR

    @patch("aiohttp.ClientSession.delete")
    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_summary_value_error(self, mock_get, mock_post, mock_delete):
        # A GPU payload with no "Model" key trips the summary KeyError handler.
        gpu_responses = [GPU_MEMBERS_RESP, GPU_DATA_RESP1, GPU_DATA_RESP_FAULTY]
        self.set_mock_response(mock_get, 200, INIT_RESP + gpu_responses)
        self.set_mock_response(mock_post, 200, "OK")
        self.set_mock_response(mock_delete, 200, "OK")
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR