Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 108 additions & 2 deletions src/badfish/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1759,7 +1759,8 @@
processors = []
if data.get("Members"):
for member in data["Members"]:
processors.append(member["@odata.id"])
if "CPU" in member["@odata.id"]:
processors.append(member["@odata.id"])

proc_details = {}
for processor in processors:
Expand Down Expand Up @@ -1793,6 +1794,83 @@

return proc_details

async def get_gpu_data(self):
    """Fetch the raw Processors collection used for GPU discovery.

    Queries the Redfish ``<system_resource>/Processors`` endpoint and
    returns its decoded JSON payload.

    Returns:
        dict parsed from the endpoint response body.
    Raises:
        BadfishException: when the endpoint answers 404 or the body
            cannot be decoded as JSON.
    """
    endpoint = f"{self.host_uri}{self.system_resource}/Processors"
    response = await self.get_request(endpoint)

    if response.status == 404:
        raise BadfishException("GPU endpoint not available on host.")

    try:
        payload = await response.text("utf-8", "ignore")
        parsed = json.loads(payload.strip())
    except (ValueError, AttributeError):
        raise BadfishException("There was something wrong getting GPU data")

    return parsed

async def get_gpu_responses(self, data):
    """Fetch the detail payload of every GPU member in *data*.

    Members whose "@odata.id" contains "Video" or "ProcAccelerator" are
    treated as GPUs; each matching endpoint is fetched and JSON-decoded.

    Returns:
        list of decoded GPU detail dicts (empty when nothing matches).
    Raises:
        BadfishException: when a payload cannot be fetched or decoded.
    """
    responses = []
    try:
        members = data.get("Members") or []
        # Keep only member ids that look like GPU endpoints.
        endpoints = [
            member["@odata.id"]
            for member in members
            if "Video" in member["@odata.id"] or "ProcAccelerator" in member["@odata.id"]
        ]

        for endpoint in endpoints:
            detail = await self.get_request(f"{self.host_uri}{endpoint}")
            raw = await detail.text("utf-8", "ignore")
            responses.append(json.loads(raw.strip()))

    except (ValueError, AttributeError):  # pragma: no cover
        raise BadfishException("There was something wrong getting host GPU details")

    return responses

async def get_gpu_summary(self, gpu_responses):
    """Count GPUs per model.

    Args:
        gpu_responses: decoded GPU detail dicts, each with a "Model" key.
    Returns:
        dict mapping model name -> number of occurrences.
    Raises:
        BadfishException: when a payload lacks the "Model" key.
    """
    summary = {}
    try:
        for payload in gpu_responses:
            model = payload["Model"]
            # Default to 0 so the first occurrence counts as 1.
            summary[model] = summary.get(model, 0) + 1
    except (ValueError, AttributeError, KeyError):
        raise BadfishException("There was something wrong getting GPU summary values.")
    return summary

async def get_gpu_details(self, gpu_responses):
    """Build a per-GPU mapping of selected properties.

    Args:
        gpu_responses: decoded GPU detail dicts.
    Returns:
        dict keyed by each payload's "Id", mapping to the subset of
        Model / Manufacturer / ProcessorType fields that have truthy
        values (missing or empty fields are omitted).
    Raises:
        BadfishException: when the payloads cannot be processed.
    """
    details = {}
    try:
        wanted = ("Model", "Manufacturer", "ProcessorType")
        for payload in gpu_responses:
            details[payload.get("Id")] = {
                field: payload[field]
                for field in wanted
                if payload.get(field)
            }
    except (ValueError, AttributeError):  # pragma: no cover
        raise BadfishException("There was something wrong getting host GPU details values.")

    return details

async def get_memory_summary(self):
_url = "%s%s" % (self.host_uri, self.system_resource)
_response = await self.get_request(_url)
Expand Down Expand Up @@ -1916,6 +1994,27 @@

return True

async def list_gpu(self):
    """Log a GPU inventory for the host.

    Prints a per-model count summary followed by the selected details of
    each discovered GPU.

    Returns:
        True on success.
    """
    collection = await self.get_gpu_data()
    responses = await self.get_gpu_responses(collection)

    summary = await self.get_gpu_summary(responses)

    self.logger.info("GPU Summary:")
    for model, count in summary.items():
        self.logger.info(f" Model: {model} (Count: {count})")

    self.logger.info("Current GPU's on host:")

    details = await self.get_gpu_details(responses)

    for gpu_id, properties in details.items():
        self.logger.info(f" {gpu_id}:")
        for prop_name, prop_value in properties.items():
            self.logger.info(f" {prop_name}: {prop_value}")

    return True

async def list_memory(self):
data = await self.get_memory_summary()

Expand Down Expand Up @@ -2404,7 +2503,6 @@

await self.reboot_server()


async def execute_badfish(_host, _args, logger, format_handler=None):
_username = _args["u"]
_password = _args["p"]
Expand All @@ -2431,6 +2529,7 @@
check_job = _args["check_job"]
list_jobs = _args["ls_jobs"]
list_interfaces = _args["ls_interfaces"]
list_gpu = _args["ls_gpu"]
list_processors = _args["ls_processors"]
list_memory = _args["ls_memory"]
list_serial = _args["ls_serial"]
Expand Down Expand Up @@ -2521,6 +2620,8 @@
await badfish.list_interfaces()
elif list_processors:
await badfish.list_processors()
elif list_gpu:
await badfish.list_gpu()
elif list_memory:
await badfish.list_memory()
elif list_serial:
Expand Down Expand Up @@ -2716,6 +2817,11 @@
help="List Processor Summary",
action="store_true",
)
parser.add_argument(
"--ls-gpu",
help="List GPU's on host",
action="store_true",
)
parser.add_argument(
"--ls-memory",
help="List Memory Summary",
Expand Down
55 changes: 55 additions & 0 deletions tests/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,62 @@ def render_device_dict(index, device):
"- INFO - Model: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz\n"
"- ERROR - There was something wrong getting processor details\n"
)
# --- ls-gpu fixtures -------------------------------------------------------
# Raw JSON payloads served by the mocked Redfish endpoints and the exact
# log output expected from ``badfish --ls-gpu``.
# Fixed: was '{"GPUSummary":"AMD Instinct MI300X": 2,}', which is not valid
# JSON and would make json.loads raise if this fixture is ever served.
GPU_SUMMARY_RESP = '{"GPUSummary": {"AMD Instinct MI300X": 2}}'
GPU_SUMMARY_RESP_FAULTY = (
    '{"GPUSummary":"Unknown: 1"}'
)
GPU_MEMBERS_RESP = (
    '{"Members": ['
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.21-1"},'
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.22-1"}]}'
)
# Member ids contain neither "Video" nor "ProcAccelerator", so GPU endpoint
# filtering matches nothing.
GPU_MEMBERS_RESP_FAULTY = (
    '{"Members": ['
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.21-1"},'
    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.22-1"}]}'
)
GPU_DATA_RESP1 = (
    '{"Model": "AMD Instinct MI300X",'
    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
    '"ProcessorType": "Accelerator",'
    '"Id": "ProcAccelerator.Slot.21-1"}'
)
GPU_DATA_RESP2 = (
    '{"Model": "AMD Instinct MI300X",'
    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
    '"ProcessorType": "Accelerator",'
    '"Id": "ProcAccelerator.Slot.22-1"}'
)
# Detail payload without a "Model" key: exercises the KeyError path in the
# GPU summary code.
GPU_DATA_RESP_FAULTY = (
    '{"GPU":"" }'
)
RESPONSE_LS_GPU = (
    "- INFO - GPU Summary:\n"
    "- INFO - Model: AMD Instinct MI300X (Count: 2)\n"
    "- INFO - Current GPU's on host:\n"
    "- INFO - ProcAccelerator.Slot.21-1:\n"
    "- INFO - Model: AMD Instinct MI300X\n"
    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
    "- INFO - ProcessorType: Accelerator\n"
    "- INFO - ProcAccelerator.Slot.22-1:\n"
    "- INFO - Model: AMD Instinct MI300X\n"
    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
    "- INFO - ProcessorType: Accelerator\n"
)

RESPONSE_LS_GPU_SUMMARY_DATA_ERROR = "- ERROR - GPU endpoint not available on host.\n"
RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR = "- ERROR - There was something wrong getting GPU summary values.\n"
RESPONSE_LS_GPU_SUMMARY_BAD_JSON = "- ERROR - There was something wrong getting GPU data\n"
RESPONSE_LS_GPU_DETAILS_NOT_FOUND = "- ERROR - There was something wrong getting host GPU details\n"
# NOTE(review): the summary line below carries no "(Count: n)" suffix although
# list_gpu always appends one — confirm against the test that consumes it.
RESPONSE_LS_GPU_DETAILS_VALUE_ERROR = (
    "- INFO - GPU Summary:\n"
    "- INFO - Model: AMD Instinct MI300X OAM\n"
    "- INFO - Current GPU's on host:\n"
    # Fixed: was "host GPU detailed values." but main.py raises
    # "There was something wrong getting host GPU details values." — the
    # fixture could never match the produced output.
    "- ERROR - There was something wrong getting host GPU details values.\n"
)
DELL_REDFISH_ROOT_OEM_RESP = """
{"Oem":
{"Dell":
Expand Down
89 changes: 89 additions & 0 deletions tests/test_ls_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pytest
from unittest.mock import patch

from src.badfish.main import BadfishException
from tests.config import (
INIT_RESP,
GPU_MEMBERS_RESP,
GPU_MEMBERS_RESP_FAULTY,
GPU_DATA_RESP1,
GPU_DATA_RESP2,
GPU_SUMMARY_RESP,
RESPONSE_LS_GPU,
GPU_SUMMARY_RESP_FAULTY,
RESPONSE_LS_GPU_SUMMARY_DATA_ERROR,
RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR,
RESPONSE_LS_GPU_DETAILS_NOT_FOUND,
RESPONSE_LS_GPU_DETAILS_VALUE_ERROR, RESPONSE_LS_GPU_SUMMARY_BAD_JSON, GPU_DATA_RESP_FAULTY,
)
from tests.test_base import TestBase


class TestLsGpu(TestBase):
    """End-to-end tests for the ``--ls-gpu`` CLI option.

    The mocked ``aiohttp`` GET responses are consumed in order: the shared
    INIT_RESP sequence first, then the per-test GPU payloads appended below.
    """

    # CLI flag passed to badfish by every test in this class.
    option_arg = "--ls-gpu"

    @patch("aiohttp.ClientSession.delete")
    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu(self, mock_get, mock_post, mock_delete):
        """Happy path: summary and per-GPU details are printed for two GPUs."""
        # Processors listing, then one detail payload per accelerator slot.
        responses_add = [
            GPU_MEMBERS_RESP,
            GPU_DATA_RESP1,
            GPU_DATA_RESP2
        ]
        responses = INIT_RESP + responses_add
        self.set_mock_response(mock_delete,200, "OK")
        self.set_mock_response(mock_post,200, "OK")
        self.set_mock_response(mock_get, 200, responses)
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU

    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_data_not_available(
        self, mock_get, mock_post
    ):
        """A 404 from the Processors endpoint reports 'endpoint not available'."""
        responses_add = [
            GPU_SUMMARY_RESP_FAULTY,
        ]
        responses = INIT_RESP + responses_add
        self.set_mock_response(mock_post, 200, "OK")
        # The sixth GET (the GPU data fetch) returns 404.
        self.set_mock_response(mock_get, [200,200,200,200,200,404], responses)
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR

    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_summary_data_error(
        self, mock_get, mock_post
    ):
        """A 404 on the GPU data fetch surfaces the data-not-available error."""
        # NOTE(review): despite the method name, the status list puts a 404 in
        # position six, so this also exercises the 404 path — confirm intent.
        responses_add = [
            GPU_MEMBERS_RESP,
            GPU_DATA_RESP1,
            GPU_DATA_RESP_FAULTY,
        ]
        responses = INIT_RESP + responses_add
        self.set_mock_response(mock_get, [200,200,200,200,200,404,200,200], responses)
        self.set_mock_response(mock_post, 200, "OK")
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR

    @patch("aiohttp.ClientSession.delete")
    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_summary_value_error(self, mock_get, mock_post, mock_delete):
        """A detail payload lacking "Model" yields the summary value error."""
        responses_add = [
            GPU_MEMBERS_RESP,
            GPU_DATA_RESP1,
            GPU_DATA_RESP_FAULTY,
        ]
        responses = INIT_RESP + responses_add
        self.set_mock_response(mock_get, 200, responses)
        self.set_mock_response(mock_post, 200, "OK")
        self.set_mock_response(mock_delete, 200, "OK")
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR