Skip to content

Commit 1180ed4

Browse files
nvidia-gpu: add support for PCIe port metrics
Add an xyz.openbmc_project.Metric.Value interface for each of the following PCIe port metrics of a ConnectX device: PCIeErrors.CorrectableErrorCount PCIeErrors.NonFatalErrorCount PCIeErrors.FatalErrorCount PCIeErrors.L0ToRecoveryCount PCIeErrors.ReplayCount PCIeErrors.ReplayRolloverCount PCIeErrors.NAKSentCount PCIeErrors.NAKReceivedCount PCIeErrors.UnsupportedRequestCount PDI Patch - https://gerrit.openbmc.org/c/openbmc/phosphor-dbus-interfaces/+/84839 Tested: Build an image for nvl32-obmc machine with the following patch cherry-picked. https://gerrit.openbmc.org/c/openbmc/openbmc/+/85490 The patch cherry-picks the following patches that are currently under review. ``` 1. device tree https://lore.kernel.org/all/aRbLqH8pLWCQryhu@molberding.nvidia.com/ 2. mctpd patches CodeConstruct/mctp#85 3. u-boot changes https://lore.kernel.org/openbmc/20251121-msx4-v1-0-fc0118b666c1@nvidia.com/T/#t 4. kernel changes as specified in the openbmc patch (for espi) 5. entity-manager changes https://gerrit.openbmc.org/c/openbmc/entity-manager/+/85455 6. platform-init changes https://gerrit.openbmc.org/c/openbmc/platform-init/+/85456 7. 
spi changes https://lore.kernel.org/all/20251121-w25q01jv_fixup-v1-1-3d175050db73@nvidia.com/ ``` ``` root@nvl32-obmc:~# busctl tree xyz.openbmc_project.GpuSensor `- /xyz `- /xyz/openbmc_project |- /xyz/openbmc_project/inventory | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe/DOWN_0 | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe/DOWN_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_0_PCIe/UP_0 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe/DOWN_0 | | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe/DOWN_1 | | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_2_PCIe/UP_0 | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe/DOWN_0 | |- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe/DOWN_1 | `- /xyz/openbmc_project/inventory/Nvidia_ConnectX_3_PCIe/UP_0 |- /xyz/openbmc_project/metric | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0 | | `- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/correctable_error_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/fatal_error_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/l0_to_recovery_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/nak_received_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/nak_sent_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/non_fatal_error_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/replay_count | | |- /xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/replay_rollover_count | | `- 
/xyz/openbmc_project/metric/port_Nvidia_ConnectX_0_PCIe_DOWN_0/pcie/unsupported_request_count root@nvl32-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/metric/port_Nvidia_ConnectX_3_PCIe_DOWN_1/pcie/l0_to_recovery_count NAME TYPE SIGNATURE RESULT/VALUE FLAGS org.freedesktop.DBus.Introspectable interface - - - .Introspect method - s - org.freedesktop.DBus.Peer interface - - - .GetMachineId method - s - .Ping method - - - org.freedesktop.DBus.Properties interface - - - .Get method ss v - .GetAll method s a{sv} - .Set method ssv - - .PropertiesChanged signal sa{sv}as - - xyz.openbmc_project.Association.Definitions interface - - - .Associations property a(sss) 1 "measuring" "measured_by" "/xyz/ope... emits-change xyz.openbmc_project.Metric.Value interface - - - .Unit property s "xyz.openbmc_project.Metric.Value.Uni... emits-change .Value property d 1 emits-change ``` Change-Id: I3379c09346653d6a6bf2921bf765f0adf5a22098 Signed-off-by: Harshit Aghera <haghera@nvidia.com>
1 parent 6df2fc0 commit 1180ed4

File tree

7 files changed

+379
-5
lines changed

7 files changed

+379
-5
lines changed

src/nvidia-gpu/NvidiaGpuSensorMain.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ int main()
5656
objectServer.add_manager("/xyz/openbmc_project/sensors");
5757
objectServer.add_manager("/xyz/openbmc_project/inventory");
5858
objectServer.add_manager("/xyz/openbmc_project/software");
59+
objectServer.add_manager("/xyz/openbmc_project/metric");
5960
systemBus->request_name("xyz.openbmc_project.GpuSensor");
6061

6162
mctp::MctpRequester mctpRequester(io);

src/nvidia-gpu/NvidiaPcieDevice.cpp

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "NvidiaGpuMctpVdm.hpp"
1010
#include "NvidiaPcieInterface.hpp"
1111
#include "NvidiaPciePort.hpp"
12+
#include "NvidiaPciePortMetrics.hpp"
1213
#include "Utils.hpp"
1314

1415
#include <MctpRequester.hpp>
@@ -104,8 +105,10 @@ void PcieDevice::processPciePortCountsResponse(
104105

105106
void PcieDevice::makeSensors()
106107
{
108+
const std::string pcieDeviceName = name + "_PCIe";
109+
107110
pcieInterface = std::make_shared<NvidiaPcieInterface>(
108-
conn, mctpRequester, name, path, eid, objectServer);
111+
conn, mctpRequester, pcieDeviceName, path, eid, objectServer);
109112

110113
uint64_t downstreamPortIndex = 0;
111114

@@ -114,7 +117,19 @@ void PcieDevice::makeSensors()
114117
const std::string portName = std::format("UP_{}", i);
115118

116119
pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
117-
conn, mctpRequester, portName, name, path, eid,
120+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
121+
gpu::PciePortType::UPSTREAM, i, i, objectServer));
122+
123+
pciePortMetrics.emplace_back(makeNvidiaPciePortErrors(
124+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
125+
gpu::PciePortType::UPSTREAM, i, i, objectServer));
126+
127+
pciePortMetrics.emplace_back(makeNvidiaPciePortCounters(
128+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
129+
gpu::PciePortType::UPSTREAM, i, i, objectServer));
130+
131+
pciePortMetrics.emplace_back(makeNvidiaPciePortL0ToRecoveryCount(
132+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
118133
gpu::PciePortType::UPSTREAM, i, i, objectServer));
119134

120135
for (uint64_t j = 0; j < pcieDeviceInfo.numDownstreamPorts[i]; ++j)
@@ -123,7 +138,22 @@ void PcieDevice::makeSensors()
123138
std::format("DOWN_{}", downstreamPortIndex);
124139

125140
pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
126-
conn, mctpRequester, portName, name, path, eid,
141+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
142+
gpu::PciePortType::DOWNSTREAM, i, downstreamPortIndex,
143+
objectServer));
144+
145+
pciePortMetrics.emplace_back(makeNvidiaPciePortErrors(
146+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
147+
gpu::PciePortType::DOWNSTREAM, i, downstreamPortIndex,
148+
objectServer));
149+
150+
pciePortMetrics.emplace_back(makeNvidiaPciePortCounters(
151+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
152+
gpu::PciePortType::DOWNSTREAM, i, downstreamPortIndex,
153+
objectServer));
154+
155+
pciePortMetrics.emplace_back(makeNvidiaPciePortL0ToRecoveryCount(
156+
conn, mctpRequester, portName, pcieDeviceName, path, eid,
127157
gpu::PciePortType::DOWNSTREAM, i, downstreamPortIndex,
128158
objectServer));
129159

@@ -146,6 +176,11 @@ void PcieDevice::read()
146176
port->update();
147177
}
148178

179+
for (auto& portMetrics : pciePortMetrics)
180+
{
181+
portMetrics->update();
182+
}
183+
149184
waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
150185
waitTimer.async_wait([this](const boost::system::error_code& ec) {
151186
if (ec)

src/nvidia-gpu/NvidiaPcieDevice.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "NvidiaPcieInterface.hpp"
1111

1212
#include <NvidiaPciePort.hpp>
13+
#include <NvidiaPciePortMetrics.hpp>
1314
#include <boost/asio/io_context.hpp>
1415
#include <boost/asio/steady_timer.hpp>
1516
#include <sdbusplus/asio/connection.hpp>
@@ -21,8 +22,7 @@
2122
#include <string>
2223
#include <vector>
2324

24-
constexpr const char* pcieDevicePathPrefix =
25-
"/xyz/openbmc_project/inventory/pcie_devices/";
25+
// Root D-Bus object path under which PCIe device objects (and, beneath them,
// their port objects) are placed: "<prefix><device name>/<port name>".
constexpr const char* pcieDevicePathPrefix = "/xyz/openbmc_project/inventory/";
2626

2727
struct PcieDeviceInfo
2828
{
@@ -83,4 +83,5 @@ class PcieDevice : public std::enable_shared_from_this<PcieDevice>
8383
std::shared_ptr<NvidiaPcieInterface> pcieInterface;
8484

8585
std::vector<std::shared_ptr<NvidiaPciePortInfo>> pciePorts;
86+
std::vector<std::shared_ptr<NvidiaPciePortMetrics>> pciePortMetrics;
8687
};
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#include "NvidiaPciePortMetrics.hpp"
7+
8+
#include "NvidiaUtils.hpp"
9+
#include "Utils.hpp"
10+
11+
#include <bits/basic_string.h>
12+
13+
#include <MctpRequester.hpp>
14+
#include <NvidiaGpuMctpVdm.hpp>
15+
#include <NvidiaPcieDevice.hpp>
16+
#include <OcpMctpVdm.hpp>
17+
#include <phosphor-logging/lg2.hpp>
18+
#include <sdbusplus/asio/connection.hpp>
19+
#include <sdbusplus/asio/object_server.hpp>
20+
#include <sdbusplus/message/native_types.hpp>
21+
22+
#include <cstddef>
23+
#include <cstdint>
24+
#include <format>
25+
#include <functional>
26+
#include <memory>
27+
#include <span>
28+
#include <string>
29+
#include <system_error>
30+
#include <vector>
31+
32+
using std::string;
33+
34+
using namespace std::literals;
35+
36+
// D-Bus interface name under which each metric object exposes its "Unit" and
// "Value" properties.
constexpr const char* metricInterface = "xyz.openbmc_project.Metric.Value";
37+
38+
/**
 * @brief Create the D-Bus metric objects for one scalar group of one PCIe
 *        port.
 *
 * For every entry of @p metricsInfo a D-Bus object is created at
 * "<metricPath>port_<pcieDeviceName>_<name><metric suffix>" carrying
 *  - the Metric.Value interface ("Unit" fixed to Count, "Value" starting at
 *    0.0), and
 *  - an association interface with a single "measuring"/"measured_by"
 *    association that points at the port object
 *    "<pcieDevicePathPrefix><pcieDeviceName>/<name>".
 *
 * A failure to initialize either interface is logged; construction continues
 * with the remaining metrics.
 *
 * @param conn               D-Bus connection, stored in the object.
 * @param mctpRequester      MCTP requester later used by update().
 * @param name               Port name (e.g. "UP_0", "DOWN_1").
 * @param pcieDeviceName     Name of the PCIe device the port belongs to.
 * @param path               Stored in the member of the same name.
 * @param eid                MCTP endpoint ID the telemetry requests go to.
 * @param portType           Port type passed to the telemetry request.
 * @param upstreamPortNumber Upstream port number passed to the request.
 * @param portNumber         Port number passed to the request.
 * @param objectServer       Object server on which interfaces are created.
 * @param scalarGroupId      Scalar group queried by update().
 * @param metricsInfo        (id, path suffix) pairs; the id is the slot in
 *                           metricValueInterfaces, and processResponse()
 *                           writes the telemetry value with the same index
 *                           into that slot.
 */
NvidiaPciePortMetrics::NvidiaPciePortMetrics(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer, uint8_t scalarGroupId,
    const std::vector<NvidiaMetricInfo>& metricsInfo) :
    eid(eid), portType(portType), upstreamPortNumber(upstreamPortNumber),
    portNumber(portNumber), scalarGroupId(scalarGroupId), path(path),
    conn(conn), mctpRequester(mctpRequester)
{
    const std::string metricsDbusPathPrefix =
        metricPath + std::format("port_{}_{}", pcieDeviceName, name);

    const sdbusplus::message::object_path portDbusPath =
        sdbusplus::message::object_path(pcieDevicePathPrefix) / pcieDeviceName /
        name;

    // The binding is called metricName (not name) so that it does not shadow
    // the constructor's port-name parameter.
    for (const auto& [id, metricName] : metricsInfo)
    {
        const std::string metricsDbusPath = metricsDbusPathPrefix + metricName;

        // Bind the slots once instead of re-indexing on every use.
        auto& valueInterface = metricValueInterfaces[id];
        auto& associationInterface = metricAssociationInterfaces[id];

        valueInterface =
            objectServer.add_interface(metricsDbusPath, metricInterface);
        valueInterface->register_property(
            "Unit", "xyz.openbmc_project.Metric.Value.Unit.Count"s);
        valueInterface->register_property("Value", 0.0);

        std::vector<Association> associations;
        associations.emplace_back("measuring", "measured_by", portDbusPath);

        associationInterface =
            objectServer.add_interface(metricsDbusPath, association::interface);
        associationInterface->register_property("Associations", associations);

        if (!valueInterface->initialize())
        {
            // Each key is passed exactly once (the original repeated the
            // "EID" and "PN" pairs).
            lg2::error(
                "Error initializing PCIe Port Metric Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }

        if (!associationInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Association Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }
    }
}
95+
96+
void NvidiaPciePortMetrics::processResponse(
97+
const std::error_code& sendRecvMsgResult, std::span<const uint8_t> response)
98+
{
99+
if (sendRecvMsgResult)
100+
{
101+
lg2::error(
102+
"Error updating PCIe Port Metrics: sending message over MCTP failed, "
103+
"rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
104+
"RC", sendRecvMsgResult.message(), "EID", eid, "PT",
105+
static_cast<uint8_t>(portType), "PN", portNumber, "SG",
106+
scalarGroupId);
107+
return;
108+
}
109+
110+
ocp::accelerator_management::CompletionCode cc{};
111+
uint16_t reasonCode = 0;
112+
size_t numTelemetryValue = 0;
113+
114+
int rc = gpu::decodeQueryScalarGroupTelemetryV2Response(
115+
response, cc, reasonCode, numTelemetryValue, telemetryValues);
116+
117+
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
118+
{
119+
lg2::error(
120+
"Error updating PCIe Port Errors: decode failed, "
121+
"rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
122+
"RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
123+
eid, "PT", static_cast<uint8_t>(portType), "PN", portNumber, "SG",
124+
scalarGroupId);
125+
return;
126+
}
127+
128+
for (size_t i = 0; i < numTelemetryValue; ++i)
129+
{
130+
if (metricValueInterfaces[i] != nullptr)
131+
{
132+
metricValueInterfaces[i]->set_property(
133+
"Value", static_cast<double>(telemetryValues[i]));
134+
}
135+
}
136+
}
137+
138+
void NvidiaPciePortMetrics::update()
139+
{
140+
auto rc = gpu::encodeQueryScalarGroupTelemetryV2Request(
141+
0, portType, upstreamPortNumber, portNumber, scalarGroupId, request);
142+
143+
if (rc != 0)
144+
{
145+
lg2::error(
146+
"Error updating PCIe Port Errors: encode failed, rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
147+
"RC", rc, "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
148+
portNumber, "SG", scalarGroupId);
149+
return;
150+
}
151+
152+
mctpRequester.sendRecvMsg(
153+
eid, request,
154+
[weak{weak_from_this()}](const std::error_code& ec,
155+
std::span<const uint8_t> buffer) {
156+
std::shared_ptr<NvidiaPciePortMetrics> self = weak.lock();
157+
if (!self)
158+
{
159+
lg2::error("Invalid reference to NvidiaPciePortErrors");
160+
return;
161+
}
162+
self->processResponse(ec, buffer);
163+
});
164+
}
165+
166+
std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortErrors(
167+
std::shared_ptr<sdbusplus::asio::connection>& conn,
168+
mctp::MctpRequester& mctpRequester, const std::string& name,
169+
const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
170+
gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
171+
sdbusplus::asio::object_server& objectServer)
172+
{
173+
static constexpr uint8_t nvidiaPciePortErrorScalarGroupId = 2;
174+
175+
return std::make_shared<NvidiaPciePortMetrics>(
176+
conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
177+
upstreamPortNumber, portNumber, objectServer,
178+
nvidiaPciePortErrorScalarGroupId,
179+
std::vector<NvidiaMetricInfo>{
180+
{0, "/pcie/non_fatal_error_count"},
181+
{1, "/pcie/fatal_error_count"},
182+
{2, "/pcie/unsupported_request_count"},
183+
{3, "/pcie/correctable_error_count"},
184+
});
185+
}
186+
187+
std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortCounters(
188+
std::shared_ptr<sdbusplus::asio::connection>& conn,
189+
mctp::MctpRequester& mctpRequester, const std::string& name,
190+
const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
191+
gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
192+
sdbusplus::asio::object_server& objectServer)
193+
{
194+
static constexpr uint8_t nvidiaPciePortCounterScalarGroupId = 4;
195+
196+
return std::make_shared<NvidiaPciePortMetrics>(
197+
conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
198+
upstreamPortNumber, portNumber, objectServer,
199+
nvidiaPciePortCounterScalarGroupId,
200+
std::vector<NvidiaMetricInfo>{
201+
{1, "/pcie/nak_received_count"},
202+
{2, "/pcie/nak_sent_count"},
203+
{4, "/pcie/replay_rollover_count"},
204+
{6, "/pcie/replay_count"},
205+
});
206+
}
207+
208+
std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortL0ToRecoveryCount(
209+
std::shared_ptr<sdbusplus::asio::connection>& conn,
210+
mctp::MctpRequester& mctpRequester, const std::string& name,
211+
const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
212+
gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
213+
sdbusplus::asio::object_server& objectServer)
214+
{
215+
static constexpr uint8_t nvidiaPciePortL0ToRecoveryCountScalarGroupId = 3;
216+
217+
return std::make_shared<NvidiaPciePortMetrics>(
218+
conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
219+
upstreamPortNumber, portNumber, objectServer,
220+
nvidiaPciePortL0ToRecoveryCountScalarGroupId,
221+
std::vector<NvidiaMetricInfo>{
222+
{0, "/pcie/l0_to_recovery_count"},
223+
});
224+
}

0 commit comments

Comments
 (0)