Skip to content

Commit 18d809e

Browse files
PalakTripathi1 authored and brenns10 committed
rds.py: Added RDS QP state diagnostics to the rds.py script
Added a new helper to extract and display detailed RDS QP state, including source/destination IPs. This enhances visibility into per-connection RDMA hardware state via ib_connection traversal. Useful for debugging QP states and RDMA mapping. Orabug: 38221449 Signed-off-by: Palak Tripathi <[email protected]> Reviewed-by: Anand Khoje <[email protected]> Reviewed-by: Stephen Brennan <[email protected]>
1 parent 928e8ae commit 18d809e

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

drgn_tools/rds.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import drgn
2222
from drgn import cast
2323
from drgn import container_of
24+
from drgn import FaultError
2425
from drgn import Object
2526
from drgn import PlatformFlags
2627
from drgn import Program
@@ -889,6 +890,7 @@ def rds_info_verbose(
889890
"Tos",
890891
"SL",
891892
"SrcQPNo",
893+
"SrcQPState",
892894
"DstQPNo",
893895
"Cache_allocs",
894896
"Recv_alloc_ctr",
@@ -909,6 +911,7 @@ def rds_info_verbose(
909911
"RCV IRQN",
910912
]
911913
]
914+
qp_state_type = prog.type("enum ib_qp_state")
912915
for rds_ib_dev in for_each_rds_ib_device(prog):
913916
dev_name = rds_ib_dev.dev.name.string_().decode("utf-8")
914917
debuginfo = ensure_mlx_core_ib_debuginfo(prog, dev_name)
@@ -924,6 +927,14 @@ def rds_info_verbose(
924927
dstqpnum: Any = int(con.i_dst_qp_num)
925928
except AttributeError:
926929
dstqpnum = "N/A"
930+
try:
931+
ibqp = con.i_cm_id.qp
932+
mlx5_srcqp = container_of(ibqp, "struct mlx5_ib_qp", "ibqp")
933+
srcqpstate = str(cast(qp_state_type, mlx5_srcqp.state)).split(
934+
"_"
935+
)[-1]
936+
except FaultError:
937+
srcqpstate = "N/A"
927938
sl = int(con.i_sl)
928939
cache_allocs = int(con.i_cache_allocs.counter)
929940
recv_free_ctr = int(con.i_recv_ring.w_free_ctr.counter)
@@ -1041,6 +1052,7 @@ def rds_info_verbose(
10411052
conn_tos,
10421053
sl,
10431054
srcqpnum,
1055+
srcqpstate,
10441056
dstqpnum,
10451057
cache_allocs,
10461058
recv_alloc_ctr,
@@ -1085,6 +1097,80 @@ def rds_info_verbose(
10851097
return None
10861098

10871099

1100+
def rds_conn_cq_eq_info(
    prog: drgn.Program,
    outfile: Optional[str] = None,
    report: bool = False,
) -> None:
    """
    Print a table of send/receive CQ and EQ details for each RDS IB
    connection.

    Every ``struct rds_ib_connection`` linked on the ``conn_list`` of every
    RDS IB device contributes one row. A connection is left out of the table
    when reading any of its fields raises an exception. If
    ``ensure_debuginfo`` returns a message for the ``rds``, ``mlx5_core`` or
    ``mlx5_ib`` modules, that message is printed and nothing else is done.

    :param prog: drgn program
    :param outfile: forwarded unchanged to the output ``Table``
    :param report: forwarded unchanged to the output ``Table``
    :returns: None
    """
    debuginfo_msg = ensure_debuginfo(prog, ["rds", "mlx5_core", "mlx5_ib"])
    if debuginfo_msg:
        print(debuginfo_msg)
        return

    header = [
        "LocalAddr",
        "RemoteAddr",
        "SCQNo",
        "SCQ_ptr",
        "RCQNo",
        "RCQ_ptr",
        "SCQ_EQNo",
        "SCQ_EQ_ptr",
        "RCQ_EQNo",
        "RCQ_EQ_ptr",
    ]
    cq_eq_table = Table(header, outfile=outfile, report=report)

    for rds_ib_dev in for_each_rds_ib_device(prog):
        conn_list_head = rds_ib_dev.conn_list.address_of_()
        for ic in list_for_each_entry(
            "struct rds_ib_connection", conn_list_head, "ib_node"
        ):
            rds_conn = ic.conn

            try:
                local_addr = rds_inet_ntoa(rds_conn.c_laddr)
                remote_addr = rds_inet_ntoa(rds_conn.c_faddr)

                # Send-side completion queue: the "SCQNo" column is the
                # connection's i_scq_vector, and the EQ is reached through
                # the mlx5_ib_cq that embeds the generic CQ as "ibcq".
                send_cq = ic.i_scq
                send_eq = container_of(
                    send_cq, "struct mlx5_ib_cq", "ibcq"
                ).eq
                send_fields = (
                    int(ic.i_scq_vector),
                    hex(send_cq.value_()),
                    int(send_eq.eqn),
                    hex(send_eq.value_()),
                )

                # Receive-side completion queue, handled the same way.
                recv_cq = ic.i_rcq
                recv_eq = container_of(
                    recv_cq, "struct mlx5_ib_cq", "ibcq"
                ).eq
                recv_fields = (
                    int(ic.i_rcq_vector),
                    hex(recv_cq.value_()),
                    int(recv_eq.eqn),
                    hex(recv_eq.value_()),
                )
            except Exception:
                # Best effort: skip any connection whose data cannot be
                # read rather than aborting the whole table.
                continue

            send_vec, send_cq_ptr, send_eqn, send_eq_ptr = send_fields
            recv_vec, recv_cq_ptr, recv_eqn, recv_eq_ptr = recv_fields
            cq_eq_table.row(
                local_addr,
                remote_addr,
                send_vec,
                send_cq_ptr,
                recv_vec,
                recv_cq_ptr,
                send_eqn,
                send_eq_ptr,
                recv_eqn,
                recv_eq_ptr,
            )

    print("\nRDS conn CQ/EQ info:")
    cq_eq_table.write()
1173+
10881174
def rds_sock_info(
10891175
prog: drgn.Program,
10901176
ret: bool = False,
@@ -1578,6 +1664,7 @@ def report(prog: drgn.Program, outfile: Optional[str] = None) -> None:
15781664
rds_sock_info(prog, outfile=outfile, report=True)
15791665
rds_conn_info(prog, outfile=outfile, report=True)
15801666
rds_info_verbose(prog, outfile=outfile, report=True)
1667+
rds_conn_cq_eq_info(prog, outfile, report=True)
15811668
rds_stats(prog, outfile=outfile, report=True)
15821669
rds_print_msg_queue(prog, queue="All", outfile=outfile, report=True)
15831670

0 commit comments

Comments
 (0)