Skip to content

Commit 8be9663

Browse files
jcphill authored and rbuch committed
Fix verbs on new InfiniBand hardware/drivers (#2712)
Verbs started rejecting attempts to set retry_cnt to 20 because it is only a 3-bit field, so the maximum legal value is 7. The rejection triggered a retry using the "QLOGIC" values, which attempted to modify the mtu. However, the mtu flag was not set, and the mtu could not be modified at this point of setup anyway, so the new value only took effect for nodes attempting to connect later and was in general inconsistent. This resulted in "Work completion error in sendCq" Charm++ aborts along with mlx5 driver "got completion with error" messages. The fix is to always set retry_cnt to 7 and to not try to change the mtu.
1 parent 7427330 commit 8be9663

File tree

1 file changed

+1
-8
lines changed

1 file changed

+1
-8
lines changed

src/arch/verbs/machine-ibverbs.C

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -976,11 +976,10 @@ struct infiOtherNodeData *initInfiOtherNodeData(int node,int addr[3]){
976976
attr.qp_state = IBV_QPS_RTS;
977977
#if ! QLOGIC
978978
attr.timeout = 26;
979-
attr.retry_cnt = 20;
980979
#else
981980
attr.timeout = 14;
982-
attr.retry_cnt = 7;
983981
#endif
982+
attr.retry_cnt = 7;
984983
attr.rnr_retry = 7;
985984
attr.sq_psn = context->localAddr[node].psn;
986985
attr.max_rd_atomic = 1;
@@ -999,15 +998,9 @@ struct infiOtherNodeData *initInfiOtherNodeData(int node,int addr[3]){
999998
if(err == 22) {
1000999
//use inverted logic
10011000
#if QLOGIC
1002-
mtu = IBV_MTU_2048;
1003-
attr.path_mtu = mtu;
10041001
attr.timeout = 26;
1005-
attr.retry_cnt = 20;
10061002
#else
1007-
mtu = IBV_MTU_4096;
1008-
attr.path_mtu = mtu;
10091003
attr.timeout = 14;
1010-
attr.retry_cnt = 7;
10111004
#endif
10121005

10131006
MACHSTATE3(3,"Retry:dlid 0x%x qp 0x%x psn 0x%x",attr.ah_attr.dlid,attr.dest_qp_num,attr.sq_psn);

0 commit comments

Comments
 (0)