Skip to content

Commit 0751f42

Browse files
authored
Merge pull request #11656 from wzamazon/btl_ofi_fix_flush
btl/ofi: increase outstanding_rdma earlier for flush
2 parents 479c746 + 585462d commit 0751f42

File tree

2 files changed

+17
-9
lines changed

2 files changed

+17
-9
lines changed

opal/mca/btl/ofi/btl_ofi_atomics.c

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ int mca_btl_ofi_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endp
6363
mca_btl_ofi_rdma_completion_t *comp = NULL;
6464
mca_btl_ofi_context_t *ofi_context;
6565

66+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
6667
ofi_context = get_ofi_context(ofi_btl);
6768

6869
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
@@ -87,16 +88,16 @@ int mca_btl_ofi_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endp
8788
fi_datatype, fi_op, &comp->comp_ctx);
8889

8990
if (rc == -FI_EAGAIN) {
91+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
9092
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
9193
return OPAL_ERR_OUT_OF_RESOURCE;
9294
} else if (rc < 0) {
95+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
9396
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
9497
BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
9598
MCA_BTL_OFI_ABORT();
9699
}
97100

98-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
99-
100101
return OPAL_SUCCESS;
101102
}
102103

@@ -114,6 +115,7 @@ int mca_btl_ofi_aop(struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *
114115
mca_btl_ofi_rdma_completion_t *comp = NULL;
115116
mca_btl_ofi_context_t *ofi_context;
116117

118+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
117119
ofi_context = get_ofi_context(ofi_btl);
118120

119121
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
@@ -136,16 +138,16 @@ int mca_btl_ofi_aop(struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *
136138
fi_datatype, fi_op, &comp->comp_ctx);
137139

138140
if (rc == -FI_EAGAIN) {
141+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
139142
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
140143
return OPAL_ERR_OUT_OF_RESOURCE;
141144
} else if (rc < 0) {
145+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
142146
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
143147
BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
144148
MCA_BTL_OFI_ABORT();
145149
}
146150

147-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
148-
149151
return OPAL_SUCCESS;
150152
}
151153

@@ -165,6 +167,7 @@ int mca_btl_ofi_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_en
165167
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t *) endpoint;
166168
mca_btl_ofi_context_t *ofi_context;
167169

170+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
168171
ofi_context = get_ofi_context(ofi_btl);
169172

170173
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
@@ -188,15 +191,15 @@ int mca_btl_ofi_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_en
188191
fi_datatype, FI_CSWAP, &comp->comp_ctx);
189192

190193
if (rc == -FI_EAGAIN) {
194+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
191195
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
192196
return OPAL_ERR_OUT_OF_RESOURCE;
193197
} else if (rc < 0) {
198+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
194199
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
195200
BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
196201
MCA_BTL_OFI_ABORT();
197202
}
198203

199-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
200-
201204
return OPAL_SUCCESS;
202205
}

opal/mca/btl/ofi/btl_ofi_rdma.c

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,8 @@ int mca_btl_ofi_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
6666
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t *) endpoint;
6767
mca_btl_ofi_context_t *ofi_context;
6868

69+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
70+
6971
ofi_context = get_ofi_context(ofi_btl);
7072

7173
/* create completion context */
@@ -84,17 +86,18 @@ int mca_btl_ofi_get(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
8486
&comp->comp_ctx); /* completion context */
8587

8688
if (-FI_EAGAIN == rc) {
89+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
8790
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
8891
return OPAL_ERR_OUT_OF_RESOURCE;
8992
}
9093

9194
if (0 != rc) {
95+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
9296
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
9397
BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
9498
MCA_BTL_OFI_ABORT();
9599
}
96100

97-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
98101

99102
return OPAL_SUCCESS;
100103
}
@@ -111,6 +114,8 @@ int mca_btl_ofi_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
111114
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t *) endpoint;
112115
mca_btl_ofi_context_t *ofi_context;
113116

117+
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
118+
114119
ofi_context = get_ofi_context(ofi_btl);
115120

116121
/* create completion context */
@@ -127,18 +132,18 @@ int mca_btl_ofi_put(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoin
127132
&comp->comp_ctx); /* completion context */
128133

129134
if (-FI_EAGAIN == rc) {
135+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
130136
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
131137
return OPAL_ERR_OUT_OF_RESOURCE;
132138
}
133139

134140
if (0 != rc) {
141+
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
135142
opal_free_list_return(comp->base.my_list, (opal_free_list_item_t *) comp);
136143
BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
137144
MCA_BTL_OFI_ABORT();
138145
}
139146

140-
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
141-
142147
return OPAL_SUCCESS;
143148
}
144149

0 commit comments

Comments
 (0)