Skip to content

Commit 6f5a6f4

Browse files
authored
Merge pull request ceph#58179 from cbodley/wip-rgw-assert-async
rgw: test that no asio threads are blocked on null_yield Reviewed-by: Adam Emerson <[email protected]>
2 parents 0daa092 + b0d0596 commit 6f5a6f4

37 files changed

+241
-144
lines changed

src/common/options/rgw.yaml.in

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,6 +2238,14 @@ options:
22382238
services:
22392239
- rgw
22402240
with_legacy: true
2241+
- name: rgw_asio_assert_yielding
2242+
type: bool
2243+
level: dev
2244+
desc: Trigger an assertion failure if an operation would block an asio thread
2245+
default: false
2246+
services:
2247+
- rgw
2248+
with_legacy: true
22412249
- name: rgw_user_quota_bucket_sync_interval
22422250
type: int
22432251
level: advanced

src/rgw/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ set(librgw_common_srcs
6363
rgw_acl_swift.cc
6464
rgw_aio.cc
6565
rgw_aio_throttle.cc
66+
rgw_asio_thread.cc
6667
rgw_auth.cc
6768
rgw_auth_s3.cc
6869
rgw_arn.cc

src/rgw/driver/rados/rgw_data_sync.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
277277
return io_block(0);
278278
}
279279
yield {
280-
op_ret = http_op->wait(shard_info, null_yield);
280+
op_ret = http_op->wait(dpp, shard_info, null_yield);
281281
http_op->put();
282282
}
283283

@@ -377,7 +377,7 @@ class RGWReadRemoteDataLogShardCR : public RGWCoroutine {
377377
}
378378
yield {
379379
timer.reset();
380-
op_ret = http_op->wait(&response, null_yield);
380+
op_ret = http_op->wait(dpp, &response, null_yield);
381381
http_op->put();
382382
}
383383

@@ -495,7 +495,7 @@ class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine {
495495
}
496496

497497
int request_complete() override {
498-
int ret = http_op->wait(result, null_yield);
498+
int ret = http_op->wait(sync_env->dpp, result, null_yield);
499499
http_op->put();
500500
if (ret < 0 && ret != -ENOENT) {
501501
ldpp_dout(sync_env->dpp, 5) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;

src/rgw/driver/rados/rgw_lc_tier.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
269269
}
270270

271271
/* fetch headers */
272-
ret = tier_ctx.conn.complete_request(in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
272+
ret = tier_ctx.conn.complete_request(tier_ctx.dpp, in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
273273
if (ret < 0 && ret != -ENOENT) {
274274
ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl;
275275
return ret;
@@ -704,8 +704,7 @@ RGWGetDataCB *RGWLCCloudStreamPut::get_cb() {
704704
}
705705

706706
int RGWLCCloudStreamPut::complete_request() {
707-
int ret = conn.complete_request(out_req, etag, &obj_properties.mtime, null_yield);
708-
return ret;
707+
return conn.complete_request(dpp, out_req, etag, &obj_properties.mtime, null_yield);
709708
}
710709

711710
/* Read local copy and write to Cloud endpoint */

src/rgw/driver/rados/rgw_notify.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ class Manager : public DoutPrefixProvider {
248248
<< " retry_number: "
249249
<< entry_persistency_tracker.retires_num
250250
<< " current time: " << time_now << dendl;
251-
const auto ret = push_endpoint->send(event_entry.event, yield);
251+
const auto ret = push_endpoint->send(this, event_entry.event, yield);
252252
if (ret < 0) {
253253
ldpp_dout(this, 5) << "WARNING: push entry marker: " << entry.marker
254254
<< " failed. error: " << ret
@@ -1266,7 +1266,7 @@ int publish_commit(rgw::sal::Object* obj,
12661266
dpp->get_cct());
12671267
ldpp_dout(res.dpp, 20) << "INFO: push endpoint created: "
12681268
<< topic.cfg.dest.push_endpoint << dendl;
1269-
const auto ret = push_endpoint->send(event_entry.event, res.yield);
1269+
const auto ret = push_endpoint->send(dpp, event_entry.event, res.yield);
12701270
if (ret < 0) {
12711271
ldpp_dout(dpp, 1)
12721272
<< "ERROR: failed to push sync notification event with error: "

src/rgw/driver/rados/rgw_pubsub_push.cc

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "common/Formatter.h"
1010
#include "common/iso_8601.h"
1111
#include "common/async/completion.h"
12+
#include "rgw_asio_thread.h"
1213
#include "rgw_common.h"
1314
#include "rgw_data_sync.h"
1415
#include "rgw_pubsub.h"
@@ -88,7 +89,8 @@ class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
8889
}
8990
}
9091

91-
int send(const rgw_pubsub_s3_event& event, optional_yield y) override {
92+
int send(const DoutPrefixProvider* dpp, const rgw_pubsub_s3_event& event,
93+
optional_yield y) override {
9294
std::shared_lock lock(s_http_manager_mutex);
9395
if (!s_http_manager) {
9496
ldout(cct, 1) << "ERROR: send failed. http endpoint manager not running" << dendl;
@@ -114,7 +116,7 @@ class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
114116
if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
115117
auto rc = s_http_manager->add_request(&request);
116118
if (rc == 0) {
117-
rc = request.wait(y);
119+
rc = request.wait(dpp, y);
118120
}
119121
if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
120122
// TODO: use read_bl to process return code and handle according to ack level
@@ -144,7 +146,7 @@ class Waiter {
144146
mutable std::condition_variable cond;
145147

146148
public:
147-
int wait(optional_yield y) {
149+
int wait(const DoutPrefixProvider* dpp, optional_yield y) {
148150
std::unique_lock l{lock};
149151
if (done) {
150152
return ret;
@@ -160,6 +162,8 @@ class Waiter {
160162
}, token, yield.get_executor());
161163
return -ec.value();
162164
}
165+
maybe_warn_about_blocking(dpp);
166+
163167
cond.wait(l, [this]{return (done==true);});
164168
return ret;
165169
}
@@ -247,7 +251,7 @@ class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
247251
}
248252
}
249253

250-
int send(const rgw_pubsub_s3_event& event, optional_yield y) override {
254+
int send(const DoutPrefixProvider* dpp, const rgw_pubsub_s3_event& event, optional_yield y) override {
251255
if (ack_level == ack_level_t::None) {
252256
return amqp::publish(conn_id, topic, json_format_pubsub_event(event));
253257
} else {
@@ -262,7 +266,7 @@ class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
262266
// failed to publish, does not wait for reply
263267
return rc;
264268
}
265-
return w->wait(y);
269+
return w->wait(dpp, y);
266270
}
267271
}
268272

@@ -320,7 +324,8 @@ class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
320324
}
321325
}
322326

323-
int send(const rgw_pubsub_s3_event& event, optional_yield y) override {
327+
int send(const DoutPrefixProvider* dpp, const rgw_pubsub_s3_event& event,
328+
optional_yield y) override {
324329
if (ack_level == ack_level_t::None) {
325330
return kafka::publish(conn_id, topic, json_format_pubsub_event(event));
326331
} else {
@@ -332,7 +337,7 @@ class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
332337
// failed to publish, does not wait for reply
333338
return rc;
334339
}
335-
return w->wait(y);
340+
return w->wait(dpp, y);
336341
}
337342
}
338343

src/rgw/driver/rados/rgw_pubsub_push.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "include/common_fwd.h"
99
#include "common/async/yield_context.h"
1010

11+
class DoutPrefixProvider;
1112
class RGWHTTPArgs;
1213
struct rgw_pubsub_s3_event;
1314

@@ -28,7 +29,9 @@ class RGWPubSubEndpoint {
2829

2930
// this method is used in order to send notification and wait for completion
3031
// in async manner via a coroutine when invoked in the frontend environment
31-
virtual int send(const rgw_pubsub_s3_event& event, optional_yield y) = 0;
32+
virtual int send(const DoutPrefixProvider* dpp,
33+
const rgw_pubsub_s3_event& event,
34+
optional_yield y) = 0;
3235

3336
// present as string
3437
virtual std::string to_str() const = 0;

src/rgw/driver/rados/rgw_rados.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4084,7 +4084,7 @@ int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
40844084
return ret;
40854085
}
40864086

4087-
ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
4087+
ret = conn->complete_request(dpp, in_stream_req, nullptr, &set_mtime, psize,
40884088
nullptr, pheaders, y);
40894089
if (ret < 0) {
40904090
if (ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) {
@@ -4319,7 +4319,7 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
43194319
goto set_err_state;
43204320
}
43214321

4322-
ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
4322+
ret = conn->complete_request(rctx.dpp, in_stream_req, &etag, &set_mtime,
43234323
&accounted_size, nullptr, nullptr, rctx.y);
43244324
if (ret < 0) {
43254325
if (ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) {
@@ -4580,7 +4580,7 @@ int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
45804580
return ret;
45814581
}
45824582

4583-
ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, y);
4583+
ret = rest_master_conn->complete_request(dpp, out_stream_req, etag, mtime, y);
45844584
if (ret < 0) {
45854585
if (ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) {
45864586
ldpp_dout(dpp, 20) << __func__ << "(): failed to put_obj_async_init. retries=" << tries << dendl;
@@ -7792,7 +7792,7 @@ int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
77927792
} // if taking of lock succeeded
77937793
} // block to encapsulate recovery from incomplete reshard
77947794

7795-
ret = reshard_wait->wait(y);
7795+
ret = reshard_wait->wait(dpp, y);
77967796
if (ret < 0) {
77977797
ldpp_dout(dpp, 0) << __func__ <<
77987798
" ERROR: bucket is still resharding, please retry" << dendl;

src/rgw/driver/rados/rgw_reshard.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "rgw_zone.h"
99
#include "driver/rados/rgw_bucket.h"
10+
#include "rgw_asio_thread.h"
1011
#include "rgw_reshard.h"
1112
#include "rgw_sal.h"
1213
#include "rgw_sal_rados.h"
@@ -1261,7 +1262,7 @@ int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const str
12611262
return 0;
12621263
}
12631264

1264-
int RGWReshardWait::wait(optional_yield y)
1265+
int RGWReshardWait::wait(const DoutPrefixProvider* dpp, optional_yield y)
12651266
{
12661267
std::unique_lock lock(mutex);
12671268

@@ -1285,6 +1286,7 @@ int RGWReshardWait::wait(optional_yield y)
12851286
waiters.erase(waiters.iterator_to(waiter));
12861287
return -ec.value();
12871288
}
1289+
maybe_warn_about_blocking(dpp);
12881290

12891291
cond.wait_for(lock, duration);
12901292

src/rgw/driver/rados/rgw_reshard.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ class RGWReshardWait {
266266
~RGWReshardWait() {
267267
ceph_assert(going_down);
268268
}
269-
int wait(optional_yield y);
269+
int wait(const DoutPrefixProvider* dpp, optional_yield y);
270270
// unblock any threads waiting on reshard
271271
void stop();
272272
};

0 commit comments

Comments
 (0)