Commit c907601

Merge pull request ceph#64933 from soumyakoduri/wip-skoduri-restore-progress

rgw/restore: Persistently store the restore state for cloud-s3 tier

Reviewed-by: Jiffin Tony Thottan <[email protected]>
Reviewed-by: Matt Benjamin <[email protected]>

2 parents 3f68e70 + 0f98740

File tree: 7 files changed, +50 -49 lines

qa/suites/rgw/cloud-transition/overrides.yaml (2 additions, 0 deletions)

@@ -5,13 +5,15 @@ overrides:
         setuser: ceph
         setgroup: ceph
         debug rgw: 20
+        debug rgw_restore: 20
         rgw crypt s3 kms backend: testing
         rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
         rgw crypt require ssl: false
         rgw sts key: abcdefghijklmnop
         rgw s3 auth use sts: true
         rgw lc debug interval: 10
         rgw_restore_debug_interval: 20
+        rgw_restore_processor_period: 10
   rgw:
     storage classes:
       LUKEWARM:
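Both additions are test-only tuning for the new restore path: debug rgw_restore raises the log level of the restore subsystem, while rgw_restore_processor_period, alongside the existing rgw_restore_debug_interval and rgw lc debug interval, appears to shorten the restore worker's cycle so that teuthology runs complete quickly.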

qa/suites/rgw/cloud-transition/tasks/restore/cloud_restore_s3tests.yaml (2 additions, 0 deletions)

@@ -7,6 +7,7 @@ tasks:
     client.1:
       port: 8001
 - rgw-cloudtier:
+    client.1:
     client.0:
       cloud_storage_class: CLOUDTIER-CLIENT0
       cloud_client: client.1
@@ -27,5 +28,6 @@ tasks:
       extra_attrs: ["cloud_restore"]
       lc_debug_interval: 10
       rgw_restore_debug_interval: 20
+      rgw_restore_processor_period: 10
       lifecycle_tests: True
       cloudtier_tests: True

qa/suites/rgw/cloud-transition/tasks/transition/cloud_transition_s3tests.yaml (1 addition, 0 deletions)

@@ -11,6 +11,7 @@ tasks:
     #client.3:
       #port: 8003
 - rgw-cloudtier:
+    client.1:
     client.0:
       # cloudtier storage class params
       # retain_head_object = false
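In both task files, client.1 is now registered under rgw-cloudtier with no settings of its own; since the rgw_cloudtier.py change below restarts every registered client's radosgw, this ensures the cloud endpoint's daemon gets restarted too.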

qa/tasks/rgw_cloudtier.py (7 additions, 8 deletions)

@@ -54,8 +54,6 @@ def setup(self):
         clients = self.config.keys()  # http://tracker.ceph.com/issues/20417
         for client in clients:
             client_config = self.config.get(client)
-            if client_config is None:
-                client_config = {}
 
             if client_config is not None:
                 log.info('client %s - cloudtier config is -----------------%s ', client, client_config)
@@ -118,13 +116,14 @@ def setup(self):
 
         log.info('Finished Configuring rgw cloudtier ...')
 
-        cluster_name, daemon_type, client_id = teuthology.split_role(client)
-        client_with_id = daemon_type + '.' + client_id
-        self.ctx.daemons.get_daemon('rgw', client_with_id, cluster_name).restart()
-        log.info('restarted rgw daemon ...')
+        for client in clients:
+            cluster_name, daemon_type, client_id = teuthology.split_role(client)
+            client_with_id = daemon_type + '.' + client_id
+            self.ctx.daemons.get_daemon('rgw', client_with_id, cluster_name).restart()
+            log.info('restarted rgw daemon ...')
 
-        (remote,) = self.ctx.cluster.only(client).remotes.keys()
-        wait_for_radosgw(endpoint.url(), remote)
+            (remote,) = self.ctx.cluster.only(client).remotes.keys()
+            wait_for_radosgw(endpoint.url(), remote)
 
 
 task = RGWCloudTier
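The restart logic is now a pass of its own: all cloud-tier configuration is written first, then every registered client's radosgw is restarted and waited on via wait_for_radosgw(). As the removed lines stood, the restart block ran once after the configuration loop and so touched only the last client processed. Dropping the None-to-{} default also means a client registered without settings (such as client.1) skips the configuration branch entirely but is still restarted.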

qa/tasks/s3tests.py (4 additions, 0 deletions)

@@ -372,6 +372,10 @@ def configure(ctx, config):
         if rgw_restore_debug_interval:
             s3tests_conf['s3 main']['rgw_restore_debug_interval'] = rgw_restore_debug_interval
 
+        rgw_restore_processor_period = properties.get('rgw_restore_processor_period')
+        if rgw_restore_processor_period:
+            s3tests_conf['s3 main']['rgw_restore_processor_period'] = rgw_restore_processor_period
+
         if ctx.rgw_cloudtier is not None:
             log.info(' ctx.rgw_cloudtier config is %s ...', ctx.rgw_cloudtier.config)
             client_rgw_config = ctx.rgw_cloudtier.config.get(client)
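This mirrors the rgw_restore_debug_interval plumbing just above it: the task-level property is forwarded into the 's3 main' section of the generated s3-tests configuration, presumably so the restore tests can pace their waits to the server-side processor period.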

src/rgw/driver/rados/rgw_rados.cc (1 addition, 0 deletions)

@@ -5554,6 +5554,7 @@ int RGWRados::restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx,
     ret = rgw_cloud_tier_get_object(tier_ctx, false, headers,
                                     &set_mtime, etag, accounted_size,
                                     attrs, &cb);
+    in_progress = false;
   }
 
   if (ret < 0) {
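rgw_cloud_tier_get_object() is called synchronously on this path and its result is checked immediately below, so the fetch has either completed or failed by the time it returns; clearing in_progress here keeps the caller from recording a stale in-progress state for a restore that has in fact finished.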

src/rgw/rgw_restore.cc (33 additions, 41 deletions)

@@ -341,13 +341,14 @@ int Restore::process(int index, int max_secs, optional_yield y)
     ret = process_restore_entry(entry, y);
 
     if (!ret && entry.status == rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress) {
-	r_entries.push_back(entry);
+      r_entries.push_back(entry);
       ldpp_dout(this, 20) << __PRETTY_FUNCTION__ << ": re-pushing entry: '" << entry
                           << "' on shard:"
                           << obj_names[index] << dendl;
     }
 
-    if (ret < 0)
+    // Skip the entry of object/bucket which no longer exists
+    if (ret < 0 && (ret != -ENOENT))
       goto done;
 
     ///process all entries, trim and re-add
@@ -391,14 +392,16 @@
 int Restore::process_restore_entry(RestoreEntry& entry, optional_yield y)
 {
   int ret = 0;
-  bool in_progress = true;
   std::unique_ptr<rgw::sal::Bucket> bucket;
   std::unique_ptr<rgw::sal::Object> obj;
   std::unique_ptr<rgw::sal::PlacementTier> tier;
   std::optional<uint64_t> days = entry.days;
   rgw::sal::RGWRestoreStatus restore_status = rgw::sal::RGWRestoreStatus::None;
   rgw_placement_rule target_placement;
 
+  // mark in_progress as false if the entry is being processed first time
+  bool in_progress = ((entry.status == rgw::sal::RGWRestoreStatus::None) ? false : true);
+
   // Ensure its the same source zone processing temp entries as we do not
   // replicate temp restored copies
   if (days) { // temp copy
@@ -429,6 +432,7 @@ int Restore::process_restore_entry(RestoreEntry& entry, optional_yield y)
     return ret;
   }
 
+  ldpp_dout(this, 10) << "Restore:: Processing restore entry of object(" << obj->get_key() << ") entry: " << entry << dendl;
   target_placement.inherit_from(bucket->get_placement_rule());
 
   auto& attrs = obj->get_attrs();
@@ -439,6 +443,7 @@
     using ceph::decode;
     decode(restore_status, iter);
   }
+  // check if its still in Progress state
   if (restore_status != rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress) {
     ldpp_dout(this, 5) << __PRETTY_FUNCTION__ << ": Restore of object " << obj->get_key()
                        << " not in progress state" << dendl;
@@ -450,11 +455,13 @@
   attr_iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
   if (attr_iter != attrs.end()) {
     target_placement.storage_class = attr_iter->second.to_str();
+  } else {
+    ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": ERROR: Attr RGW_ATTR_STORAGE_CLASS not found for object: " << obj->get_key() << dendl;
   }
   ret = driver->get_zone()->get_zonegroup().get_placement_tier(target_placement, &tier);
 
   if (ret < 0) {
-    ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": ERROR: failed to fetch tier placement handle, ret = " << ret << dendl;
+    ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": ERROR: failed to fetch tier placement handle, target_placement = " << target_placement << ", for zonegroup = " << driver->get_zone()->get_zonegroup().get_name() << ", ret = " << ret << dendl;
     goto done;
   } else {
     ldpp_dout(this, 20) << __PRETTY_FUNCTION__ << ": getting tier placement handle"
@@ -625,52 +632,37 @@ int Restore::restore_obj_from_cloud(rgw::sal::Bucket* pbucket,
     return ret;
   }
 
-  // now go ahead with restoring object
-  bool in_progress = false;
-  ret = pobj->restore_obj_from_cloud(pbucket, tier, cct, days, in_progress, dpp, y);
+  // now add the entry to the restore list to be processed by Restore worker thread
+  // asynchronously
+  RestoreEntry entry;
+  entry.bucket = pbucket->get_key();
+  entry.obj_key = pobj->get_key();
+  // for first time mark status as None
+  entry.status = rgw::sal::RGWRestoreStatus::None;
+  entry.days = days;
+  entry.zone_id = driver->get_zone()->get_id();
+
+  ldpp_dout(this, 10) << "Restore:: Adding restore entry of object(" << pobj->get_key() << ") entry: " << entry << dendl;
+
+  int index = choose_oid(entry);
+  ldpp_dout(this, 10) << __PRETTY_FUNCTION__ << ": Adding restore entry of object(" << pobj->get_key() << ") entry: " << entry << ", to shard:" << obj_names[index] << dendl;
+
+  std::vector<rgw::restore::RestoreEntry> r_entries;
+  r_entries.push_back(entry);
+  ret = sal_restore->add_entries(this, y, index, r_entries);
 
   if (ret < 0) {
-    ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": ERROR: object " << pobj->get_key() << " fetching failed" << ret << dendl;
-    auto reset_ret = set_cloud_restore_status(this, pobj, y, rgw::sal::RGWRestoreStatus::RestoreFailed);
+    ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": ERROR: Adding restore entry of object(" << pobj->get_key() << ") failed" << ret << dendl;
 
+    auto reset_ret = set_cloud_restore_status(this, pobj, y, rgw::sal::RGWRestoreStatus::RestoreFailed);
     if (reset_ret < 0) {
-      ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": Setting restore status to RestoreFailed failed for object(" << pobj->get_key() << ") " << reset_ret << dendl;
+      ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": Setting restore status as RestoreFailed failed for object(" << pobj->get_key() << ") " << reset_ret << dendl;
     }
 
     return ret;
   }
 
-  if (in_progress) {
-    // add restore entry to the list
-    RestoreEntry entry;
-    entry.bucket = pbucket->get_key();
-    entry.obj_key = pobj->get_key();
-    entry.status = rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress;
-    entry.days = days;
-    entry.zone_id = driver->get_zone()->get_id();
-
-    ldpp_dout(this, 10) << "Restore:: Adding restore entry of object(" << pobj->get_key() << ") entry: " << entry << dendl;
-
-    int index = choose_oid(entry);
-    ldpp_dout(this, 10) << __PRETTY_FUNCTION__ << ": Adding restore entry of object(" << pobj->get_key() << ") entry: " << entry << ", to shard:" << obj_names[index] << dendl;
-
-    std::vector<rgw::restore::RestoreEntry> r_entries;
-    r_entries.push_back(entry);
-    ret = sal_restore->add_entries(this, y, index, r_entries);
-
-    if (ret < 0) {
-      ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": ERROR: Adding restore entry of object(" << pobj->get_key() << ") failed" << ret << dendl;
-
-      auto reset_ret = set_cloud_restore_status(this, pobj, y, rgw::sal::RGWRestoreStatus::RestoreFailed);
-      if (reset_ret < 0) {
-        ldpp_dout(this, -1) << __PRETTY_FUNCTION__ << ": Setting restore status as RestoreFailed failed for object(" << pobj->get_key() << ") " << reset_ret << dendl;
-      }
-
-      return ret;
-    }
-  }
-
-  ldpp_dout(this, 10) << __PRETTY_FUNCTION__ << ": Restore of object " << pobj->get_key() << (in_progress ? " is in progress" : " succeeded") << dendl;
+  ldpp_dout(this, 10) << __PRETTY_FUNCTION__ << ": Restore of object " << pobj->get_key() << " is in progress." << dendl;
   return ret;
 }
