Skip to content

Commit 081bda7

Browse files
committed
rgw/dedup: full object dedup
Design Document: https://docs.google.com/document/d/152VyCTR2NlZ6ongbe6-CJfP4qxr1_zH83FB_WukWD7c Signed-off-by: Gabriel BenHanokh <[email protected]>
1 parent 46aa467 commit 081bda7

31 files changed

+9750
-5
lines changed

doc/radosgw/s3_objects_dedup.rst

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
======================
2+
Full RGW Object Dedup:
3+
======================
4+
Add a radosgw-admin command to collect and report deduplication stats
5+
6+
.. note:: This utility doesn’t perform dedup and doesn’t make any
7+
change to the existing system and will only collect
8+
statistics and report them.
9+
10+
----
11+
12+
***************
13+
Admin commands:
14+
***************
15+
- ``radosgw-admin dedup stats``:
16+
Collects & displays last dedup statistics
17+
- ``radosgw-admin dedup pause``:
18+
Pauses active dedup session (dedup resources are not released)
19+
- ``radosgw-admin dedup resume``:
20+
Resumes a paused dedup session
21+
- ``radosgw-admin dedup abort``:
22+
Aborts active dedup session and releases all resources used by it
23+
- ``radosgw-admin dedup estimate``:
24+
Starts a new dedup estimate session (aborting any existing session first)
25+
26+
----
27+
28+
****************
29+
Skipped Objects:
30+
****************
31+
Dedup Estimate skips the following objects:
32+
33+
- Objects smaller than 4MB (unless they are multipart)
34+
- Objects with different placement rules
35+
- Objects with different pools
36+
- Objects with different storage-classes
37+
38+
The Dedup process itself (which will be released later) will also skip
39+
**compressed** and **user-encrypted** objects, but the estimate
40+
process will accept them (since we don't have access to that
41+
information during the estimate process)
42+
43+
----
44+
45+
********************
46+
Estimate Processing:
47+
********************
48+
The Dedup Estimate process collects all the needed information directly from
49+
the bucket-indices reading one full bucket-index object with 1000's of
50+
entries at a time.
51+
52+
The Bucket-Indices objects are sharded between the participating
53+
members so every bucket-index object is read exactly one time.
54+
The sharding allows processing to scale almost linearly, splitting the
55+
load evenly between the participating members.
56+
57+
The Dedup Estimate process does not access the objects themselves
58+
(data/metadata) which means its processing time won't be affected by
59+
the underlying media storing the objects (SSD/HDD) since the bucket-indices are
60+
virtually always stored on a fast medium (SSD with heavy memory
61+
caching)
62+
63+
----
64+
65+
*************
66+
Memory Usage:
67+
*************
68+
+---------------+------------+
69+
| RGW Obj Count | Memory     |
70+
+===============+============+
71+
| | ____1M      | | ___8MB   |
72+
| | ____4M      | | __16MB   |
73+
| | ___16M      | | __32MB   |
74+
| | ___64M      | | __64MB   |
75+
| | __256M      | | _128MB   |
76+
| | _1024M( 1G) | | _256MB   |
77+
| | _4096M( 4G) | | _512MB   |
78+
| | 16384M(16G) | | 1024MB   |
79+
+---------------+------------+

src/common/subsys.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ SUBSYS(seastore_device, 0, 5)
104104
SUBSYS(seastore_backref, 0, 5)
105105
SUBSYS(alienstore, 0, 5)
106106
SUBSYS(mclock, 1, 5)
107+
SUBSYS(rgw_dedup, 1, 5)
107108
SUBSYS(cyanstore, 0, 5)
108109
SUBSYS(ceph_exporter, 1, 5)
109110
SUBSYS(memstore, 1, 5)

src/rgw/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ set(librgw_common_srcs
159159
rgw_bucket_encryption.cc
160160
rgw_tracer.cc
161161
rgw_lua_background.cc
162+
rgw_dedup.cc
163+
rgw_dedup_table.cc
164+
rgw_dedup_store.cc
165+
rgw_dedup_utils.cc
166+
rgw_dedup_cluster.cc
162167
rgw_data_access.cc
163168
rgw_realm_watcher.cc
164169
driver/rados/account.cc

src/rgw/radosgw-admin/radosgw-admin.cc

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ extern "C" {
4848

4949
#include "radosgw-admin/orphan.h"
5050
#include "radosgw-admin/sync_checkpoint.h"
51-
5251
#include "rgw_user.h"
5352
#include "rgw_otp.h"
5453
#include "rgw_rados.h"
@@ -78,7 +77,7 @@ extern "C" {
7877
#include "rgw_data_access.h"
7978
#include "rgw_account.h"
8079
#include "rgw_bucket_logging.h"
81-
80+
#include "rgw_dedup_cluster.h"
8281
#include "services/svc_sync_modules.h"
8382
#include "services/svc_cls.h"
8483
#include "services/svc_bilog_rados.h"
@@ -151,6 +150,12 @@ void usage()
151150
cout << " user policy list attached list attached managed policies\n";
152151
cout << " caps add add user capabilities\n";
153152
cout << " caps rm remove user capabilities\n";
153+
cout << " dedup stats Display dedup statistics from the last run\n";
154+
cout << " dedup estimate Runs dedup in estimate mode (no changes will be made)\n";
155+
cout << " dedup restart Restart dedup\n";
156+
cout << " dedup abort Abort dedup\n";
157+
cout << " dedup pause Pause dedup\n";
158+
cout << " dedup resume Resume paused dedup\n";
154159
cout << " subuser create create a new subuser\n" ;
155160
cout << " subuser modify modify subuser\n";
156161
cout << " subuser rm remove subuser\n";
@@ -742,6 +747,12 @@ enum class OPT {
742747
QUOTA_SET,
743748
QUOTA_ENABLE,
744749
QUOTA_DISABLE,
750+
DEDUP_STATS,
751+
DEDUP_ESTIMATE,
752+
DEDUP_ABORT,
753+
DEDUP_RESTART,
754+
DEDUP_PAUSE,
755+
DEDUP_RESUME,
745756
GC_LIST,
746757
GC_PROCESS,
747758
LC_LIST,
@@ -989,6 +1000,12 @@ static SimpleCmd::Commands all_cmds = {
9891000
{ "ratelimit set", OPT::RATELIMIT_SET },
9901001
{ "ratelimit enable", OPT::RATELIMIT_ENABLE },
9911002
{ "ratelimit disable", OPT::RATELIMIT_DISABLE },
1003+
{ "dedup stats", OPT::DEDUP_STATS },
1004+
{ "dedup estimate", OPT::DEDUP_ESTIMATE },
1005+
{ "dedup abort", OPT::DEDUP_ABORT },
1006+
{ "dedup restart", OPT::DEDUP_RESTART },
1007+
{ "dedup pause", OPT::DEDUP_PAUSE },
1008+
{ "dedup resume", OPT::DEDUP_RESUME },
9921009
{ "gc list", OPT::GC_LIST },
9931010
{ "gc process", OPT::GC_PROCESS },
9941011
{ "lc list", OPT::LC_LIST },
@@ -4509,6 +4526,12 @@ int main(int argc, const char **argv)
45094526
OPT::BI_LIST,
45104527
OPT::OLH_GET,
45114528
OPT::OLH_READLOG,
4529+
OPT::DEDUP_STATS,
4530+
OPT::DEDUP_ESTIMATE,
4531+
OPT::DEDUP_ABORT, // TBD - not READ-ONLY
4532+
OPT::DEDUP_RESTART, // TBD - not READ-ONLY
4533+
OPT::DEDUP_PAUSE,
4534+
OPT::DEDUP_RESUME,
45124535
OPT::GC_LIST,
45134536
OPT::LC_LIST,
45144537
OPT::ORPHANS_LIST_JOBS,
@@ -9168,6 +9191,71 @@ int main(int argc, const char **argv)
91689191
}
91699192
}
91709193

9194+
if (opt_cmd == OPT::DEDUP_STATS ||
9195+
opt_cmd == OPT::DEDUP_ESTIMATE ||
9196+
opt_cmd == OPT::DEDUP_ABORT ||
9197+
opt_cmd == OPT::DEDUP_PAUSE ||
9198+
opt_cmd == OPT::DEDUP_RESUME ||
9199+
opt_cmd == OPT::DEDUP_RESTART) {
9200+
9201+
using namespace rgw::dedup;
9202+
rgw::sal::RadosStore *store = dynamic_cast<rgw::sal::RadosStore*>(driver);
9203+
if (!store) {
9204+
cerr << "ERROR: this command can only work when the cluster has a RADOS "
9205+
<< "backing store." << std::endl;
9206+
return EPERM;
9207+
}
9208+
9209+
if (opt_cmd == OPT::DEDUP_STATS) {
9210+
int ret = cluster::collect_all_shard_stats(store, formatter.get(), dpp());
9211+
if (ret == 0) {
9212+
formatter->flush(cout);
9213+
}
9214+
else {
9215+
cerr << "ERROR: Failed reading stat counters" << std::endl;
9216+
}
9217+
return ret;
9218+
}
9219+
9220+
if (opt_cmd == OPT::DEDUP_ABORT || opt_cmd == OPT::DEDUP_PAUSE || opt_cmd == OPT::DEDUP_RESUME) {
9221+
urgent_msg_t urgent_msg;
9222+
if (opt_cmd == OPT::DEDUP_ABORT) {
9223+
urgent_msg = URGENT_MSG_ABORT;
9224+
}
9225+
else if (opt_cmd == OPT::DEDUP_PAUSE) {
9226+
urgent_msg = URGENT_MSG_PASUE;
9227+
}
9228+
else {
9229+
urgent_msg = URGENT_MSG_RESUME;
9230+
}
9231+
return cluster::dedup_control(store, dpp(), urgent_msg);
9232+
}
9233+
9234+
if (opt_cmd == OPT::DEDUP_RESTART || opt_cmd == OPT::DEDUP_ESTIMATE) {
9235+
dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE;
9236+
if (opt_cmd == OPT::DEDUP_ESTIMATE) {
9237+
dedup_type = dedup_req_type_t::DEDUP_TYPE_ESTIMATE;
9238+
}
9239+
else {
9240+
dedup_type = dedup_req_type_t::DEDUP_TYPE_FULL;
9241+
#ifndef FULL_DEDUP_SUPPORT
9242+
std::cerr << "Only dedup estimate is supported!" << std::endl;
9243+
return EPERM;
9244+
#endif
9245+
}
9246+
9247+
int ret = cluster::dedup_restart_scan(store, dedup_type, dpp());
9248+
if (ret == 0) {
9249+
std::cout << "Dedup was restarted successfully" << std::endl;
9250+
}
9251+
else {
9252+
std::cerr << "Dedup failed to restart" << std::endl;
9253+
std::cerr << "Error is: " << ret << "::" << cpp_strerror(ret) << std::endl;
9254+
}
9255+
return ret;
9256+
}
9257+
}
9258+
91719259
if (opt_cmd == OPT::GC_LIST) {
91729260
int index = 0;
91739261
bool truncated;

src/rgw/rgw_appmain.cc

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
#include "rgw_asio_frontend.h"
6767
#include "rgw_dmclock_scheduler_ctx.h"
6868
#include "rgw_lua.h"
69+
#include "rgw_dedup.h"
6970
#ifdef WITH_RADOSGW_DBSTORE
7071
#include "rgw_sal_dbstore.h"
7172
#endif
@@ -534,7 +535,9 @@ int rgw::AppMain::init_frontends2(RGWLib* rgwlib)
534535
if (env.lua.background) {
535536
rgw_pauser->add_pauser(env.lua.background);
536537
}
537-
538+
if (dedup_background) {
539+
rgw_pauser->add_pauser(dedup_background.get());
540+
}
538541
need_context_pool();
539542
reloader = std::make_unique<RGWRealmReloader>(
540543
env, *implicit_tenant_context, service_map_meta, rgw_pauser.get(), *context_pool);
@@ -558,7 +561,6 @@ void rgw::AppMain::init_lua()
558561
rgw::sal::Driver* driver = env.driver;
559562
int r{0};
560563
std::string install_dir;
561-
562564
#ifdef WITH_RADOSGW_LUA_PACKAGES
563565
rgw::lua::packages_t failed_packages;
564566
r = rgw::lua::install_packages(dpp, driver, null_yield, g_conf().get_val<std::string>("rgw_luarocks_location"),
@@ -583,6 +585,21 @@ void rgw::AppMain::init_lua()
583585
}
584586
} /* init_lua */
585587

588+
void rgw::AppMain::init_dedup()
589+
{
590+
rgw::sal::Driver* driver = env.driver;
591+
if (driver->get_name() == "rados") { /* Supported for only RadosStore */
592+
try {
593+
dedup_background = std::make_unique<rgw::dedup::Background>(driver, dpp->get_cct());
594+
dedup_background->start();
595+
dedup_background->watch_reload(dpp);
596+
}
597+
catch (const std::runtime_error&) {
598+
ldpp_dout(dpp, 0) << __func__ << "::failed create dedup background job" << dendl;
599+
}
600+
}
601+
}
602+
586603
void rgw::AppMain::shutdown(std::function<void(void)> finalize_async_signals)
587604
{
588605
// stop the realm reloader
@@ -596,6 +613,10 @@ void rgw::AppMain::shutdown(std::function<void(void)> finalize_async_signals)
596613
if (g_conf().get_val<bool>("rgw_lua_enable"))
597614
static_cast<rgw::sal::RadosLuaManager*>(env.lua.manager.get())->
598615
unwatch_reload(dpp);
616+
617+
if (dedup_background) {
618+
dedup_background->unwatch_reload(dpp);
619+
}
599620
}
600621

601622
for (auto& fe : fes) {
@@ -605,6 +626,10 @@ void rgw::AppMain::shutdown(std::function<void(void)> finalize_async_signals)
605626
ldh.reset(nullptr); // deletes ldap helper if it was created
606627
rgw_log_usage_finalize();
607628

629+
if (dedup_background) {
630+
dedup_background->shutdown();
631+
}
632+
608633
if (lua_background) {
609634
lua_background->shutdown();
610635
}

src/rgw/rgw_common.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ using ceph::crypto::MD5;
8787
#define RGW_ATTR_LC RGW_ATTR_PREFIX "lc"
8888
#define RGW_ATTR_CORS RGW_ATTR_PREFIX "cors"
8989
#define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag"
90-
#define RGW_ATTR_CKSUM RGW_ATTR_PREFIX "cksum"
90+
#define RGW_ATTR_CKSUM RGW_ATTR_PREFIX "cksum"
91+
#define RGW_ATTR_SHA256 RGW_ATTR_PREFIX "x-amz-content-sha256"
9192
#define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets"
9293
#define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX
9394
#define RGW_ATTR_CONTENT_TYPE RGW_ATTR_PREFIX "content_type"
@@ -102,6 +103,7 @@ using ceph::crypto::MD5;
102103
#define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name"
103104
#define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest"
104105
#define RGW_ATTR_USER_MANIFEST RGW_ATTR_PREFIX "user_manifest"
106+
#define RGW_ATTR_SHARE_MANIFEST RGW_ATTR_PREFIX "shared_manifest"
105107
#define RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION RGW_ATTR_PREFIX RGW_AMZ_WEBSITE_REDIRECT_LOCATION
106108
#define RGW_ATTR_SLO_MANIFEST RGW_ATTR_PREFIX "slo_manifest"
107109
/* Information whether an object is SLO or not must be exposed to

0 commit comments

Comments
 (0)