Skip to content

Commit b5cbc46

Browse files
committed
Replace SHA256 with faster hash calculation using BLAKE3
Signed-off-by: Gabriel BenHanokh <[email protected]>
1 parent b2d2787 commit b5cbc46

File tree

12 files changed

+206
-194
lines changed

12 files changed

+206
-194
lines changed

doc/radosgw/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,4 @@ Cluster with one API and then retrieve that data with the other API.
9191
Metrics <metrics>
9292
UADK Acceleration for Compression <uadk-accel>
9393
Bucket Logging <bucket_logging>
94-
94+
Full Object Deduplication <s3_objects_dedup>

doc/radosgw/s3_objects_dedup.rst

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,28 @@
11
=====================
22
Full RGW Object Dedup
33
=====================
4-
Adds a ``radosgw-admin`` command to collect and report deduplication stats.
5-
6-
.. note:: This utility doesn't perform dedup and doesn't make any
7-
change to the existing system and will only collect
8-
statistics and report them.
4+
Adds ``radosgw-admin`` commands to remove duplicated RGW tail-objects and to collect and report deduplication stats.
95

106
**************
117
Admin commands
128
**************
13-
- ``radosgw-admin dedup stats``:
14-
Collects & displays last dedup statistics.
9+
- ``radosgw-admin dedup estimate``:
10+
Starts a new dedup estimate session (first aborting the existing session, if one exists).
11+
12+
It doesn't make any changes to the existing system; it only collects statistics and reports them.
13+
- ``radosgw-admin dedup restart --yes-i-really-mean-it``:
14+
Starts a new dedup session (first aborting the existing session, if one exists).
15+
It will perform a full dedup, finding duplicated tail-objects and removing them.
16+
17+
This command can lead to **data loss** and should not be used on production data!
1518
- ``radosgw-admin dedup pause``:
1619
Pauses an active dedup session (dedup resources are not released).
1720
- ``radosgw-admin dedup resume``:
1821
Resumes a paused dedup session.
1922
- ``radosgw-admin dedup abort``:
2023
Aborts an active dedup session and releases all resources used by it.
21-
- ``radosgw-admin dedup estimate``:
22-
Starts a new dedup estimate session (aborting first existing session if exists).
24+
- ``radosgw-admin dedup stats``:
25+
Collects & displays last dedup statistics.
2326

2427
***************
2528
Skipped Objects
@@ -31,10 +34,7 @@ Dedup Estimate process skips the following objects:
3134
- Objects with different pools.
3235
- Objects with different storage classes.
3336

34-
The dedup process itself (which will be released in a later Ceph release) will also skip
35-
**compressed** and **user-encrypted** objects, but the estimate
36-
process will accept them (since we don't have access to that
37-
information during the estimate process).
37+
The full dedup process skips all of the above, and it also skips **compressed** and **user-encrypted** objects.
3838

3939
*******************
4040
Estimate Processing
@@ -54,6 +54,24 @@ the underlying media storing the objects (SSD/HDD) since the bucket indices are
5454
virtually always stored on a fast medium (SSD with heavy memory
5555
caching).
5656

57+
*********************
58+
Full Dedup Processing
59+
*********************
60+
The Full Dedup process begins by constructing a dedup table from the bucket indices, similar to the estimate process above.
61+
62+
This table is then scanned linearly to purge objects without duplicates, leaving only dedup candidates.
63+
64+
Next, we iterate through these dedup candidate objects, reading their complete information from the object metadata (a per-object RADOS operation).
65+
During this step, we filter out **compressed** and **user-encrypted** objects.
66+
67+
Following this, we calculate a strong-hash of the object data, which involves a full-object read and is a resource-intensive operation.
68+
This strong-hash ensures that the dedup candidates are indeed perfect matches.
69+
If they are, we proceed with the deduplication:
70+
71+
- incrementing the reference count on the source tail-objects one by one.
72+
- copying the manifest from the source to the target.
73+
- removing all tail-objects on the target.
74+
5775
************
5876
Memory Usage
5977
************

src/rgw/radosgw-admin/radosgw-admin.cc

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ void usage()
152152
cout << " caps rm remove user capabilities\n";
153153
cout << " dedup stats Display dedup statistics from the last run\n";
154154
cout << " dedup estimate Runs dedup in estimate mode (no changes will be made)\n";
155-
cout << " dedup restart Restart dedup\n";
155+
cout << " dedup restart Restart dedup; must include --yes-i-really-mean-it to activate\n";
156156
cout << " dedup abort Abort dedup\n";
157157
cout << " dedup pause Pause dedup\n";
158158
cout << " dedup resume Resume paused dedup\n";
@@ -3652,7 +3652,6 @@ int main(int argc, const char **argv)
36523652
int skip_zero_entries = false; // log show
36533653
int purge_keys = false;
36543654
int yes_i_really_mean_it = false;
3655-
int tech_preview = false;
36563655
int delete_child_objects = false;
36573656
int fix = false;
36583657
int remove_bad = false;
@@ -4113,8 +4112,6 @@ int main(int argc, const char **argv)
41134112
// do nothing
41144113
} else if (ceph_argparse_binary_flag(args, i, &yes_i_really_mean_it, NULL, "--yes-i-really-mean-it", (char*)NULL)) {
41154114
// do nothing
4116-
} else if (ceph_argparse_binary_flag(args, i, &tech_preview, NULL, "--tech-preview", (char*)NULL)) {
4117-
// do nothing
41184115
} else if (ceph_argparse_binary_flag(args, i, &fix, NULL, "--fix", (char*)NULL)) {
41194116
// do nothing
41204117
} else if (ceph_argparse_binary_flag(args, i, &remove_bad, NULL, "--remove-bad", (char*)NULL)) {
@@ -9246,12 +9243,6 @@ int main(int argc, const char **argv)
92469243
<< std::endl;
92479244
return EINVAL;
92489245
}
9249-
if (!tech_preview) {
9250-
cerr << "Full Dedup is supplied as a tech-preview only and should not be used on production systems!\n"
9251-
<< "Please acknowledge that you understand this is a tech preview (requires --tech-preview)"
9252-
<< std::endl;
9253-
return EINVAL;
9254-
}
92559246
dedup_type = dedup_req_type_t::DEDUP_TYPE_FULL;
92569247
#ifndef FULL_DEDUP_SUPPORT
92579248
std::cerr << "Only dedup estimate is supported!" << std::endl;

src/rgw/rgw_common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ using ceph::crypto::MD5;
8989
#define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag"
9090
#define RGW_ATTR_CKSUM RGW_ATTR_PREFIX "cksum"
9191
#define RGW_ATTR_SHA256 RGW_ATTR_PREFIX "x-amz-content-sha256"
92+
#define RGW_ATTR_BLAKE3 RGW_ATTR_PREFIX "blake3"
9293
#define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets"
9394
#define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX
9495
#define RGW_ATTR_CONTENT_TYPE RGW_ATTR_PREFIX "content_type"

src/rgw/rgw_dedup.cc

Lines changed: 52 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -704,23 +704,23 @@ namespace rgw::dedup {
704704
//---------------------------------------------------------------------------
705705
static void init_cmp_pairs(const disk_record_t *p_rec,
706706
const bufferlist &etag_bl,
707-
bufferlist &sha256_bl, // OUT PARAM
707+
bufferlist &hash_bl, // OUT PARAM
708708
librados::ObjectWriteOperation *p_op)
709709
{
710710
p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl);
711711
// TBD: do we really need the secondary compare using the full manifest?
712712
// Can replace it with something cheaper like size/version?
713713
p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl);
714714

715-
// SHA has 256 bit splitted into multiple 64bit units
715+
// BLAKE3 hash is 256 bits, split into multiple 64-bit units
716716
const unsigned units = (256 / (sizeof(uint64_t)*8));
717717
static_assert(units == 4);
718718
for (unsigned i = 0; i < units; i++) {
719-
ceph::encode(p_rec->s.sha256[i], sha256_bl);
719+
ceph::encode(p_rec->s.hash[i], hash_bl);
720720
}
721721

722-
if (!p_rec->s.flags.sha256_calculated()) {
723-
p_op->cmpxattr(RGW_ATTR_SHA256, CEPH_OSD_CMPXATTR_OP_EQ, sha256_bl);
722+
if (!p_rec->s.flags.hash_calculated()) {
723+
p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl);
724724
}
725725
}
726726

@@ -755,17 +755,17 @@ namespace rgw::dedup {
755755
ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
756756
<< "::ETAG=" << etag_bl.to_str() << dendl;
757757

758-
bufferlist hash_bl, manifest_hash_bl, tgt_sha256_bl;
758+
bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl;
759759
crypto::digest<crypto::SHA1>(p_src_rec->manifest_bl).encode(hash_bl);
760760
// Use a shorter hash (64bit instead of 160bit)
761761
hash_bl.splice(0, 8, &manifest_hash_bl);
762762
librados::ObjectWriteOperation tgt_op;
763-
init_cmp_pairs(p_tgt_rec, etag_bl, tgt_sha256_bl, &tgt_op);
763+
init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
764764
tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
765765
tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
766-
if (p_tgt_rec->s.flags.sha256_calculated()) {
767-
tgt_op.setxattr(RGW_ATTR_SHA256, tgt_sha256_bl);
768-
p_stats->set_sha256_attrs++;
766+
if (p_tgt_rec->s.flags.hash_calculated()) {
767+
tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
768+
p_stats->set_hash_attrs++;
769769
}
770770

771771
std::string src_oid, tgt_oid;
@@ -800,13 +800,13 @@ namespace rgw::dedup {
800800
// disk-record (as require an expensive random-disk-write).
801801
// When deduping C we can trust the shared_manifest state in the table and
802802
// skip a redundant update to SRC object attribute
803-
bufferlist src_sha256_bl;
803+
bufferlist src_hash_bl;
804804
librados::ObjectWriteOperation src_op;
805-
init_cmp_pairs(p_src_rec, etag_bl, src_sha256_bl, &src_op);
805+
init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op);
806806
src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
807-
if (p_src_rec->s.flags.sha256_calculated()) {
808-
src_op.setxattr(RGW_ATTR_SHA256, src_sha256_bl);
809-
p_stats->set_sha256_attrs++;
807+
if (p_src_rec->s.flags.hash_calculated()) {
808+
src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl);
809+
p_stats->set_hash_attrs++;
810810
}
811811

812812
ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl;
@@ -824,57 +824,49 @@ namespace rgw::dedup {
824824
return ret;
825825
}
826826

827-
using ceph::crypto::SHA256;
828827
//---------------------------------------------------------------------------
829-
int Background::calc_object_sha256(const disk_record_t *p_rec, uint8_t *p_sha256)
828+
int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash)
830829
{
831-
ldpp_dout(dpp, 20) << __func__ << "::p_rec->obj_name=" << p_rec->obj_name << dendl;
832-
// Open questions -
833-
// 1) do we need the secret if so what is the correct one to use?
834-
// 2) are we passing the head/tail objects in the correct order?
830+
ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl;
835831
RGWObjManifest manifest;
836832
try {
837833
auto bl_iter = p_rec->manifest_bl.cbegin();
838834
decode(manifest, bl_iter);
839835
} catch (buffer::error& err) {
840-
ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl;
836+
ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest for: "
837+
<< p_rec->obj_name << dendl;
841838
return -EINVAL;
842839
}
843-
std::string oid;
844-
build_oid(p_rec->bucket_id, p_rec->obj_name, &oid);
845-
librados::IoCtx head_ioctx;
846-
const char *secret = "0555b35654ad1656d804f1b017cd26e9";
847-
TOPNSPC::crypto::HMACSHA256 hmac((const uint8_t*)secret, strlen(secret));
840+
841+
blake3_hasher hmac;
842+
blake3_hasher_init(&hmac);
848843
for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) {
849844
rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
850845
rgw_rados_ref obj;
851846
int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
852847
if (ret < 0) {
853-
ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for raw_obj="
854-
<< raw_obj << dendl;
848+
ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: "
849+
<< raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
855850
return ret;
856851
}
857852

858-
if (oid == raw_obj.oid) {
859-
ldpp_dout(dpp, 20) << __func__ << "::manifest: head object=" << oid << dendl;
860-
head_ioctx = obj.ioctx;
861-
}
862853
bufferlist bl;
863854
librados::IoCtx ioctx = obj.ioctx;
864855
// read full object
865856
ret = ioctx.read(raw_obj.oid, bl, 0, 0);
866857
if (ret > 0) {
867858
for (const auto& bptr : bl.buffers()) {
868-
hmac.Update((const unsigned char *)bptr.c_str(), bptr.length());
859+
blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length());
869860
}
870861
}
871862
else {
872-
ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << oid
863+
ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid
873864
<< ", error is " << cpp_strerror(-ret) << dendl;
874865
return ret;
875866
}
876867
}
877-
hmac.Final(p_sha256);
868+
869+
blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN);
878870
return 0;
879871
}
880872

@@ -977,33 +969,33 @@ namespace rgw::dedup {
977969
memset(&p_rec->s.shared_manifest, 0, sizeof(p_rec->s.shared_manifest));
978970
}
979971

980-
itr = attrs.find(RGW_ATTR_SHA256);
972+
itr = attrs.find(RGW_ATTR_BLAKE3);
981973
if (itr != attrs.end()) {
982974
try {
983975
auto bl_iter = itr->second.cbegin();
984-
// SHA has 256 bit splitted into multiple 64bit units
976+
// BLAKE3 hash is 256 bits, split into multiple 64-bit units
985977
const unsigned units = (256 / (sizeof(uint64_t)*8));
986978
static_assert(units == 4);
987979
for (unsigned i = 0; i < units; i++) {
988980
uint64_t val;
989981
ceph::decode(val, bl_iter);
990-
p_rec->s.sha256[i] = val;
982+
p_rec->s.hash[i] = val;
991983
}
992-
p_stats->valid_sha256_attrs++;
984+
p_stats->valid_hash_attrs++;
993985
return 0;
994986
} catch (buffer::error& err) {
995-
ldpp_dout(dpp, 1) << __func__ << "::ERR: failed SHA256 decode" << dendl;
987+
ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl;
996988
return -EINVAL;
997989
}
998990
}
999991

1000-
p_stats->invalid_sha256_attrs++;
992+
p_stats->invalid_hash_attrs++;
1001993
// TBD: redundant memset...
1002-
memset(p_rec->s.sha256, 0, sizeof(p_rec->s.sha256));
1003-
// CEPH_CRYPTO_HMACSHA256_DIGESTSIZE is 32 Bytes (32*8=256)
1004-
int ret = calc_object_sha256(p_rec, (uint8_t*)p_rec->s.sha256);
994+
memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash));
995+
// BLAKE3_OUT_LEN is 32 Bytes
996+
int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash);
1005997
if (ret == 0) {
1006-
p_rec->s.flags.set_sha256_calculated();
998+
p_rec->s.flags.set_hash_calculated();
1007999
}
10081000

10091001
return ret;
@@ -1177,18 +1169,18 @@ namespace rgw::dedup {
11771169
}
11781170

11791171
//---------------------------------------------------------------------------
1180-
static int write_sha256_object_attribute(const DoutPrefixProvider* const dpp,
1172+
static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp,
11811173
rgw::sal::Driver* driver,
11821174
RGWRados* rados,
11831175
const disk_record_t *p_rec)
11841176
{
11851177
bufferlist etag_bl;
1186-
bufferlist sha256_bl;
1178+
bufferlist hash_bl;
11871179
librados::ObjectWriteOperation op;
11881180
etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts,
11891181
&etag_bl);
1190-
init_cmp_pairs(p_rec, etag_bl, sha256_bl /*OUT PARAM*/, &op);
1191-
op.setxattr(RGW_ATTR_SHA256, sha256_bl);
1182+
init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
1183+
op.setxattr(RGW_ATTR_BLAKE3, hash_bl);
11921184

11931185
std::string oid;
11941186
librados::IoCtx ioctx;
@@ -1304,17 +1296,17 @@ namespace rgw::dedup {
13041296
return 0;
13051297
}
13061298

1307-
if (memcmp(src_rec.s.sha256, p_tgt_rec->s.sha256, sizeof(src_rec.s.sha256)) != 0) {
1308-
p_stats->sha256_mismatch++;
1309-
ldpp_dout(dpp, 10) << __func__ << "::SHA256 mismatch" << dendl;
1310-
// TBD: set sha256 attributes on head objects to save calc next time
1311-
if (src_rec.s.flags.sha256_calculated()) {
1312-
write_sha256_object_attribute(dpp, driver, rados, &src_rec);
1313-
p_stats->set_sha256_attrs++;
1299+
if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) {
1300+
p_stats->hash_mismatch++;
1301+
ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
1302+
// TBD: set hash attributes on head objects to save calc next time
1303+
if (src_rec.s.flags.hash_calculated()) {
1304+
write_blake3_object_attribute(dpp, driver, rados, &src_rec);
1305+
p_stats->set_hash_attrs++;
13141306
}
1315-
if (p_tgt_rec->s.flags.sha256_calculated()) {
1316-
write_sha256_object_attribute(dpp, driver, rados, p_tgt_rec);
1317-
p_stats->set_sha256_attrs++;
1307+
if (p_tgt_rec->s.flags.hash_calculated()) {
1308+
write_blake3_object_attribute(dpp, driver, rados, p_tgt_rec);
1309+
p_stats->set_hash_attrs++;
13181310
}
13191311
return 0;
13201312
}

src/rgw/rgw_dedup.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ namespace rgw::dedup {
179179
remapper_t *remapper);
180180

181181
#ifdef FULL_DEDUP_SUPPORT
182-
int calc_object_sha256(const disk_record_t *p_rec, uint8_t *p_sha256);
182+
int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash);
183183
int add_obj_attrs_to_record(rgw_bucket *p_rb,
184184
disk_record_t *p_rec,
185185
const rgw::sal::Attrs &attrs,

0 commit comments

Comments
 (0)