Commit f348f6c

Merge pull request ceph#64018 from 9401adarsh/wip-adashoka-71677

rgw/notifications: add support for sharded persistent bucket notifications

Reviewed-by: Anthony D'Atri <[email protected]>
Reviewed-by: Yuval Lifshitz <[email protected]>

2 parents: 6cb9df8 + 8c25005

File tree

12 files changed: +390 −111 lines

PendingReleaseNotes

Lines changed: 8 additions & 0 deletions

@@ -764,3 +764,11 @@ Relevant tracker: https://tracker.ceph.com/issues/64777
   when decoding, resulting in Linux codes on the wire, and host codes on the receiver.
   All CEPHFS_E* defines have been removed across Ceph (including the Python binding).
   Relevant tracker: https://tracker.ceph.com/issues/64611
+
+* RGW: Persistent bucket notifications now use queues with multiple shards instead of a
+  single queue. The number of shards can be configured using the `rgw` option
+  `rgw_bucket_persistent_notif_num_shards`. Note that pre-existing topics continue to
+  function as-is, i.e., they remain mapped to a single RADOS object.
+  For more details, see: https://docs.ceph.com/en/latest/radosgw/notifications/
+  Relevant tracker: https://tracker.ceph.com/issues/71677

doc/radosgw/notifications.rst

Lines changed: 12 additions & 0 deletions

@@ -85,6 +85,18 @@ which tells the client that it may retry later.
 .. tip:: To minimize the latency added by asynchronous notification, we
    recommend placing the "log" pool on fast media.

+Persistent bucket notifications are managed by the following central configuration options:
+
+.. confval:: rgw_bucket_persistent_notif_num_shards
+
+.. note:: When a topic is created during a Ceph upgrade, per-key reordering of
+   notifications may happen on any bucket mapped to that topic.
+
+.. note:: Persistent topics created on a radosgw that does not support sharding are
+   treated as single-shard topics.
+
+.. tip:: It is also recommended that you avoid modifying or deleting topics created
+   during upgrades, as doing so might leave orphaned RADOS objects that are not removed
+   when the topic is deleted.
+
 Topic Management via CLI
 ------------------------
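
Two practical notes on the option documented above. Because notifications are sharded by a
hash of the bucket and object key (see the rgw_notify.cc hunk below), all notifications for
a given key land on the same shard, so per-key ordering is preserved once a topic's shard
count is fixed; the reordering caveat applies only to topics created mid-upgrade. The
option itself is an ordinary centralized config knob, so an invocation along the lines of
`ceph config set client.rgw rgw_bucket_persistent_notif_num_shards 23` should apply it (an
illustrative command, not taken from this commit); per the option's long description, it
affects only topics created afterwards.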

src/common/options/rgw.yaml.in

Lines changed: 9 additions & 0 deletions

@@ -4514,3 +4514,12 @@ options:
   services:
   - rgw
   with_legacy: true
+- name: rgw_bucket_persistent_notif_num_shards
+  type: uint
+  level: advanced
+  desc: Number of shards for a persistent topic.
+  long_desc: Number of shards of persistent topics. Notifications are sharded by a
+    combination of the bucket and key name. Changing the number affects only new
+    topics and does not change existing ones.
+  default: 11
+  services:
+  - rgw

src/rgw/driver/rados/rgw_notify.cc

Lines changed: 63 additions & 29 deletions

@@ -1084,6 +1084,19 @@ static inline bool notification_match(reservation_t& res,
   return true;
 }

+static inline uint64_t get_target_shard(const DoutPrefixProvider* dpp, const std::string& bucket_name,
+                                        const std::string& object_key, const uint64_t num_shards) {
+  std::hash<std::string> hash_fn;
+  const std::string hash_key = fmt::format("{}:{}", bucket_name, object_key);
+  const size_t hash = hash_fn(hash_key);
+  ldpp_dout(dpp, 20) << "INFO: hash value: " << hash << ". hash key: " << bucket_name << ":" << object_key << dendl;
+  return hash % num_shards;
+}
+
+static inline std::string get_shard_name(const std::string& topic_name, const uint64_t& shard_id) {
+  return (shard_id == 0) ? topic_name : fmt::format("{}.{}", topic_name, shard_id);
+}
+
 int publish_reserve(const DoutPrefixProvider* dpp,
                     const SiteConfig& site,
                     const EventTypeList& event_types,
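
The two helpers above fully determine the sharding scheme: the shard index is a hash of
"<bucket>:<key>" modulo the shard count, and shard 0 keeps the bare queue name so a
pre-sharding topic is simply "shard 0". A minimal standalone sketch of the same scheme
(the topic and bucket names are illustrative, and std::hash is implementation-defined, so
the exact shard per key varies between builds):

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

#include <fmt/format.h>

// Same scheme as get_target_shard()/get_shard_name() above.
static uint64_t target_shard(const std::string& bucket, const std::string& key,
                             uint64_t num_shards) {
  return std::hash<std::string>{}(fmt::format("{}:{}", bucket, key)) % num_shards;
}

static std::string shard_name(const std::string& topic, uint64_t shard) {
  // Shard 0 keeps the bare topic queue name for backward compatibility.
  return shard == 0 ? topic : fmt::format("{}.{}", topic, shard);
}

int main() {
  constexpr uint64_t num_shards = 11;  // default of rgw_bucket_persistent_notif_num_shards
  // With 11 shards, topic "mytopic" is backed by the RADOS objects
  // "mytopic", "mytopic.1", ..., "mytopic.10".
  for (const std::string key : {"a.jpg", "b.jpg", "c.jpg"}) {
    std::cout << key << " -> "
              << shard_name("mytopic", target_shard("mybucket", key, num_shards)) << "\n";
  }
}

Because the hash input includes the object key, every event for the same bucket/key pair
lands on one shard, which is what preserves per-key ordering.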
@@ -1145,22 +1158,29 @@ int publish_reserve(const DoutPrefixProvider* dpp,
   }

   cls_2pc_reservation::id_t res_id = cls_2pc_reservation::NO_ID;
+  uint64_t target_shard = 0;
   if (topic_cfg.dest.persistent) {
     // TODO: take default reservation size from conf
     constexpr auto DEFAULT_RESERVATION = 4 * 1024U; // 4K
     res.size = DEFAULT_RESERVATION;
     librados::ObjectWriteOperation op;
     bufferlist obl;
     int rval;
-    const auto& queue_name = topic_cfg.dest.persistent_queue;
+    const std::string bucket_name = res.bucket->get_name();
+    const std::string object_key = res.object_name ? *res.object_name : res.object->get_name();
+    const uint64_t num_shards = topic_cfg.dest.num_shards;
+    target_shard = get_target_shard(
+        dpp, bucket_name, object_key, num_shards);
+    const auto shard_name = get_shard_name(topic_cfg.dest.persistent_queue, target_shard);
+    ldpp_dout(res.dpp, 1) << "INFO: target_shard: " << shard_name << dendl;
     cls_2pc_queue_reserve(op, res.size, 1, &obl, &rval);
     auto ret = rgw_rados_operate(
-        res.dpp, res.store->getRados()->get_notif_pool_ctx(), queue_name,
+        res.dpp, res.store->getRados()->get_notif_pool_ctx(), shard_name,
         std::move(op), res.yield, librados::OPERATION_RETURNVEC);
     if (ret < 0) {
       ldpp_dout(res.dpp, 1)
           << "ERROR: failed to reserve notification on queue: "
-          << queue_name << ". error: " << ret << dendl;
+          << shard_name << ". error: " << ret << dendl;
       // if no space is left in queue we ask client to slow down
       return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
     }
@@ -1173,7 +1193,7 @@ int publish_reserve(const DoutPrefixProvider* dpp,
       }
     }

-    res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id, event_type);
+    res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id, event_type, target_shard);
   }
 }
 return 0;
@@ -1209,25 +1229,27 @@ int publish_commit(rgw::sal::Object* obj,
       event_entry.retry_sleep_duration = topic.cfg.dest.retry_sleep_duration;
       bufferlist bl;
       encode(event_entry, bl);
-      const auto& queue_name = topic.cfg.dest.persistent_queue;
+      uint64_t target_shard = topic.shard_id;
+      const auto shard_name = get_shard_name(topic.cfg.dest.persistent_queue, target_shard);
+      ldpp_dout(res.dpp, 1) << "INFO: target_shard: " << shard_name << dendl;
       if (bl.length() > res.size) {
         // try to make a larger reservation, fail only if this is not possible
         ldpp_dout(dpp, 5) << "WARNING: committed size: " << bl.length()
                           << " exceeded reserved size: " << res.size
-                          << ". trying to make a larger reservation on queue: " << queue_name
+                          << ". trying to make a larger reservation on queue: " << shard_name
                           << dendl;
         // first cancel the existing reservation
         librados::ObjectWriteOperation op;
         cls_2pc_queue_abort(op, topic.res_id);
         auto ret = rgw_rados_operate(
             dpp, res.store->getRados()->get_notif_pool_ctx(),
-            queue_name, std::move(op),
+            shard_name, std::move(op),
             res.yield);
         if (ret < 0) {
           ldpp_dout(dpp, 1) << "ERROR: failed to abort reservation: " << topic.res_id
-                            << " when trying to make a larger reservation on queue: " << queue_name
+                            << " when trying to make a larger reservation on queue: " << shard_name
                             << ". error: " << ret << dendl;
           return ret;
         }
@@ -1238,10 +1260,10 @@ int publish_commit(rgw::sal::Object* obj,
         cls_2pc_queue_reserve(op, bl.length(), 1, &obl, &rval);
         ret = rgw_rados_operate(
             dpp, res.store->getRados()->get_notif_pool_ctx(),
-            queue_name, std::move(op), res.yield, librados::OPERATION_RETURNVEC);
+            shard_name, std::move(op), res.yield, librados::OPERATION_RETURNVEC);
         if (ret < 0) {
           ldpp_dout(dpp, 1) << "ERROR: failed to reserve extra space on queue: "
-                            << queue_name
+                            << shard_name
                             << ". error: " << ret << dendl;
           return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
         }
@@ -1256,12 +1278,12 @@ int publish_commit(rgw::sal::Object* obj,
       librados::ObjectWriteOperation op;
       cls_2pc_queue_commit(op, bl_data_vec, topic.res_id);
       topic.res_id = cls_2pc_reservation::NO_ID;
-      auto pcc_arg = make_unique<PublishCommitCompleteArg>(queue_name, dpp->get_cct());
+      auto pcc_arg = make_unique<PublishCommitCompleteArg>(shard_name, dpp->get_cct());
       aio_completion_ptr completion{librados::Rados::aio_create_completion(pcc_arg.get(), publish_commit_completion)};
       auto& io_ctx = res.store->getRados()->get_notif_pool_ctx();
-      if (const int ret = io_ctx.aio_operate(queue_name, completion.get(), &op); ret < 0) {
+      if (const int ret = io_ctx.aio_operate(shard_name, completion.get(), &op); ret < 0) {
         ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: "
-                          << queue_name << ". error: " << ret << dendl;
+                          << shard_name << ". error: " << ret << dendl;
         return ret;
       }
       // args will be released inside the callback
@@ -1304,16 +1326,18 @@ int publish_abort(reservation_t& res) {
       // nothing to abort or already committed/aborted
       continue;
     }
-    const auto& queue_name = topic.cfg.dest.persistent_queue;
+    uint64_t target_shard = topic.shard_id;
+    const auto shard_name = get_shard_name(topic.cfg.dest.persistent_queue, target_shard);
+    ldpp_dout(res.dpp, 1) << "INFO: target_shard: " << shard_name << dendl;
     librados::ObjectWriteOperation op;
     cls_2pc_queue_abort(op, topic.res_id);
     const auto ret = rgw_rados_operate(
         res.dpp, res.store->getRados()->get_notif_pool_ctx(),
-        queue_name, std::move(op), res.yield);
+        shard_name, std::move(op), res.yield);
     if (ret < 0) {
       ldpp_dout(res.dpp, 1) << "ERROR: failed to abort reservation: " << topic.res_id
-                            << " from queue: " << queue_name << ". error: " << ret << dendl;
+                            << " from queue: " << shard_name << ". error: " << ret << dendl;
       return ret;
     }
     topic.res_id = cls_2pc_reservation::NO_ID;
@@ -1322,23 +1346,33 @@ int publish_abort(reservation_t& res) {
   }
 }

 int get_persistent_queue_stats(const DoutPrefixProvider *dpp, librados::IoCtx &rados_ioctx,
-                               const std::string &queue_name, rgw_topic_stats &stats, optional_yield y)
+                               ShardNamesView shards, rgw_topic_stats &stats, optional_yield y)
 {
   // TODO: use optional_yield instead of calling rados_ioctx.operate() synchronously
   cls_2pc_reservations reservations;
-  auto ret = cls_2pc_queue_list_reservations(rados_ioctx, queue_name, reservations);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to read queue list reservation: " << ret << dendl;
-    return ret;
-  }
-  stats.queue_reservations = reservations.size();
-
-  ret = cls_2pc_queue_get_topic_stats(rados_ioctx, queue_name, stats.queue_entries, stats.queue_size);
-  if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: failed to get the queue size or the number of entries: " << ret << dendl;
-    return ret;
+  uint32_t shard_entries;
+  uint64_t shard_size;
+
+  stats.queue_reservations = 0;
+  stats.queue_size = 0;
+  stats.queue_entries = 0;
+  for (const auto& shard_name : shards) {
+    auto ret = cls_2pc_queue_list_reservations(rados_ioctx, shard_name, reservations);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "ERROR: failed to read the reservation list of shard: " << shard_name
+                        << ". error: " << ret << dendl;
+      return ret;
+    }
+    stats.queue_reservations += reservations.size();
+    shard_entries = 0;
+    shard_size = 0;
+    ret = cls_2pc_queue_get_topic_stats(rados_ioctx, shard_name, shard_entries, shard_size);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "ERROR: failed to get the size or number of entries for queue shard: "
+                        << shard_name << ". error: " << ret << dendl;
+      return ret;
+    }
+    stats.queue_size += shard_size;
+    stats.queue_entries += shard_entries;
   }
-
   return 0;
 }
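
The new stats signature takes a ShardNamesView rather than a single queue name.
ShardNamesView and rgw_pubsub_topic_dest::get_shard_names() are not shown in this
excerpt, so as an assumption for illustration only, a view over the shard object names
could be produced like this, consistent with get_shard_name() above:

#include <cstdint>
#include <string>
#include <vector>

#include <fmt/format.h>

// Hypothetical helper, not the committed ShardNamesView/get_shard_names():
// enumerate every shard object name of a persistent queue, following the
// naming rule of get_shard_name() -- shard 0 is the bare queue name.
std::vector<std::string> all_shard_names(const std::string& queue, uint64_t num_shards) {
  std::vector<std::string> names;
  names.reserve(num_shards);
  for (uint64_t i = 0; i < num_shards; ++i) {
    names.push_back(i == 0 ? queue : fmt::format("{}.{}", queue, i));
  }
  return names;
}

A pre-sharding topic then degenerates to the one-element list {queue}, which is how the
stats loop keeps producing the old single-queue numbers for legacy topics.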

src/rgw/driver/rados/rgw_notify.h

Lines changed: 5 additions & 3 deletions

@@ -47,17 +47,19 @@ struct reservation_t {
   struct topic_t {
     topic_t(const std::string& _configurationId, const rgw_pubsub_topic& _cfg,
             cls_2pc_reservation::id_t _res_id,
-            rgw::notify::EventType _event_type)
+            rgw::notify::EventType _event_type, uint64_t _shard_id)
       : configurationId(_configurationId),
         cfg(_cfg),
         res_id(_res_id),
-        event_type(_event_type) {}
+        event_type(_event_type),
+        shard_id(_shard_id) {}

     const std::string configurationId;
     const rgw_pubsub_topic cfg;
     // res_id is reset after topic is committed/aborted
     cls_2pc_reservation::id_t res_id;
     rgw::notify::EventType event_type;
+    uint64_t shard_id;
   };

   const DoutPrefixProvider* const dpp;
@@ -132,7 +134,7 @@ int publish_commit(rgw::sal::Object* obj,
 int publish_abort(reservation_t& reservation);

 int get_persistent_queue_stats(const DoutPrefixProvider *dpp, librados::IoCtx &rados_ioctx,
-                               const std::string &queue_name, rgw_topic_stats &stats, optional_yield y);
+                               ShardNamesView shards, rgw_topic_stats &stats, optional_yield y);

 }
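
The shard_id stored in topic_t is what lets publish_commit() and publish_abort() address
the same shard object that publish_reserve() picked: the index is computed once, at
reserve time, and travels with the reservation, so the 2PC reservation id is always
resolved against the queue shard on which it was created rather than being rehashed.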

src/rgw/radosgw-admin/radosgw-admin.cc

Lines changed: 36 additions & 32 deletions

@@ -11930,9 +11930,9 @@ int main(int argc, const char **argv)
     rgw::notify::rgw_topic_stats stats;
     ret = rgw::notify::get_persistent_queue_stats(
         dpp(), ioctx,
-        topic.dest.persistent_queue, stats, null_yield);
+        topic.dest.get_shard_names(), stats, null_yield);
     if (ret < 0) {
-      cerr << "ERROR: could not get persistent queue: " << cpp_strerror(-ret) << std::endl;
+      cerr << "ERROR: could not get persistent queues: " << cpp_strerror(-ret) << std::endl;
       return -ret;
     }
     encode_json("", stats, formatter.get());
@@ -11964,37 +11964,41 @@ int main(int argc, const char **argv)
     std::string end_marker;
     librados::ObjectReadOperation rop;
     std::vector<cls_queue_entry> queue_entries;
-    bool truncated = true;
+    bool truncated;
     formatter->open_array_section("eventEntries");
-    while (truncated) {
-      bufferlist bl;
-      int rc;
-      cls_2pc_queue_list_entries(rop, marker, max_entries, &bl, &rc);
-      ioctx.operate(topic.dest.persistent_queue, &rop, nullptr);
-      if (rc < 0 ) {
-        cerr << "ERROR: could not list entries from queue. error: " << cpp_strerror(-ret) << std::endl;
-        return -rc;
-      }
-      rc = cls_2pc_queue_list_entries_result(bl, queue_entries, &truncated, end_marker);
-      if (rc < 0) {
-        cerr << "ERROR: failed to parse list entries from queue (skipping). error: " << cpp_strerror(-ret) << std::endl;
-        return -rc;
-      }
-
-      std::for_each(queue_entries.cbegin(),
-                    queue_entries.cend(),
-                    [&formatter](const auto& queue_entry) {
-                      rgw::notify::event_entry_t event_entry;
-                      bufferlist::const_iterator iter{&queue_entry.data};
-                      try {
-                        event_entry.decode(iter);
-                        encode_json("", event_entry, formatter.get());
-                      } catch (const buffer::error& e) {
-                        cerr << "ERROR: failed to decode queue entry. error: " << e.what() << std::endl;
-                      }
-                    });
-      formatter->flush(cout);
-      marker = end_marker;
+
+    for (const auto& shard_name : topic.dest.get_shard_names()) {
+      truncated = true;
+      marker.clear();
+      while (truncated) {
+        bufferlist bl;
+        int rc;
+        cls_2pc_queue_list_entries(rop, marker, max_entries, &bl, &rc);
+        ioctx.operate(shard_name, &rop, nullptr);
+        if (rc < 0) {
+          cerr << "ERROR: could not list entries from queue. error: " << cpp_strerror(-rc) << std::endl;
+          return -rc;
+        }
+        rc = cls_2pc_queue_list_entries_result(bl, queue_entries, &truncated, end_marker);
+        if (rc < 0) {
+          cerr << "ERROR: failed to parse list entries from queue (skipping). error: " << cpp_strerror(-rc) << std::endl;
+          return -rc;
+        }
+        std::for_each(queue_entries.cbegin(),
+                      queue_entries.cend(),
+                      [&formatter](const auto& queue_entry) {
+                        rgw::notify::event_entry_t event_entry;
+                        bufferlist::const_iterator iter{&queue_entry.data};
+                        try {
+                          event_entry.decode(iter);
+                          encode_json("", event_entry, formatter.get());
+                        } catch (const buffer::error& e) {
+                          cerr << "ERROR: failed to decode queue entry. error: " << e.what() << std::endl;
+                        }
+                      });
+        formatter->flush(cout);
+        marker = end_marker;
+      }
     }
     formatter->close_section();
     formatter->flush(cout);
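
Operationally, the two paths above back radosgw-admin's topic statistics and event dump
output; with sharding, the reported counters and the dumped "eventEntries" array are now
aggregated across every shard queue of the topic. As an illustrative invocation
(subcommand spelling not verified against this exact build), `radosgw-admin topic stats --topic <name>`
would report queue_entries, queue_size, and queue_reservations summed over all shards.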
