
Commit 38578de

Merge pull request ceph#63873 from aainscow/ec_fixpack_pr
osd: Multiple fixes to optimized EC and peering
2 parents: 0d76061 + 71c9154

31 files changed: +1216 / -551 lines

src/crimson/osd/pg.h

Lines changed: 6 additions & 2 deletions
@@ -478,9 +478,13 @@ class PG : public boost::intrusive_ref_counter<
     void trim(const pg_log_entry_t &entry) override {
       // TODO
     }
-    void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
+    void partial_write(pg_info_t *info,
+                       eversion_t previous_version,
+                       const pg_log_entry_t &entry
+                       ) override {
       // TODO
-      ceph_assert(entry.written_shards.empty() && info->partial_writes_last_complete.empty());
+      ceph_assert(entry.written_shards.empty() &&
+                  info->partial_writes_last_complete.empty());
     }
   };
   PGLog::LogEntryHandlerRef get_log_handler(
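
The interface change above threads the version that preceded the log entry (previous_version) into the handler; the crimson stub keeps asserting that it only ever sees full writes. As a rough illustration of why the extra argument exists, the sketch below shows what a non-stub handler might do with it. It is an assumption, not the crimson or classic implementation: partial_writes_last_complete is treated here simply as a per-shard map of (from, to) version ranges, and written_shards as a set with contains().

// Illustrative sketch only -- the crimson handler above is still a TODO.
void partial_write(pg_info_t *info,
                   eversion_t previous_version,
                   const pg_log_entry_t &entry) override {
  if (entry.written_shards.empty()) {
    return;  // not a partial write: every shard persists this entry
  }
  for (auto &[shard, range] : info->partial_writes_last_complete) {
    if (!entry.written_shards.contains(shard)) {
      // A shard skipped by this partial write is still logically complete
      // through entry.version. (A real implementation would extend an
      // existing contiguous range rather than simply overwrite it.)
      range = std::make_pair(previous_version, entry.version);
    }
  }
}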

src/mon/OSDMonitor.cc

Lines changed: 3 additions & 1 deletion
@@ -1524,9 +1524,11 @@ void OSDMonitor::prime_pg_temp(
   {
     std::lock_guard l(prime_pg_temp_lock);
     // do not touch a mapping if a change is pending
+    std::vector<int> pg_temp = pool ? next.pgtemp_primaryfirst(*pool, acting) :
+                                      acting;
     pending_inc.new_pg_temp.emplace(
       pgid,
-      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
+      mempool::osdmap::vector<int>(pg_temp.begin(), pg_temp.end()));
   }
 }

src/osd/ECBackend.cc

Lines changed: 101 additions & 67 deletions
@@ -200,7 +200,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
   bool is_repair) {
   if (get_parent()->check_failsafe_full()) {
     dout(10) << __func__ << " Out of space (failsafe) processing push request."
-      << dendl;
+             << dendl;
     ceph_abort();
   }

@@ -242,7 +242,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
   }
 
   if (op.before_progress.first) {
-    ceph_assert(op.attrset.count(string("_")));
+    ceph_assert(op.attrset.contains(OI_ATTR));
     m->t.setattrs(
       coll,
       tobj,
@@ -290,23 +290,23 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
 }
 
 void ECBackend::RecoveryBackend::handle_recovery_push_reply(
-  const PushReplyOp &op,
-  pg_shard_t from,
-  RecoveryMessages *m) {
+    const PushReplyOp &op,
+    pg_shard_t from,
+    RecoveryMessages *m) {
   if (!recovery_ops.count(op.soid))
     return;
   RecoveryOp &rop = recovery_ops[op.soid];
-  ceph_assert(rop.waiting_on_pushes.count(from));
+  ceph_assert(rop.waiting_on_pushes.contains(from));
   rop.waiting_on_pushes.erase(from);
   continue_recovery_op(rop, m);
 }
 
 void ECBackend::RecoveryBackend::handle_recovery_read_complete(
-  const hobject_t &hoid,
-  ECUtil::shard_extent_map_t &&buffers_read,
-  std::optional<map<string, bufferlist, less<>>> attrs,
-  const ECUtil::shard_extent_set_t &want_to_read,
-  RecoveryMessages *m) {
+    const hobject_t &hoid,
+    ECUtil::shard_extent_map_t &&buffers_read,
+    std::optional<map<string, bufferlist, less<>>> attrs,
+    const ECUtil::shard_extent_set_t &want_to_read,
+    RecoveryMessages *m) {
   dout(10) << __func__ << ": returned " << hoid << " " << buffers_read << dendl;
   ceph_assert(recovery_ops.contains(hoid));
   RecoveryBackend::RecoveryOp &op = recovery_ops[hoid];
@@ -373,14 +373,10 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
     }
   }
 
-  uint64_t aligned_size = ECUtil::align_page_next(op.obc->obs.oi.size);
+  uint64_t aligned_size = ECUtil::align_next(op.obc->obs.oi.size);
 
   int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
   ceph_assert(r == 0);
-  // We are never appending here, so we never need hinfo.
-  op.returned_data->insert_parity_buffers();
-  r = op.returned_data->encode(ec_impl, NULL, 0);
-  ceph_assert(r==0);
 
   // Finally, we don't want to write any padding, so truncate the buffer
   // to remove it.
@@ -393,8 +389,10 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
     }
   }
 
-  dout(20) << __func__ << ": oid=" << op.hoid << " "
-    << op.returned_data->debug_string(2048, 8) << dendl;
+  dout(20) << __func__ << ": oid=" << op.hoid << dendl;
+  dout(30) << __func__ << "EC_DEBUG_BUFFERS: "
+           << op.returned_data->debug_string(2048, 8)
+           << dendl;
 
   continue_recovery_op(op, m);
 }
@@ -536,12 +534,30 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
 
       op.state = RecoveryOp::READING;
 
-      // We always read the recovery chunk size (default 8MiB + parity). If that
-      // amount of data is not available, then the backend will truncate the
-      // response.
+      /* When beginning recovery, the OI may not be known. As such the object
+       * size is not known. For the first read, attempt to read the default
+       * size. If this is larger than the object sizes, then the OSD will
+       * return truncated reads. If the object size is known, then attempt
+       * correctly sized reads.
+       */
+      uint64_t read_size = get_recovery_chunk_size();
+      if (op.obc) {
+        uint64_t read_to_end = ECUtil::align_next(op.obc->obs.oi.size) -
+          op.recovery_progress.data_recovered_to;
+
+        if (read_to_end < read_size) {
+          read_size = read_to_end;
+        }
+      }
       sinfo.ro_range_to_shard_extent_set_with_parity(
-        op.recovery_progress.data_recovered_to,
-        get_recovery_chunk_size(), want);
+        op.recovery_progress.data_recovered_to, read_size, want);
+
+      op.recovery_progress.data_recovered_to += read_size;
+
+      // We only need to recover shards that are missing.
+      for (auto shard : shard_id_set::difference(sinfo.get_all_shards(), op.missing_on_shards)) {
+        want.erase(shard);
+      }
 
       if (op.recovery_progress.first && op.obc) {
         op.xattrs = op.obc->attr_cache;
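
The comment block above is the heart of this hunk, so here is the sizing rule restated as a tiny stand-alone helper. The helper name and the std::optional parameter are illustrative and not part of the patch; ECUtil::align_next is the in-tree alignment helper the patch itself uses.

#include <cstdint>
#include <optional>

// Pick how much to read for one recovery pass. Without an OI (first pass)
// ask for the default chunk and let the backend truncate the reply; with an
// OI, clamp the read to the aligned end of the object.
uint64_t choose_recovery_read_size(uint64_t default_chunk,      // e.g. 8 MiB
                                   uint64_t data_recovered_to,
                                   std::optional<uint64_t> oi_size) {
  uint64_t read_size = default_chunk;
  if (oi_size) {
    uint64_t read_to_end = ECUtil::align_next(*oi_size) - data_recovered_to;
    if (read_to_end < read_size) {
      read_size = read_to_end;  // last pass: do not read past the object
    }
  }
  return read_size;
}

For example, with the 8 MiB default chunk and a 10 MiB object, the first pass reads 8 MiB and the second asks for only the remaining 2 MiB rather than another full chunk, so data_recovered_to does not overshoot the object.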
@@ -591,9 +607,15 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       }
       if (read_request.shard_reads.empty()) {
         ceph_assert(op.obc);
-        ceph_assert(0 == op.obc->obs.oi.size);
-        dout(10) << __func__ << "Zero size object recovery, skipping reads."
-                 << op << dendl;
+        /* This can happen for several reasons
+         * - A zero-sized object.
+         * - The missing shards have no data.
+         * - The previous recovery did not need the last data shard. In this
+         *   case, data_recovered_to may indicate that the last shard still
+         *   needs recovery, when it does not.
+         * We can just skip the read and fall through below.
+         */
+        dout(10) << __func__ << " No reads required " << op << dendl;
         // Create an empty read result and fall through.
         op.returned_data.emplace(&sinfo);
       } else {
@@ -612,7 +634,6 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
       dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
       op.state = RecoveryOp::WRITING;
       ObjectRecoveryProgress after_progress = op.recovery_progress;
-      after_progress.data_recovered_to = op.returned_data->get_ro_end();
       after_progress.first = false;
       if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
         after_progress.data_complete = true;
@@ -621,7 +642,7 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
         m->pushes[pg_shard].push_back(PushOp());
         PushOp &pop = m->pushes[pg_shard].back();
         pop.soid = op.hoid;
-        pop.version = op.v;
+        pop.version = op.recovery_info.oi.get_version_for_shard(pg_shard.shard);
         op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data);
         dout(10) << __func__ << ": pop shard=" << pg_shard
                  << ", oid=" << pop.soid
@@ -634,7 +655,26 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
           op.returned_data->get_shard_first_offset(pg_shard.shard),
           pop.data.length());
         if (op.recovery_progress.first) {
-          pop.attrset = op.xattrs;
+          if (sinfo.is_nonprimary_shard(pg_shard.shard)) {
+            if (pop.version == op.recovery_info.oi.version) {
+              dout(10) << __func__ << ": copy OI attr only" << dendl;
+              pop.attrset[OI_ATTR] = op.xattrs[OI_ATTR];
+            } else {
+              // We are recovering a partial write - make sure we push the correct
+              // version in the OI or a scrub error will occur.
+              object_info_t oi(op.recovery_info.oi);
+              oi.shard_versions.clear();
+              oi.version = pop.version;
+              dout(10) << __func__ << ": partial write OI attr: oi=" << oi << dendl;
+              bufferlist bl;
+              oi.encode(bl, get_osdmap()->get_features(
+                CEPH_ENTITY_TYPE_OSD, nullptr));
+              pop.attrset[OI_ATTR] = bl;
+            }
+          } else {
+            dout(10) << __func__ << ": push all attrs (not nonprimary)" << dendl;
+            pop.attrset = op.xattrs;
+          }
         }
         pop.recovery_info = op.recovery_info;
         pop.before_progress = op.recovery_progress;
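
The attribute handling just above is the subtle part of the partial-write fix, so the same decision is restated below as a small helper. The helper and its parameters are illustrative (the patch does this inline); like the patch, it assumes get_version_for_shard() returns the head OI version unless the shard was skipped by later partial writes.

// Illustrative restatement of the per-shard OI selection for nonprimary
// shards; not part of the patch.
bufferlist oi_attr_for_shard(const object_info_t &head_oi,
                             eversion_t shard_version,      // get_version_for_shard()
                             uint64_t osd_features,
                             const bufferlist &cached_oi_attr) {
  if (shard_version == head_oi.version) {
    // Shard is fully up to date: the cached OI attribute is already correct.
    return cached_oi_attr;
  }
  // Shard missed one or more partial writes: push an OI stamped with the
  // shard's own version, otherwise a later scrub would flag a mismatch.
  object_info_t oi(head_oi);
  oi.shard_versions.clear();
  oi.version = shard_version;
  bufferlist bl;
  oi.encode(bl, osd_features);
  return bl;
}

Primary-eligible shards are unaffected: as the else branch above shows, they still receive the full attribute set.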
@@ -1050,8 +1090,7 @@ void ECBackend::handle_sub_read(
               << dendl;
     } else {
       get_parent()->clog_error() << "Error " << r
-                                 << " reading object "
-                                 << hoid;
+                                 << " reading object " << hoid;
       dout(5) << __func__ << ": Error " << r
               << " reading " << hoid << dendl;
     }
@@ -1085,8 +1124,7 @@ void ECBackend::handle_sub_read(
       if (!hinfo) {
         r = -EIO;
         get_parent()->clog_error() << "Corruption detected: object "
-                                   << hoid
-                                   << " is missing hash_info";
+                                   << hoid << " is missing hash_info";
         dout(5) << __func__ << ": No hinfo for " << hoid << dendl;
         goto error;
       }
@@ -1102,8 +1140,8 @@ void ECBackend::handle_sub_read(
           << hex << h.digest() << " expected 0x" << hinfo->
           get_chunk_hash(shard) << dec;
         dout(5) << __func__ << ": Bad hash for " << hoid << " digest 0x"
-          << hex << h.digest() << " expected 0x" << hinfo->
-          get_chunk_hash(shard) << dec << dendl;
+                << hex << h.digest() << " expected 0x"
+                << hinfo->get_chunk_hash(shard) << dec << dendl;
         r = -EIO;
         goto error;
       }
@@ -1172,9 +1210,9 @@ void ECBackend::handle_sub_write_reply(
 }
 
 void ECBackend::handle_sub_read_reply(
-  pg_shard_t from,
-  ECSubReadReply &op,
-  const ZTracer::Trace &trace) {
+    pg_shard_t from,
+    ECSubReadReply &op,
+    const ZTracer::Trace &trace) {
   trace.event("ec sub read reply");
   dout(10) << __func__ << ": reply " << op << dendl;
   map<ceph_tid_t, ReadOp>::iterator iter = read_pipeline.tid_to_read_map.
@@ -1227,26 +1265,19 @@ void ECBackend::handle_sub_read_reply(
       rop.complete.emplace(hoid, &sinfo);
     }
     auto &complete = rop.complete.at(hoid);
-    for (auto &&[shard, read]: std::as_const(req.shard_reads)) {
-      if (complete.errors.contains(read.pg_shard)) continue;
-
-      complete.processed_read_requests[shard].union_of(read.extents);
-
-      if (!rop.complete.contains(hoid) ||
-          !complete.buffers_read.contains(shard)) {
-        if (!read.extents.empty()) continue; // Complete the actual read first.
-
-        // If we are first here, populate the completion.
-        if (!rop.complete.contains(hoid)) {
-          rop.complete.emplace(hoid, read_result_t(&sinfo));
-        }
-      }
+    if (!req.shard_reads.contains(from.shard)) {
+      continue;
+    }
+    const shard_read_t &read = req.shard_reads.at(from.shard);
+    if (!complete.errors.contains(from)) {
+      dout(20) << __func__ <<" read:" << read << dendl;
+      complete.processed_read_requests[from.shard].union_of(read.extents);
     }
   }
   for (auto &&[hoid, attr]: op.attrs_read) {
     ceph_assert(!op.errors.count(hoid));
     // if read error better not have sent an attribute
-    if (!rop.to_read.count(hoid)) {
+    if (!rop.to_read.contains(hoid)) {
       // We canceled this read! @see filter_read_op
       dout(20) << __func__ << " to_read skipping" << dendl;
       continue;
@@ -1290,6 +1321,8 @@ void ECBackend::handle_sub_read_reply(
     rop.to_read.at(oid).shard_want_to_read.
       populate_shard_id_set(want_to_read);
 
+    dout(20) << __func__ << " read_result: " << read_result << dendl;
+
     int err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum,
                                          nullptr);
     if (err) {
@@ -1305,7 +1338,7 @@ void ECBackend::handle_sub_read_reply(
       // We found that new reads are required to do a decode.
       need_resend = true;
       continue;
-      } else if (r > 0) {
+    } else if (r > 0) {
       // No new reads were requested. This means that some parity
       // shards can be assumed to be zeros.
       err = 0;
@@ -1340,7 +1373,8 @@ void ECBackend::handle_sub_read_reply(
         rop.complete.at(oid).errors.clear();
       }
     }
-    // avoid re-read for completed object as we may send remaining reads for uncopmpleted objects
+    // avoid re-read for completed object as we may send remaining reads for
+    // uncompleted objects
     rop.to_read.at(oid).shard_reads.clear();
     rop.to_read.at(oid).want_attrs = false;
     ++is_complete;
@@ -1599,28 +1633,28 @@ void ECBackend::submit_transaction(
 }
 
 int ECBackend::objects_read_sync(
-  const hobject_t &hoid,
-  uint64_t off,
-  uint64_t len,
-  uint32_t op_flags,
-  bufferlist *bl) {
+    const hobject_t &hoid,
+    uint64_t off,
+    uint64_t len,
+    uint32_t op_flags,
+    bufferlist *bl) {
   return -EOPNOTSUPP;
 }
 
 void ECBackend::objects_read_async(
-  const hobject_t &hoid,
-  uint64_t object_size,
-  const list<pair<ec_align_t,
-    pair<bufferlist*, Context*>>> &to_read,
-  Context *on_complete,
-  bool fast_read) {
+    const hobject_t &hoid,
+    uint64_t object_size,
+    const list<pair<ec_align_t,
+                    pair<bufferlist*, Context*>>> &to_read,
+    Context *on_complete,
+    bool fast_read) {
   map<hobject_t, std::list<ec_align_t>> reads;
 
   uint32_t flags = 0;
   extent_set es;
   for (const auto &[read, ctx]: to_read) {
     pair<uint64_t, uint64_t> tmp;
-    if (!cct->_conf->osd_ec_partial_reads || fast_read) {
+    if (!cct->_conf->osd_ec_partial_reads) {
       tmp = sinfo.ro_offset_len_to_stripe_ro_offset_len(read.offset, read.size);
     } else {
       tmp.first = read.offset;

src/osd/ECBackend.h

Lines changed: 1 addition & 1 deletion
@@ -252,7 +252,7 @@ class ECBackend : public ECCommon {
     hobject_t hoid;
     eversion_t v;
     std::set<pg_shard_t> missing_on;
-    std::set<shard_id_t> missing_on_shards;
+    shard_id_set missing_on_shards;
 
     ObjectRecoveryInfo recovery_info;
     ObjectRecoveryProgress recovery_progress;
