Skip to content

Commit 2053c2e

Browse files
authored
Merge pull request ceph#65771 from aainscow/ec_direct_reads_pr_1
EC Direct Reads: First PR, background work

Reviewed-by: Radoslaw Zarzynski <[email protected]>
Reviewed-by: Bill Scales <[email protected]>
2 parents 7e93afe + d0724a2 commit 2053c2e

31 files changed

+1103
-123
lines changed

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ set(libcommon_files
532532
osdc/Striper.cc
533533
osdc/Objecter.cc
534534
osdc/error_code.cc
535+
osdc/SplitOp.cc
535536
librbd/Features.cc
536537
librbd/io/IoOperations.cc
537538
${mds_files})

src/crimson/osd/osd_operations/client_request.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,8 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
209209
pg.get_perf_logger().inc(l_osd_replica_read_redirect_missing);
210210
co_await reply_op_error(pgref, -EAGAIN);
211211
co_return;
212-
} else if (!pg.get_peering_state().can_serve_replica_read(m->get_hobj())) {
213-
// Note: can_serve_replica_read checks for writes on the head object
212+
} else if (!pg.get_peering_state().can_serve_read(m->get_hobj())) {
213+
// Note: can_serve_read checks for writes on the head object
214214
// as writes can only occur to head.
215215
DEBUGDPP("{}.{}: unstable write on replica, bouncing to primary",
216216
pg, *this, this_instance_id);

src/crimson/osd/pg.cc

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,7 +1388,7 @@ void PG::log_operation(
13881388

13891389
if (!is_primary()) { // && !is_ec_pg()
13901390
DEBUGDPP("on replica, clearing obc", *this);
1391-
replica_clear_repop_obc(logv);
1391+
clear_repop_obc(logv);
13921392
}
13931393
if (!logv.empty()) {
13941394
scrubber.on_log_update(logv.rbegin()->version);
@@ -1402,9 +1402,9 @@ void PG::log_operation(
14021402
false);
14031403
}
14041404

1405-
void PG::replica_clear_repop_obc(
1405+
void PG::clear_repop_obc(
14061406
const std::vector<pg_log_entry_t> &logv) {
1407-
LOG_PREFIX(PG::replica_clear_repop_obc);
1407+
LOG_PREFIX(PG::clear_repop_obc);
14081408
DEBUGDPP("clearing obc for {} log entries", *this, logv.size());
14091409
for (auto &&e: logv) {
14101410
DEBUGDPP("clearing entry for {} from: {} to: {}",
@@ -1615,8 +1615,7 @@ bool PG::can_discard_op(const MOSDOp& m) const {
16151615
return true;
16161616
}
16171617

1618-
if ((m.get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1619-
CEPH_OSD_FLAG_LOCALIZE_READS))
1618+
if ((m.get_flags() & CEPH_OSD_FLAGS_DIRECT_READ)
16201619
&& !is_primary()
16211620
&& (m.get_map_epoch() <
16221621
peering_state.get_info().history.same_interval_since))

src/crimson/osd/pg.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -675,7 +675,7 @@ class PG : public boost::intrusive_ref_counter<
675675
bool transaction_applied,
676676
ObjectStore::Transaction &txn,
677677
bool async = false);
678-
void replica_clear_repop_obc(
678+
void clear_repop_obc(
679679
const std::vector<pg_log_entry_t> &logv);
680680
void handle_rep_op_reply(const MOSDRepOpReply& m);
681681
interruptible_future<> do_update_log_missing(

src/erasure-code/ErasureCodeInterface.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,11 @@ namespace ceph {
685685
* to decode a parity CRC to get the CRC of a data shard.
686686
*/
687687
FLAG_EC_PLUGIN_CRC_ENCODE_DECODE_SUPPORT = 1<<7,
688+
/* This plugin supports the ability for the client to read directly from
689+
* the OSD containing a shard. This currently requires that raw shard ==
690+
* shard and that the data shards are simply striped.
691+
*/
692+
FLAG_EC_PLUGIN_DIRECT_READS = 1<<8,
688693
};
689694
static const char *get_optimization_flag_name(const plugin_flags flag) {
690695
switch (flag) {
@@ -697,6 +702,8 @@ namespace ceph {
697702
case FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED: return "optimizedsupport";
698703
case FLAG_EC_PLUGIN_CRC_ENCODE_DECODE_SUPPORT:
699704
return "crcencodedecode";
705+
case FLAG_EC_PLUGIN_DIRECT_READS:
706+
return "directreads";
700707
default: return "???";
701708
}
702709
}

src/erasure-code/isa/ErasureCodeIsa.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ class ErasureCodeIsa : public ceph::ErasureCode {
6969
FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
7070
FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
7171
FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
72-
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
72+
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION |
73+
FLAG_EC_PLUGIN_DIRECT_READS;
7374

7475
if (technique == "reed_sol_van"sv) {
7576
flags |= FLAG_EC_PLUGIN_CRC_ENCODE_DECODE_SUPPORT;

src/erasure-code/jerasure/ErasureCodeJerasure.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ class ErasureCodeJerasure : public ceph::ErasureCode {
5151
flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
5252
FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
5353
FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
54-
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
54+
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION |
55+
FLAG_EC_PLUGIN_DIRECT_READS;
5556

5657
if (technique == "reed_sol_van"sv) {
5758
flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED;

src/include/rados.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,12 @@ enum {
481481
CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */
482482
CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */
483483
CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000, /* client understands pool EIO flag */
484+
CEPH_OSD_FLAG_EC_DIRECT_READ = 0x10000000, /* erasure-coded partial read sent directly to the OSD holding the shard */
484485
};
485486

487+
// Mask of read flags; any of them set indicates an IO which is sent direct-to-OSD and may not be on the primary.
488+
#define CEPH_OSD_FLAGS_DIRECT_READ (CEPH_OSD_FLAG_BALANCE_READS | CEPH_OSD_FLAG_LOCALIZE_READS | CEPH_OSD_FLAG_EC_DIRECT_READ)
489+
486490
enum {
487491
CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object create */
488492
CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */

src/messages/MOSDRepOp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class MOSDRepOp final : public MOSDFastDispatchOp {
6565
* Because updates <= pg_committed_to cannot become divergent, replicas
6666
* may safely serve reads on objects which do not have more recent updates.
6767
*
68-
* See PeeringState::pg_committed_to, PeeringState::can_serve_replica_read
68+
* See PeeringState::pg_committed_to, PeeringState::can_serve_read
6969
*
7070
* Historical note: Prior to early 2024, this field was named
7171
* min_last_complete_ondisk. The replica, however, only actually relied on

src/mon/OSDMonitor.cc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8357,6 +8357,8 @@ int OSDMonitor::prepare_new_pool(string& name,
83578357
enable_pool_ec_optimizations(*pi, nullptr, true);
83588358
}
83598359

8360+
enable_pool_ec_direct_reads(*pi);
8361+
83608362
pending_inc.new_pool_names[pool] = name;
83618363
return 0;
83628364
}
@@ -8451,6 +8453,29 @@ int OSDMonitor::enable_pool_ec_optimizations(pg_pool_t &p,
84518453
return 0;
84528454
}
84538455

8456+
void OSDMonitor::enable_pool_ec_direct_reads(pg_pool_t &p) {
8457+
if (p.is_erasure()) {
8458+
ErasureCodeInterfaceRef erasure_code;
8459+
stringstream tmp;
8460+
int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8461+
8462+
// Once this feature is finished, we will replace this with upgrade code.
8463+
// The upgrade code will enable the split read flag once all OSDs are at
8464+
// Umbrella. For now, if the plugin does not support direct reads, we just
8465+
// clear that flag. All plugins and techniques should be capable of supporting
8466+
// direct reads, but we put in place this capability to reduce the test
8467+
// matrix for less important plugins/techniques.
8468+
//
8469+
// To enable direct reads in development, set the osd_pool_default_flags to
8470+
// 1<<20 = 0x100000 = 1048576
8471+
if (err != 0 || !p.allows_ecoptimizations() ||
8472+
(erasure_code->get_supported_optimizations() &
8473+
ErasureCodeInterface::FLAG_EC_PLUGIN_DIRECT_READS) == 0) {
8474+
p.flags &= ~pg_pool_t::FLAG_CLIENT_SPLIT_READS;
8475+
}
8476+
}
8477+
}
8478+
84548479
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
84558480
stringstream& ss)
84568481
{

0 commit comments

Comments
 (0)