Commit c2cedfb

Merge pull request ceph#47974 from zhscn/wip-evict
crimson/os/seastore: evict cold data to slower devices

Reviewed-by: Yingxin Cheng <yingxin.cheng@intel.com>
Reviewed-by: Samuel Just <sjust@redhat.com>
2 parents 0217230 + c53446d commit c2cedfb

26 files changed: +1178 -292 lines

doc/dev/crimson/crimson.rst

Lines changed: 33 additions & 7 deletions
@@ -154,16 +154,16 @@ To facilitate the development of crimson, following options would be handy when
 using ``vstart.sh``,
 
 ``--crimson``
-  start ``crimson-osd`` instead of ``ceph-osd``
+  Start ``crimson-osd`` instead of ``ceph-osd``.
 
 ``--nodaemon``
-  do not daemonize the service
+  Do not daemonize the service.
 
 ``--redirect-output``
-  redirect the stdout and stderr of service to ``out/$type.$num.stdout``.
+  Redirect the stdout and stderr of service to ``out/$type.$num.stdout``.
 
 ``--osd-args``
-  pass extra command line options to crimson-osd or ceph-osd. It's quite
+  Pass extra command line options to crimson-osd or ceph-osd. It's quite
   useful for passing Seastar options to crimson-osd. For instance, you could
   use ``--osd-args "--memory 2G"`` to set the memory to use. Please refer
   the output of::
@@ -173,14 +173,31 @@ using ``vstart.sh``,
 for more Seastar specific command line options.
 
 ``--cyanstore``
-  use CyanStore as the object store backend.
+  Use CyanStore as the object store backend.
 
 ``--bluestore``
-  use the alienized BlueStore as the object store backend. This is the default
+  Use the alienized BlueStore as the object store backend. This is the default
   setting, if not specified otherwise.
 
 ``--memstore``
-  use the alienized MemStore as the object store backend.
+  Use the alienized MemStore as the object store backend.
+
+``--seastore``
+  Use SeaStore as the back end object store.
+
+``--seastore-devs``
+  Specify the block device used by SeaStore.
+
+``--seastore-secondary-devs``
+  Optional. SeaStore supports multiple devices. Enable this feature by
+  passing the block device to this option.
+
+``--seastore-secondary-devs-type``
+  Optional. Specify the device type of the secondary devices. When a secondary
+  device is slower than the main device passed to ``--seastore-devs``, cold
+  data on the faster device will be evicted to the slower devices over time.
+  Valid types include ``HDD``, ``SSD`` (default), ``ZNS``, and ``RANDOM_BLOCK_SSD``.
+  Note that secondary devices should not be faster than the main device.
 
 ``--seastore``
   use SeaStore as the object store backend.
@@ -194,6 +211,15 @@ So, a typical command to start a single-crimson-node cluster is::
 
 Where we assign 4 GiB memory, a single thread running on core-0 to crimson-osd.
 
+Another SeaStore example::
+
+  $ MGR=1 MON=1 OSD=1 MDS=0 RGW=0 ../src/vstart.sh -n -x \
+    --without-dashboard --seastore \
+    --crimson --redirect-output \
+    --seastore-devs /dev/sda \
+    --seastore-secondary-devs /dev/sdb \
+    --seastore-secondary-devs-type HDD
+
 You could stop the vstart cluster using::
 
   $ ../src/stop.sh --crimson

src/common/options/crimson.yaml.in

Lines changed: 15 additions & 0 deletions
@@ -102,3 +102,18 @@ options:
   level: dev
   desc: Total size to use for CircularBoundedJournal if created, it is valid only if seastore_main_device_type is RANDOM_BLOCK
   default: 5_G
+- name: seastore_multiple_tiers_stop_evict_ratio
+  type: float
+  level: advanced
+  desc: Stop evicting cold data to the cold tier when the used ratio of the main tier falls below this value.
+  default: 0.5
+- name: seastore_multiple_tiers_default_evict_ratio
+  type: float
+  level: advanced
+  desc: Begin evicting cold data to the cold tier when the used ratio of the main tier reaches this value.
+  default: 0.6
+- name: seastore_multiple_tiers_fast_evict_ratio
+  type: float
+  level: advanced
+  desc: Begin fast eviction when the used ratio of the main tier reaches this value.
+  default: 0.7
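
These three thresholds form a hysteresis band around the main tier's used ratio. A minimal, self-contained sketch of that behaviour follows; it is not code from this commit, and the struct and member names are invented, while the seastore_multiple_tiers_* option names and defaults come from the hunk above:

  // Hypothetical sketch of the hysteresis implied by the three options above.
  struct tier_evict_policy_t {
    double stop_ratio    = 0.5;  // seastore_multiple_tiers_stop_evict_ratio
    double default_ratio = 0.6;  // seastore_multiple_tiers_default_evict_ratio
    double fast_ratio    = 0.7;  // seastore_multiple_tiers_fast_evict_ratio
    bool evicting = false;       // remembered between checks for hysteresis

    // Should cold data be moved to the cold tier right now?
    bool should_evict(double main_used_ratio) {
      if (main_used_ratio >= default_ratio) {
        evicting = true;           // reached the start threshold
      } else if (main_used_ratio < stop_ratio) {
        evicting = false;          // dropped below the stop threshold
      }                            // in between: keep the previous decision
      return evicting;
    }

    // Evict more aggressively once the main tier keeps filling up anyway.
    bool evict_fast(double main_used_ratio) const {
      return main_used_ratio >= fast_ratio;
    }
  };

With the defaults, eviction starts at 60% usage, switches to fast eviction at 70%, and stops once usage falls back below 50%.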

src/crimson/os/seastore/async_cleaner.cc

Lines changed: 18 additions & 5 deletions
@@ -830,13 +830,15 @@ SegmentCleaner::SegmentCleaner(
   config_t config,
   SegmentManagerGroupRef&& sm_group,
   BackrefManager &backref_manager,
-  bool detailed)
+  SegmentSeqAllocator &segment_seq_allocator,
+  bool detailed,
+  bool is_cold)
   : detailed(detailed),
+    is_cold(is_cold),
     config(config),
     sm_group(std::move(sm_group)),
     backref_manager(backref_manager),
-    ool_segment_seq_allocator(
-      new SegmentSeqAllocator(segment_type_t::OOL))
+    ool_segment_seq_allocator(segment_seq_allocator)
 {
   config.validate();
 }
@@ -854,7 +856,13 @@ void SegmentCleaner::register_metrics()
   i = get_bucket_index(UTIL_STATE_EMPTY);
   stats.segment_util.buckets[i].count = segments.get_num_segments();
 
-  metrics.add_group("segment_cleaner", {
+  std::string prefix;
+  if (is_cold) {
+    prefix.append("cold_");
+  }
+  prefix.append("segment_cleaner");
+
+  metrics.add_group(prefix, {
     sm::make_counter("segments_number",
       [this] { return segments.get_num_segments(); },
       sm::description("the number of segments")),
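
The effect of the prefix is simply that the cold-tier cleaner publishes its counters under a separate metrics group, so the two SegmentCleaner instances do not register identically named metrics. A standalone sketch of the naming rule, mirroring the hunk above outside of any crimson types:

  #include <string>

  // "segment_cleaner" for the main tier, "cold_segment_cleaner" for the cold tier.
  std::string cleaner_metric_group(bool is_cold) {
    std::string prefix;
    if (is_cold) {
      prefix.append("cold_");
    }
    prefix.append("segment_cleaner");
    return prefix;
  }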
@@ -1053,8 +1061,12 @@ SegmentCleaner::do_reclaim_space(
      &pin_list, &reclaimed, &runs] {
     reclaimed = 0;
     runs++;
+    auto src = Transaction::src_t::CLEANER_MAIN;
+    if (is_cold) {
+      src = Transaction::src_t::CLEANER_COLD;
+    }
     return extent_callback->with_transaction_intr(
-      Transaction::src_t::CLEANER,
+      src,
       "clean_reclaim_space",
       [this, &backref_extents, &pin_list, &reclaimed](auto &t)
     {
@@ -1137,6 +1149,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
 {
   LOG_PREFIX(SegmentCleaner::clean_space);
   assert(background_callback->is_ready());
+  ceph_assert(can_clean_space());
   if (!reclaim_state) {
     segment_id_t seg_id = get_next_reclaim_segment();
     auto &segment_info = segments[seg_id];
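
The new ceph_assert makes can_clean_space() a precondition of clean_space(): whoever drives the background work must check it before entering a cleaning cycle. A minimal sketch of that contract, using invented stand-in types rather than the crimson classes themselves:

  // Invented stand-ins; only the precondition relationship mirrors the change above.
  struct CleanerLike {
    virtual ~CleanerLike() = default;
    virtual bool should_clean_space() const = 0;  // policy: is cleaning wanted?
    virtual bool can_clean_space() const = 0;     // capability: anything reclaimable?
    virtual void clean_space() = 0;               // asserts can_clean_space()
  };

  void maybe_clean(CleanerLike &cleaner) {
    // A cleaner with zero reclaimable segments must be skipped,
    // otherwise the assertion inside clean_space() would fire.
    if (cleaner.should_clean_space() && cleaner.can_clean_space()) {
      cleaner.clean_space();
    }
  }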

src/crimson/os/seastore/async_cleaner.h

Lines changed: 50 additions & 17 deletions
@@ -544,14 +544,6 @@ class JournalTrimmerImpl : public JournalTrimmer {
     journal_alloc_tail = JOURNAL_SEQ_NULL;
   }
 
-  bool should_trim_dirty() const {
-    return get_dirty_tail_target() > journal_dirty_tail;
-  }
-
-  bool should_trim_alloc() const {
-    return get_alloc_tail_target() > journal_alloc_tail;
-  }
-
   bool should_trim() const {
     return should_trim_alloc() || should_trim_dirty();
   }
@@ -596,6 +588,14 @@ class JournalTrimmerImpl : public JournalTrimmer {
   friend std::ostream &operator<<(std::ostream &, const stat_printer_t &);
 
 private:
+  bool should_trim_dirty() const {
+    return get_dirty_tail_target() > journal_dirty_tail;
+  }
+
+  bool should_trim_alloc() const {
+    return get_alloc_tail_target() > journal_alloc_tail;
+  }
+
   using trim_ertr = crimson::errorator<
     crimson::ct_error::input_output_error>;
   trim_ertr::future<> trim_dirty();
@@ -1144,12 +1144,18 @@ class AsyncCleaner {
 
   virtual bool should_block_io_on_clean() const = 0;
 
+  virtual bool can_clean_space() const = 0;
+
   virtual bool should_clean_space() const = 0;
 
   using clean_space_ertr = base_ertr;
   using clean_space_ret = clean_space_ertr::future<>;
   virtual clean_space_ret clean_space() = 0;
 
+  virtual const std::set<device_id_t>& get_device_ids() const = 0;
+
+  virtual std::size_t get_reclaim_size_per_cycle() const = 0;
+
   // test only
   virtual bool check_usage() = 0;
 
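The three additions to the AsyncCleaner interface let the background machinery treat the main-tier and cold-tier cleaners uniformly: can_clean_space() reports whether a cleaner currently has anything reclaimable, get_device_ids() says which devices it is responsible for, and get_reclaim_size_per_cycle() bounds the work of one cycle. A hedged, self-contained sketch of a caller; all names besides the three virtuals are assumptions:

  #include <cstddef>
  #include <set>

  using device_id_t = unsigned;   // simplified stand-in

  struct AsyncCleanerLike {       // mirrors only the new virtuals above
    virtual ~AsyncCleanerLike() = default;
    virtual bool can_clean_space() const = 0;
    virtual const std::set<device_id_t>& get_device_ids() const = 0;
    virtual std::size_t get_reclaim_size_per_cycle() const = 0;
  };

  // Route an extent to the cleaner that owns its device.
  bool owns_device(const AsyncCleanerLike &cleaner, device_id_t dev) {
    return cleaner.get_device_ids().count(dev) > 0;
  }

  // Budget for one background cycle: nothing reclaimable means no work.
  std::size_t cycle_budget(const AsyncCleanerLike &cleaner) {
    return cleaner.can_clean_space() ? cleaner.get_reclaim_size_per_cycle() : 0;
  }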
@@ -1210,11 +1216,9 @@ class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
     config_t config,
     SegmentManagerGroupRef&& sm_group,
     BackrefManager &backref_manager,
-    bool detailed);
-
-  SegmentSeqAllocator& get_ool_segment_seq_allocator() {
-    return *ool_segment_seq_allocator;
-  }
+    SegmentSeqAllocator &segment_seq_allocator,
+    bool detailed,
+    bool is_cold);
 
   void set_journal_trimmer(JournalTrimmer &_trimmer) {
     trimmer = &_trimmer;
@@ -1224,9 +1228,12 @@ class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
     config_t config,
     SegmentManagerGroupRef&& sm_group,
     BackrefManager &backref_manager,
-    bool detailed) {
+    SegmentSeqAllocator &ool_seq_allocator,
+    bool detailed,
+    bool is_cold = false) {
     return std::make_unique<SegmentCleaner>(
-      config, std::move(sm_group), backref_manager, detailed);
+      config, std::move(sm_group), backref_manager,
+      ool_seq_allocator, detailed, is_cold);
   }
 
   /*
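
With the allocator hoisted out of the cleaner, the factory above can build a main-tier cleaner and a cold-tier cleaner from one shared SegmentSeqAllocator, presumably so OOL segment sequence numbers stay consistent across both tiers. A sketch of the intended wiring; it assumes the static factory in this hunk is named create, and the local variable names and surrounding setup are invented, only the argument list follows the diff:

  auto ool_seq_allocator =
    std::make_unique<SegmentSeqAllocator>(segment_type_t::OOL);

  auto main_cleaner = SegmentCleaner::create(
    config,
    std::move(main_sm_group),
    backref_manager,
    *ool_seq_allocator,
    detailed);            // is_cold defaults to false

  auto cold_cleaner = SegmentCleaner::create(
    cold_config,
    std::move(cold_sm_group),
    backref_manager,
    *ool_seq_allocator,   // shared with the main cleaner
    detailed,
    true /* is_cold */);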
@@ -1312,6 +1319,11 @@ class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
     return aratio < config.available_ratio_hard_limit;
   }
 
+  bool can_clean_space() const final {
+    assert(background_callback->is_ready());
+    return get_segments_reclaimable() > 0;
+  }
+
   bool should_clean_space() const final {
     assert(background_callback->is_ready());
     if (get_segments_reclaimable() == 0) {
@@ -1328,6 +1340,14 @@ class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
 
   clean_space_ret clean_space() final;
 
+  const std::set<device_id_t>& get_device_ids() const final {
+    return sm_group->get_device_ids();
+  }
+
+  std::size_t get_reclaim_size_per_cycle() const final {
+    return config.reclaim_bytes_per_cycle;
+  }
+
   // Testing interfaces
 
   bool check_usage() final;
@@ -1521,11 +1541,12 @@ class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
     auto new_usage = calc_utilization(segment);
     adjust_segment_util(old_usage, new_usage);
     if (s_type == segment_type_t::OOL) {
-      ool_segment_seq_allocator->set_next_segment_seq(seq);
+      ool_segment_seq_allocator.set_next_segment_seq(seq);
     }
   }
 
   const bool detailed;
+  const bool is_cold;
   const config_t config;
 
   SegmentManagerGroupRef sm_group;
@@ -1574,7 +1595,7 @@ class SegmentCleaner : public SegmentProvider, public AsyncCleaner {
   BackgroundListener *background_callback = nullptr;
 
   // TODO: drop once paddr->journal_seq_t is introduced
-  SegmentSeqAllocatorRef ool_segment_seq_allocator;
+  SegmentSeqAllocator &ool_segment_seq_allocator;
 };
 
 class RBMCleaner;
@@ -1635,12 +1656,24 @@ class RBMCleaner : public AsyncCleaner {
     return false;
   }
 
+  bool can_clean_space() const final {
+    return false;
+  }
+
   bool should_clean_space() const final {
     return false;
   }
 
   clean_space_ret clean_space() final;
 
+  const std::set<device_id_t>& get_device_ids() const final {
+    return rb_group->get_device_ids();
+  }
+
+  std::size_t get_reclaim_size_per_cycle() const final {
+    return 0;
+  }
+
   RandomBlockManager* get_rbm(paddr_t paddr) {
     auto rbs = rb_group->get_rb_managers();
     for (auto p : rbs) {

src/crimson/os/seastore/cache.cc

Lines changed: 29 additions & 22 deletions
@@ -46,6 +46,7 @@ Cache::Cache(
   LOG_PREFIX(Cache::Cache);
   INFO("created, lru_size={}", lru.get_capacity());
   register_metrics();
+  segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr);
 }
 
 Cache::~Cache()
@@ -135,7 +136,8 @@ void Cache::register_metrics()
     {src_t::READ, sm::label_instance("src", "READ")},
     {src_t::TRIM_DIRTY, sm::label_instance("src", "TRIM_DIRTY")},
     {src_t::TRIM_ALLOC, sm::label_instance("src", "TRIM_ALLOC")},
-    {src_t::CLEANER, sm::label_instance("src", "CLEANER")},
+    {src_t::CLEANER_MAIN, sm::label_instance("src", "CLEANER_MAIN")},
+    {src_t::CLEANER_COLD, sm::label_instance("src", "CLEANER_COLD")},
   };
   assert(labels_by_src.size() == (std::size_t)src_t::MAX);
 
@@ -624,8 +626,10 @@ void Cache::register_metrics()
            src2 == Transaction::src_t::READ) ||
           (src1 == Transaction::src_t::TRIM_DIRTY &&
            src2 == Transaction::src_t::TRIM_DIRTY) ||
-          (src1 == Transaction::src_t::CLEANER &&
-           src2 == Transaction::src_t::CLEANER) ||
+          (src1 == Transaction::src_t::CLEANER_MAIN &&
+           src2 == Transaction::src_t::CLEANER_MAIN) ||
+          (src1 == Transaction::src_t::CLEANER_COLD &&
+           src2 == Transaction::src_t::CLEANER_COLD) ||
           (src1 == Transaction::src_t::TRIM_ALLOC &&
            src2 == Transaction::src_t::TRIM_ALLOC)) {
         continue;
@@ -1113,12 +1117,13 @@ record_t Cache::prepare_record(
     auto stype = segment_type_t::NULL_SEG;
 
     // FIXME: This is specific to the segmented implementation
-    if (segment_provider != nullptr &&
-        i->get_paddr().get_addr_type() == paddr_types_t::SEGMENT) {
+    if (i->get_paddr().get_addr_type() == paddr_types_t::SEGMENT) {
       auto sid = i->get_paddr().as_seg_paddr().get_segment_id();
-      auto &sinfo = segment_provider->get_seg_info(sid);
-      sseq = sinfo.seq;
-      stype = sinfo.type;
+      auto sinfo = get_segment_info(sid);
+      if (sinfo) {
+        sseq = sinfo->seq;
+        stype = sinfo->type;
+      }
     }
 
     record.push_back(
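
Both this hunk and the replay_delta hunk further below switch from dereferencing a single segment_provider pointer to calling get_segment_info(), which yields an empty result when no segmented backend is registered for the extent's device, hence the new null checks. Together with the per-device registry set up in the first hunk (segment_providers_by_device_id), the lookup is presumably along the lines of the following sketch, written with deliberately simplified, invented types:

  #include <cstdint>
  #include <optional>
  #include <vector>

  // Simplified stand-ins; the real crimson types are richer.
  using device_id_t = std::uint8_t;
  constexpr std::size_t DEVICE_ID_MAX = 256;

  struct segment_info_t { std::uint64_t seq; int type; };

  struct SegmentProvider {
    virtual ~SegmentProvider() = default;
    virtual segment_info_t get_seg_info(std::uint64_t segment) const = 0;
  };

  struct segment_id_t { device_id_t device; std::uint64_t segment; };

  struct CacheSketch {
    // One slot per possible device id, filled as providers register themselves.
    std::vector<SegmentProvider*> segment_providers_by_device_id =
      std::vector<SegmentProvider*>(DEVICE_ID_MAX, nullptr);

    std::optional<segment_info_t> get_segment_info(segment_id_t sid) const {
      auto *provider = segment_providers_by_device_id[sid.device];
      if (!provider) {
        return std::nullopt;   // no segmented backend on this device
      }
      return provider->get_seg_info(sid.segment);
    }
  };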
@@ -1389,7 +1394,8 @@ record_t Cache::prepare_record(
   auto &rewrite_version_stats = t.get_rewrite_version_stats();
   if (trans_src == Transaction::src_t::TRIM_DIRTY) {
     stats.committed_dirty_version.increment_stat(rewrite_version_stats);
-  } else if (trans_src == Transaction::src_t::CLEANER) {
+  } else if (trans_src == Transaction::src_t::CLEANER_MAIN ||
+             trans_src == Transaction::src_t::CLEANER_COLD) {
     stats.committed_reclaim_version.increment_stat(rewrite_version_stats);
   } else {
     assert(rewrite_version_stats.is_clear());
@@ -1656,21 +1662,22 @@ Cache::replay_delta(
    * safetly skip these deltas because the extent must already
    * have been rewritten.
    */
-  if (segment_provider != nullptr &&
-      delta.paddr != P_ADDR_NULL &&
+  if (delta.paddr != P_ADDR_NULL &&
       delta.paddr.get_addr_type() == paddr_types_t::SEGMENT) {
     auto& seg_addr = delta.paddr.as_seg_paddr();
-    auto& seg_info = segment_provider->get_seg_info(seg_addr.get_segment_id());
-    auto delta_paddr_segment_seq = seg_info.seq;
-    auto delta_paddr_segment_type = seg_info.type;
-    if (delta_paddr_segment_seq != delta.ext_seq ||
-        delta_paddr_segment_type != delta.seg_type) {
-      DEBUG("delta is obsolete, delta_paddr_segment_seq={},"
-            " delta_paddr_segment_type={} -- {}",
-            segment_seq_printer_t{delta_paddr_segment_seq},
-            delta_paddr_segment_type,
-            delta);
-      return replay_delta_ertr::make_ready_future<bool>(false);
+    auto seg_info = get_segment_info(seg_addr.get_segment_id());
+    if (seg_info) {
+      auto delta_paddr_segment_seq = seg_info->seq;
+      auto delta_paddr_segment_type = seg_info->type;
+      if (delta_paddr_segment_seq != delta.ext_seq ||
+          delta_paddr_segment_type != delta.seg_type) {
+        DEBUG("delta is obsolete, delta_paddr_segment_seq={},"
+              " delta_paddr_segment_type={} -- {}",
+              segment_seq_printer_t{delta_paddr_segment_seq},
+              delta_paddr_segment_type,
+              delta);
+        return replay_delta_ertr::make_ready_future<bool>(false);
+      }
     }
   }
 