Skip to content

Commit 91d4b00

Browse files
authored
EXT-1082 Add replication aware vdisk mapper option to the bs controller (#25063)
(cherry picked from commit c6e1031)
1 parent ac8e6ce commit 91d4b00

File tree

8 files changed

+275
-19
lines changed

8 files changed

+275
-19
lines changed

ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
# Registers the cluster-balance command-line options on `p` (presumably an
# argparse parser/subparser -- it is only used via `add_argument` here).
def add_options(p):
1111
p.add_argument('--max-replicating-pdisks', type=int, help='Limit number of maximum replicating PDisks in the cluster')
1212
p.add_argument('--only-from-overpopulated-pdisks', action='store_true', help='Move vdisks out only from pdisks with over expected slot count')
13+
# Boolean flag; copied into cmd.PreferLessOccupiedRack when a reassign command is built.
p.add_argument('--prefer-less-occupied-rack', action='store_true', help='Take into account racks\' free slots picking pdisk from rack with more free slots first')
14+
# Boolean flag; copied into cmd.WithAttentionToReplication when a reassign command is built.
p.add_argument('--with-attention-to-replication', action='store_true', help='Take into account replicating vdisks picking node and pdisk with less amount of them')
1315
# Shared output-format options provided by the `common` module.
common.add_basic_format_options(p)
1416

1517

@@ -122,6 +124,8 @@ def add_reassign_cmd(request, vslot):
122124
cmd.FailRealmIdx = vslot.FailRealmIdx
123125
cmd.FailDomainIdx = vslot.FailDomainIdx
124126
cmd.VDiskIdx = vslot.VDiskIdx
127+
cmd.PreferLessOccupiedRack = args.prefer_less_occupied_rack
128+
cmd.WithAttentionToReplication = args.with_attention_to_replication
125129

126130
request = common.kikimr_bsconfig.TConfigRequest(Rollback=True)
127131
index = len(request.Command)

ydb/core/mind/bscontroller/cmds_storage_pool.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,14 @@ namespace NKikimr::NBsController {
382382
Fit.OnlyToLessOccupiedPDisk = true;
383383
}
384384

385+
if (cmd.GetPreferLessOccupiedRack()) {
386+
Fit.PreferLessOccupiedRack = true;
387+
}
388+
389+
if (cmd.GetWithAttentionToReplication()) {
390+
Fit.WithAttentionToReplication = true;
391+
}
392+
385393
Fit.PoolsAndGroups.emplace(group->StoragePoolId, group->ID);
386394
}
387395

ydb/core/mind/bscontroller/config.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ namespace NKikimr {
1313
std::set<TBoxId> Boxes;
1414
std::multiset<std::tuple<TBoxStoragePoolId, std::optional<TGroupId>>> PoolsAndGroups; // nullopt goes first and means 'cover all groups in the pool'
1515
bool OnlyToLessOccupiedPDisk = false;
16+
bool PreferLessOccupiedRack = false;
17+
bool WithAttentionToReplication = false;
1618

1719
operator bool() const {
1820
return !Boxes.empty() || !PoolsAndGroups.empty();

ydb/core/mind/bscontroller/config_fit_groups.cpp

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -450,24 +450,25 @@ namespace NKikimr {
450450
State.CheckConsistency();
451451
}
452452

453-
private:
453+
private:
454454
template<typename T>
455455
std::invoke_result_t<T, TGroupGeometryInfo&, TGroupMapper&, TGroupId, TGroupMapper::TGroupDefinition&, TGroupMapper::TGroupConstraintsDefinition&,
456456
const THashMap<TVDiskIdShort, TPDiskId>&, TGroupMapper::TForbiddenPDisks, i64> AllocateOrSanitizeGroup(
457457
TGroupId groupId, TGroupMapper::TGroupDefinition& group, TGroupMapper::TGroupConstraintsDefinition& constraints,
458458
const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid,
459459
i64 requiredSpace, bool addExistingDisks, T&& func) {
460460
if (!Mapper) {
461-
Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping);
461+
Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping, State.Fit.PreferLessOccupiedRack, State.Fit.WithAttentionToReplication);
462462
PopulateGroupMapper();
463463
}
464+
TPDiskSlotTracker& pdiskSlotTracker= Mapper->GetPDiskSlotTracker();
464465
TStackVec<TPDiskId, 32> removeQ;
465466
if (addExistingDisks) {
466467
for (const auto& realm : group) {
467468
for (const auto& domain : realm) {
468469
for (const TPDiskId id : domain) {
469470
if (id != TPDiskId()) {
470-
if (auto *info = State.PDisks.Find(id); info && RegisterPDisk(id, *info, false, "X")) {
471+
if (auto *info = State.PDisks.Find(id); info && RegisterPDisk(id, *info, false, pdiskSlotTracker, "X")) {
471472
removeQ.push_back(id);
472473
}
473474
}
@@ -476,14 +477,14 @@ namespace NKikimr {
476477
}
477478
}
478479
struct TUnregister {
479-
TGroupMapper& Mapper;
480+
TBlobStorageController::TGroupFitter& Self;
480481
TStackVec<TPDiskId, 32>& RemoveQ;
481482
~TUnregister() {
482483
for (const TPDiskId pdiskId : RemoveQ) {
483-
Mapper.UnregisterPDisk(pdiskId);
484+
Self.UnregisterPDisk(pdiskId);
484485
}
485486
}
486-
} unregister{*Mapper, removeQ};
487+
} unregister{*this, removeQ};
487488
return std::invoke(func, Geometry, *Mapper, groupId, group, constraints, replacedDisks, std::move(forbid), requiredSpace);
488489
}
489490

@@ -500,6 +501,22 @@ namespace NKikimr {
500501
void PopulateGroupMapper() {
501502
const TBoxId boxId = std::get<0>(StoragePoolId);
502503

504+
TPDiskSlotTracker pdiskSlotTracker;
505+
506+
bool populateSlotTracker = State.Fit.PreferLessOccupiedRack || State.Fit.WithAttentionToReplication;
507+
508+
if (populateSlotTracker) {
509+
State.VSlots.ForEach([&](const TVSlotId& id, const TVSlotInfo& info) {
510+
if (info.IsBeingDeleted()) {
511+
return; // ignore slots being deleted
512+
}
513+
if (info.GetStatus() == NKikimrBlobStorage::EVDiskStatus::REPLICATING) {
514+
TPDiskId pdiskId = id.ComprisingPDiskId();
515+
pdiskSlotTracker.AddReplicatingVSlot(pdiskId);
516+
}
517+
});
518+
}
519+
503520
State.PDisks.ForEach([&](const TPDiskId& id, const TPDiskInfo& info) {
504521
if (info.BoxId != boxId) {
505522
return; // ignore disks not from desired box
@@ -511,15 +528,17 @@ namespace NKikimr {
511528

512529
for (const auto& filter : StoragePool.PDiskFilters) {
513530
if (filter.MatchPDisk(info)) {
514-
const bool inserted = RegisterPDisk(id, info, true);
531+
const bool inserted = RegisterPDisk(id, info, true, pdiskSlotTracker);
515532
Y_ABORT_UNLESS(inserted);
516533
break;
517534
}
518535
}
519536
});
537+
538+
Mapper->SetPDiskSlotTracker(std::move(pdiskSlotTracker));
520539
}
521540

522-
bool RegisterPDisk(TPDiskId id, const TPDiskInfo& info, bool usable, TString whyUnusable = {}) {
541+
bool RegisterPDisk(TPDiskId id, const TPDiskInfo& info, bool usable, TPDiskSlotTracker& pdiskSlotTracker, TString whyUnusable = {}) {
523542
// calculate number of used slots on this PDisk, also counting the static ones
524543
ui32 numSlots = info.NumActiveSlots + info.StaticSlotUsage;
525544

@@ -574,19 +593,42 @@ namespace NKikimr {
574593
whyUnusable.append('D');
575594
}
576595

596+
ui32 maxSlots = info.ExpectedSlotCount;
597+
auto location = State.HostRecords->GetLocation(id.NodeId);
598+
577599
// register PDisk in the mapper
578-
return Mapper->RegisterPDisk({
600+
bool registered = Mapper->RegisterPDisk({
579601
.PDiskId = id,
580-
.Location = State.HostRecords->GetLocation(id.NodeId),
602+
.Location = location,
581603
.Usable = usable,
582604
.NumSlots = numSlots,
583-
.MaxSlots = info.ExpectedSlotCount,
605+
.MaxSlots = maxSlots,
584606
.Groups = std::move(groups),
585607
.SpaceAvailable = availableSpace,
586608
.Operational = info.Operational,
587609
.Decommitted = info.Decommitted(),
588610
.WhyUnusable = std::move(whyUnusable),
589611
});
612+
613+
bool populateSlotTracker = State.Fit.PreferLessOccupiedRack || State.Fit.WithAttentionToReplication;
614+
615+
if (registered && populateSlotTracker) {
616+
i32 freeSlots = i32(maxSlots) - numSlots;
617+
pdiskSlotTracker.AddFreeSlotsForRack(location.GetRackId(), freeSlots);
618+
}
619+
620+
return registered;
621+
}
622+
623+
// Removes PDisk `id` from the group mapper and, when either slot-tracker option
// (PreferLessOccupiedRack / WithAttentionToReplication) is enabled, subtracts the
// PDisk's free-slot count from the per-rack total kept in the mapper's slot tracker.
void UnregisterPDisk(TPDiskId id) {
624+
// The mapper hands back the record it stored for this PDisk.
TGroupMapper::TPDiskRecord rec = Mapper->UnregisterPDisk(id);
625+
626+
bool populatedSlotTracker = State.Fit.PreferLessOccupiedRack || State.Fit.WithAttentionToReplication;
627+
628+
if (populatedSlotTracker) {
629+
// NOTE(review): the delta is computed from the record returned by the mapper; it only
// cancels what RegisterPDisk added if NumSlots/MaxSlots were not changed in between -- confirm.
i32 freeSlots = i32(rec.MaxSlots) - rec.NumSlots;
630+
// A negative delta undoes the earlier AddFreeSlotsForRack contribution for this rack.
Mapper->GetPDiskSlotTracker().AddFreeSlotsForRack(rec.Location.GetRackId(), -freeSlots);
631+
}
590632
}
591633

592634
std::map<TVDiskIdShort, TVSlotInfo*> CreateVSlotsForGroup(TGroupInfo *groupInfo,

ydb/core/mind/bscontroller/group_mapper.cpp

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,50 @@ namespace NKikimr::NBsController {
250250
}
251251

252252
bool DiskIsBetter(const TPDiskInfo& pretender, const TPDiskInfo& king) const {
253+
if (Self.PreferLessOccupiedRack) {
254+
Y_ABORT_UNLESS(Self.PDiskSlotTracker.has_value());
255+
256+
auto& pdiskSlotTracker = *Self.PDiskSlotTracker;
257+
258+
// Compare by number of free slots in PDisk's rack.
259+
i32 freeSlotsPretender = pdiskSlotTracker.GetFreeSlotsOnRack(pretender.Location.GetRackId());
260+
i32 freeSlotsKing = pdiskSlotTracker.GetFreeSlotsOnRack(king.Location.GetRackId());
261+
262+
if (freeSlotsPretender != freeSlotsKing) {
263+
return freeSlotsPretender > freeSlotsKing;
264+
}
265+
}
266+
267+
if (Self.WithAttentionToReplication) {
268+
auto pretenderNode = pretender.PDiskId.NodeId;
269+
auto kingNode = king.PDiskId.NodeId;
270+
271+
Y_ABORT_UNLESS(Self.PDiskSlotTracker.has_value());
272+
273+
auto& pdiskSlotTracker = *Self.PDiskSlotTracker;
274+
275+
// Compare by number of replicating VDisks on the PDisk's node.
276+
auto pretenderNodeRepls = pdiskSlotTracker.GetReplicatingVDisksOnNode(pretenderNode);
277+
auto kingNodeRepls = pdiskSlotTracker.GetReplicatingVDisksOnNode(kingNode);
278+
279+
if (pretenderNodeRepls != kingNodeRepls) {
280+
return pretenderNodeRepls < kingNodeRepls;
281+
}
282+
283+
// Compare by number of replicating VDisks on the PDisk.
284+
auto pretenderPDiskRepls = pdiskSlotTracker.GetReplicatingVDisksOnPDisk(pretender.PDiskId);
285+
auto kingPDiskRepls = pdiskSlotTracker.GetReplicatingVDisksOnPDisk(king.PDiskId);
286+
287+
if (pretenderPDiskRepls != kingPDiskRepls) {
288+
return pretenderPDiskRepls < kingPDiskRepls;
289+
}
290+
}
291+
253292
if (pretender.FreeSlots() != king.FreeSlots()) {
254293
return pretender.FreeSlots() > king.FreeSlots();
255-
} else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) {
294+
}
295+
296+
if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) {
256297
return true;
257298
} else {
258299
if (pretender.NumDomainMatchingDisks != king.NumDomainMatchingDisks) {
@@ -849,13 +890,26 @@ namespace NKikimr::NBsController {
849890
TPDisks PDisks;
850891
TPDiskByPosition PDiskByPosition;
851892
bool Dirty = false;
893+
bool PreferLessOccupiedRack;
894+
bool WithAttentionToReplication;
895+
std::optional<TPDiskSlotTracker> PDiskSlotTracker;
852896

853897
public:
854-
TImpl(TGroupGeometryInfo geom, bool randomize)
898+
TImpl(TGroupGeometryInfo geom, bool randomize, bool preferLessOccupiedRack, bool withAttentionToReplication)
855899
: Geom(std::move(geom))
856900
, Randomize(randomize)
901+
, PreferLessOccupiedRack(preferLessOccupiedRack)
902+
, WithAttentionToReplication(withAttentionToReplication)
857903
{}
858904

905+
// Stores the given tracker by move-assigning it into the optional member,
// replacing any tracker that was set before.
void SetPDiskSlotTracker(TPDiskSlotTracker&& tracker) {
906+
PDiskSlotTracker = std::move(tracker);
907+
}
908+
909+
// Returns a mutable reference to the stored tracker.
// std::optional::value() throws std::bad_optional_access when no tracker has
// been set, so SetPDiskSlotTracker must have been called first.
TPDiskSlotTracker& GetPDiskSlotTracker() {
910+
return PDiskSlotTracker.value();
911+
}
912+
859913
bool RegisterPDisk(const TPDiskRecord& pdisk) {
860914
// calculate disk position
861915
const TPDiskLayoutPosition p(DomainMapper, pdisk.Location, pdisk.PDiskId, Geom);
@@ -872,13 +926,15 @@ namespace NKikimr::NBsController {
872926
return inserted;
873927
}
874928

875-
void UnregisterPDisk(TPDiskId pdiskId) {
929+
TPDiskRecord UnregisterPDisk(TPDiskId pdiskId) {
876930
const auto it = PDisks.find(pdiskId);
877931
Y_ABORT_UNLESS(it != PDisks.end());
878932
auto x = std::remove(PDiskByPosition.begin(), PDiskByPosition.end(), std::make_pair(it->second.Position, &it->second));
879933
Y_ABORT_UNLESS(x + 1 == PDiskByPosition.end());
880934
PDiskByPosition.pop_back();
935+
TPDiskRecord ret = it->second;
881936
PDisks.erase(it);
937+
return ret;
882938
}
883939

884940
void AdjustSpaceAvailable(TPDiskId pdiskId, i64 increment) {
@@ -1126,17 +1182,25 @@ namespace NKikimr::NBsController {
11261182
}
11271183
};
11281184

1129-
TGroupMapper::TGroupMapper(TGroupGeometryInfo geom, bool randomize)
1130-
: Impl(new TImpl(std::move(geom), randomize))
1185+
TGroupMapper::TGroupMapper(TGroupGeometryInfo geom, bool randomize, bool preferLessOccupiedRack, bool withAttentionToReplication)
1186+
: Impl(new TImpl(std::move(geom), randomize, preferLessOccupiedRack, withAttentionToReplication))
11311187
{}
11321188

11331189
TGroupMapper::~TGroupMapper() = default;
11341190

1191+
// Public entry point: hands the tracker over to the implementation object.
void TGroupMapper::SetPDiskSlotTracker(TPDiskSlotTracker&& tracker) {
1192+
Impl->SetPDiskSlotTracker(std::move(tracker));
1193+
}
1194+
1195+
// Public entry point: returns the implementation object's tracker by mutable reference.
TPDiskSlotTracker& TGroupMapper::GetPDiskSlotTracker() {
1196+
return Impl->GetPDiskSlotTracker();
1197+
}
1198+
11351199
// Public entry point: forwards the record to the implementation object and
// returns its result unchanged.
bool TGroupMapper::RegisterPDisk(const TPDiskRecord& pdisk) {
11361200
return Impl->RegisterPDisk(pdisk);
11371201
}
11381202

1139-
void TGroupMapper::UnregisterPDisk(TPDiskId pdiskId) {
1203+
TGroupMapper::TPDiskRecord TGroupMapper::UnregisterPDisk(TPDiskId pdiskId) {
11401204
return Impl->UnregisterPDisk(pdiskId);
11411205
}
11421206

ydb/core/mind/bscontroller/group_mapper.h

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,42 @@ namespace NKikimr {
88

99
class TGroupGeometryInfo;
1010

11+
// Bookkeeping used when picking PDisks: counts of replicating VDisks per node and
// per PDisk, plus a running total of free slots per rack.
// All getters return 0 for a key that was never recorded and do not insert it.
class TPDiskSlotTracker {
12+
// Key: node id. NOTE(review): ui16 counter, incremented without an overflow check.
absl::flat_hash_map<ui32, ui16> ReplicatingVDisksByNode;
13+
// Key: PDisk id. NOTE(review): ui8 counter, incremented without an overflow check --
// presumably a PDisk never holds that many replicating VDisks; confirm.
absl::flat_hash_map<TPDiskId, ui8> ReplicatingVDisksByPDisk;
14+
// Key: rack id string. Signed, because deltas may be negative.
absl::flat_hash_map<TString, i32> FreeSlotsPerRack;
15+
public:
16+
// Number of replicating VDisks recorded for the node, or 0 if none.
ui16 GetReplicatingVDisksOnNode(ui32 nodeId) const {
17+
if (const auto it = ReplicatingVDisksByNode.find(nodeId); it != ReplicatingVDisksByNode.end()) {
18+
return it->second;
19+
}
20+
return 0;
21+
}
22+
23+
// Number of replicating VDisks recorded for the PDisk, or 0 if none.
ui8 GetReplicatingVDisksOnPDisk(TPDiskId pdiskId) const {
24+
if (const auto it = ReplicatingVDisksByPDisk.find(pdiskId); it != ReplicatingVDisksByPDisk.end()) {
25+
return it->second;
26+
}
27+
return 0;
28+
}
29+
30+
// Accumulated free-slot total for the rack, or 0 if the rack was never recorded.
i32 GetFreeSlotsOnRack(const TString& rack) const {
31+
if (const auto it = FreeSlotsPerRack.find(rack); it != FreeSlotsPerRack.end()) {
32+
return it->second;
33+
}
34+
return 0;
35+
}
36+
37+
// Records one replicating VSlot: bumps both the owning node's counter and the
// PDisk's counter (operator[] creates a zero-initialized entry on first use).
void AddReplicatingVSlot(TPDiskId pdiskId) {
38+
++ReplicatingVDisksByNode[pdiskId.NodeId];
39+
++ReplicatingVDisksByPDisk[pdiskId];
40+
}
41+
42+
// Adds `freeSlots` to the rack's running total; a negative value subtracts.
void AddFreeSlotsForRack(const TString& rack, i32 freeSlots) {
43+
FreeSlotsPerRack[rack] += freeSlots;
44+
}
45+
};
46+
1147
// TGroupMapper is a helper class used to create groups from a set of PDisks with their respective locations
1248
// over physical hardware
1349
class TGroupMapper {
@@ -65,14 +101,18 @@ namespace NKikimr {
65101
};
66102

67103
public:
68-
TGroupMapper(TGroupGeometryInfo geom, bool randomize = false);
104+
TGroupMapper(TGroupGeometryInfo geom, bool randomize = false, bool preferLessOccupiedRack = false, bool withAttentionToReplication = false);
69105
~TGroupMapper();
70106

107+
void SetPDiskSlotTracker(TPDiskSlotTracker&& state);
108+
109+
TPDiskSlotTracker& GetPDiskSlotTracker();
110+
71111
// Register PDisk inside mapper to use it in subsequent map operations
72112
bool RegisterPDisk(const TPDiskRecord& pdisk);
73113

74114
// Remove PDisk from the table.
75-
void UnregisterPDisk(TPDiskId pdiskId);
115+
TPDiskRecord UnregisterPDisk(TPDiskId pdiskId);
76116

77117
// Adjust VDisk space quota.
78118
void AdjustSpaceAvailable(TPDiskId pdiskId, i64 increment);

0 commit comments

Comments
 (0)