Skip to content

Commit 155b318

Browse files
fixed crash while listing tablet boot info backup that was corrupted; added replying with raw protobuf on list requests (#4565)
если мы не успели дописать бэкап следующим образом ``` Channels { Channel: 5 ChannelType: 0 History { FromGeneration: 0 GroupID: 2181038117 } StoragePool: "/pre-prod_sas/NBS:rot" } Channels { ``` и пытаемся после этого подняться и прогреть группы, то протобуфф успешно парсится, но падает на верифайках при попытке почитать бэкап при конвертации из протобуфины во внутреннюю структуру. Вызов функции с верифайками: https://github.com/ydb-platform/nbs/blob/fdde68d77b9eb3b13e9c5a0fe8082599e2e30f63/cloud/storage/core/libs/hive_proxy/tablet_boot_info_backup.cpp#L272-L274 https://github.com/ydb-platform/nbs/blob/fdde68d77b9eb3b13e9c5a0fe8082599e2e30f63/contrib/ydb/core/base/tablet.cpp#L5
1 parent a8c816c commit 155b318

File tree

4 files changed

+86
-12
lines changed

4 files changed

+86
-12
lines changed

cloud/blockstore/libs/daemon/ydb/bootstrap.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,17 +241,20 @@ class TWarmupBSGroupConnectionsActor final
241241
auto tabletBootInfos = std::move(ev->Get()->TabletBootInfos);
242242
THashSet<ui64> groupIds;
243243
for (const auto& tabletBootInfo: tabletBootInfos) {
244-
for (const auto& channel: tabletBootInfo.StorageInfo->Channels) {
244+
for (const auto& channel:
245+
tabletBootInfo.StorageInfoProto.GetChannels())
246+
{
245247
auto historyEntries =
246-
channel.History | std::views::reverse |
248+
channel.GetHistory() | std::views::reverse |
247249
std::views::filter(
248250
[&](const auto& el)
249-
{ return groupIds.insert(el.GroupID).second; }) |
251+
{ return groupIds.insert(el.GetGroupID()).second; }) |
250252
std::views::take(GroupsPerChannelToWarmup);
251253
for (const auto& historyEntry: historyEntries) {
252254
NCloud::Send(
253255
ctx,
254-
NKikimr::MakeBlobStorageProxyID(historyEntry.GroupID),
256+
NKikimr::MakeBlobStorageProxyID(
257+
historyEntry.GetGroupID()),
255258
std::make_unique<NKikimr::TEvBlobStorage::TEvStatus>(
256259
TInstant::Max()));
257260
}

cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "hive_proxy_events_private.h"
33

44
#include <cloud/storage/core/libs/api/hive_proxy.h>
5+
#include <cloud/storage/core/libs/hive_proxy/protos/tablet_boot_info_backup.pb.h>
56

67
#include <contrib/ydb/core/base/blobstorage.h>
78
#include <contrib/ydb/core/base/hive.h>
@@ -13,6 +14,7 @@
1314
#include <contrib/ydb/core/testlib/basics/helpers.h>
1415
#include <contrib/ydb/core/testlib/tablet_helpers.h>
1516

17+
#include <library/cpp/protobuf/util/pb_io.h>
1618
#include <library/cpp/testing/unittest/registar.h>
1719

1820
namespace NCloud::NStorage {
@@ -1672,19 +1674,86 @@ Y_UNIT_TEST_SUITE(THiveProxyTest)
16721674
UNIT_ASSERT_VALUES_EQUAL(1, result.TabletBootInfos.size());
16731675
UNIT_ASSERT_VALUES_EQUAL(
16741676
FakeTablet2,
1675-
result.TabletBootInfos[0].StorageInfo->TabletID);
1677+
result.TabletBootInfos[0].StorageInfoProto.GetTabletID());
16761678
TVector<TVector<ui64>> actualGroupIds;
1677-
for (auto& channel: result.TabletBootInfos[0].StorageInfo->Channels)
1679+
for (auto& channel:
1680+
result.TabletBootInfos[0].StorageInfoProto.GetChannels())
16781681
{
16791682
actualGroupIds.emplace_back();
1680-
for (auto& historyEntry: channel.History) {
1681-
actualGroupIds.back().emplace_back(historyEntry.GroupID);
1683+
for (auto& historyEntry: channel.GetHistory()) {
1684+
actualGroupIds.back().emplace_back(
1685+
historyEntry.GetGroupID());
16821686
}
16831687
}
16841688

16851689
UNIT_ASSERT_VALUES_EQUAL(expectedGroupIds, actualGroupIds);
16861690
}
16871691
}
1692+
1693+
Y_UNIT_TEST(ShouldListCorruptedTabletBootInfoBackupsWithoutCrash)
1694+
{
1695+
TString backupFilePath =
1696+
"BootExternal.tablet_boot_info_backup.txt";
1697+
1698+
NKikimr::TTabletStorageInfoPtr storageInfo;
1699+
1700+
TVector<TVector<ui64>> expectedGroupIds;
1701+
{
1702+
TTestBasicRuntime runtime;
1703+
TTestEnv env(runtime, backupFilePath, false);
1704+
1705+
TTabletStorageInfoPtr expected = CreateTestTabletInfo(
1706+
FakeTablet2,
1707+
TTabletTypes::BlockStorePartition);
1708+
env.HiveState->StorageInfos[FakeTablet2] = expected;
1709+
1710+
auto sender = runtime.AllocateEdgeActor();
1711+
1712+
auto result =
1713+
env.SendBootExternalRequest(sender, FakeTablet2, S_OK);
1714+
UNIT_ASSERT(result.StorageInfo);
1715+
UNIT_ASSERT_VALUES_EQUAL(FakeTablet2, result.StorageInfo->TabletID);
1716+
UNIT_ASSERT_VALUES_EQUAL(1u, result.SuggestedGeneration);
1717+
1718+
// Smoke check for background sync (15 seconds should be enough).
1719+
runtime.AdvanceCurrentTime(TDuration::Seconds(15));
1720+
runtime.DispatchEvents(TDispatchOptions(), TDuration::Seconds(15));
1721+
1722+
for (auto& channel: result.StorageInfo->Channels) {
1723+
expectedGroupIds.emplace_back();
1724+
for (auto& historyEntry: channel.History) {
1725+
expectedGroupIds.back().emplace_back(
1726+
historyEntry.GroupID);
1727+
}
1728+
}
1729+
storageInfo = result.StorageInfo;
1730+
env.SendBackupTabletBootInfos(sender, S_OK);
1731+
}
1732+
1733+
// simulate backup corruption
1734+
{
1735+
NHiveProxy::NProto::TTabletBootInfoBackup backupProto;
1736+
MergeFromTextFormat(backupFilePath, backupProto);
1737+
1738+
backupProto.MutableData()
1739+
->begin()
1740+
->second.MutableStorageInfo()
1741+
->MutableChannels(0)
1742+
->SetChannelErasureName("");
1743+
1744+
TFileOutput output(backupFilePath);
1745+
SerializeToTextFormat(backupProto, output);
1746+
}
1747+
1748+
{
1749+
TTestBasicRuntime runtime;
1750+
TTestEnv env(runtime, backupFilePath, false);
1751+
1752+
auto sender = runtime.AllocateEdgeActor();
1753+
1754+
env.SendListTabletBootInfoBackups(sender, S_OK);
1755+
}
1756+
}
16881757
}
16891758

16901759
} // namespace NCloud::NStorage

cloud/storage/core/libs/hive_proxy/tablet_boot_info.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
#include <cloud/storage/core/libs/kikimr/public.h>
66

7+
#include <contrib/ydb/core/protos/tablet.pb.h>
8+
79
namespace NCloud::NStorage {
810

911
////////////////////////////////////////////////////////////////////////////////
@@ -13,13 +15,13 @@ struct TTabletBootInfo
1315
TTabletBootInfo() = default;
1416

1517
TTabletBootInfo(
16-
NKikimr::TTabletStorageInfoPtr storageInfo,
18+
NKikimrTabletBase::TTabletStorageInfo storageInfoProto,
1719
ui64 suggestedGeneration)
18-
: StorageInfo(std::move(storageInfo))
20+
: StorageInfoProto(std::move(storageInfoProto))
1921
, SuggestedGeneration(suggestedGeneration)
2022
{}
2123

22-
NKikimr::TTabletStorageInfoPtr StorageInfo;
24+
NKikimrTabletBase::TTabletStorageInfo StorageInfoProto;
2325
ui64 SuggestedGeneration = 0;
2426
};
2527

cloud/storage/core/libs/hive_proxy/tablet_boot_info_backup.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ void TTabletBootInfoBackup::HandleListTabletBootInfoBackups(
270270
TVector<TTabletBootInfo> tabletBootInfos;
271271
for (const auto& [_, tabletBootInfo]: BackupProto.GetData()) {
272272
tabletBootInfos.emplace_back(
273-
NKikimr::TabletStorageInfoFromProto(tabletBootInfo.GetStorageInfo()),
273+
tabletBootInfo.GetStorageInfo(),
274274
tabletBootInfo.GetSuggestedGeneration());
275275
}
276276

0 commit comments

Comments
 (0)