1616#include < ydb/core/base/path.h>
1717#include < ydb/core/base/statestorage.h>
1818#include < ydb/core/base/tablet_pipe.h>
19+ #include < ydb/core/cms/console/configs_dispatcher.h>
1920#include < ydb/core/mon/mon.h>
2021#include < ydb/core/base/nameservice.h>
2122#include < ydb/core/blobstorage/base/blobstorage_events.h>
2829#include < ydb/core/util/tuples.h>
2930
3031#include < ydb/core/protos/blobstorage_distributed_config.pb.h>
32+ #include < ydb/core/protos/config.pb.h>
3133#include < ydb/core/sys_view/common/events.h>
3234
3335#include < ydb/public/api/grpc/ydb_monitoring_v1.grpc.pb.h>
@@ -121,11 +123,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
121123 ui64 Cookie;
122124 NWilson::TSpan Span;
123125
// Constructor of the self-check request actor.
// In this diff view the "-" line is the previous signature and the "+" line is its
// replacement, which adds the trailing `config` parameter.
//   sender  - stored in Sender.
//   request - moved into Request.
//   cookie  - stored in Cookie.
//   traceId - moved into Span, which is created with the "health_check" name and AUTO_END flag.
//   config  - used to initialize the HealthCheckConfig member.
124- TSelfCheckRequest (const TActorId& sender, THolder<TEvSelfCheckRequest> request, ui64 cookie, NWilson::TTraceId&& traceId)
126+ TSelfCheckRequest (const TActorId& sender, THolder<TEvSelfCheckRequest> request, ui64 cookie, NWilson::TTraceId&& traceId, const NKikimrConfig::THealthCheckConfig& config )
125127 : Sender(sender)
126128 , Request(std::move(request))
127129 , Cookie(cookie)
128130 , Span(TComponentTracingLevels::TTablet::Basic, std::move(traceId), " health_check" , NWilson::EFlags::AUTO_END)
131+ , HealthCheckConfig(config)
129132 {}
130133
131134 using TGroupId = ui32;
@@ -163,7 +166,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
163166 struct TNodeTabletState {
164167 struct TTabletStateSettings {
165168 TInstant AliveBarrier;
166- ui32 MaxRestartsPerPeriod = 30 ; // per hour
169+ ui32 MaxRestartsPerPeriod; // per hour
167170 ui32 MaxTabletIdsStored = 10 ;
168171 bool ReportGoodTabletsIds = false ;
169172 };
@@ -266,6 +269,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
266269 TString ErasureSpecies;
267270 std::vector<const NKikimrSysView::TVSlotEntry*> VSlots;
268271 ui32 Generation;
272+ bool LayoutCorrect = true ;
269273 };
270274
271275 struct TSelfCheckResult {
@@ -647,6 +651,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
647651 std::optional<TRequestResponse<TEvStateStorage::TEvBoardInfo>> DatabaseBoardInfo;
648652 THashSet<TNodeId> UnknownStaticGroups;
649653
654+ const NKikimrConfig::THealthCheckConfig& HealthCheckConfig;
655+
650656 std::vector<TNodeId> SubscribedNodeIds;
651657 THashSet<TNodeId> StorageNodeIds;
652658 THashSet<TNodeId> ComputeNodeIds;
@@ -742,7 +748,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
742748
743749 TTabletRequestsState TabletRequests;
744750
745- TDuration Timeout = TDuration::MilliSeconds(20000 );
751+ TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout() );
746752 static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = " static" ;
747753
748754 bool IsSpecificDatabaseFilter () const {
@@ -1504,6 +1510,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15041510 for (const auto & [hiveId, hiveResponse] : HiveInfo) {
15051511 if (hiveResponse.IsOk ()) {
15061512 settings.AliveBarrier = TInstant::MilliSeconds (hiveResponse->Record .GetResponseTimestamp ()) - TDuration::Minutes (5 );
1513+ settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds ().GetTabletsRestartsOrange ();
15071514 for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record .GetTablets ()) {
15081515 TSubDomainKey tenantId = TSubDomainKey (hiveTablet.GetObjectDomain ());
15091516 auto itDomain = FilterDomainKey.find (tenantId);
@@ -1569,6 +1576,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15691576 auto & groupState = GroupState[groupId];
15701577 groupState.ErasureSpecies = group.GetInfo ().GetErasureSpeciesV2 ();
15711578 groupState.Generation = group.GetInfo ().GetGeneration ();
1579+ groupState.LayoutCorrect = group.GetInfo ().GetLayoutCorrect ();
15721580 StoragePoolState[poolId].Groups .emplace (groupId);
15731581 }
15741582 for (const auto & vSlot : VSlots->Get ()->Record .GetEntries ()) {
@@ -1729,9 +1737,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17291737 FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
17301738
17311739 TSelfCheckContext rrContext (&context, " NODE_UPTIME" );
1732- if (databaseState.NodeRestartsPerPeriod [nodeId] >= 30 ) {
1740+ if (databaseState.NodeRestartsPerPeriod [nodeId] >= HealthCheckConfig. GetThresholds (). GetNodeRestartsOrange () ) {
17331741 rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Node is restarting too often" , ETags::Uptime);
1734- } else if (databaseState.NodeRestartsPerPeriod [nodeId] >= 10 ) {
1742+ } else if (databaseState.NodeRestartsPerPeriod [nodeId] >= HealthCheckConfig. GetThresholds (). GetNodeRestartsYellow () ) {
17351743 rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, " The number of node restarts has increased" , ETags::Uptime);
17361744 } else {
17371745 rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
@@ -1769,9 +1777,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17691777 long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs ();
17701778 TDuration timeDifferenceDuration = TDuration::MicroSeconds (abs (timeDifferenceUs));
17711779 Ydb::Monitoring::StatusFlag::Status status;
1772- if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME ) {
1780+ if (timeDifferenceDuration > TDuration::MicroSeconds (HealthCheckConfig. GetThresholds (). GetNodesTimeDifferenceOrange ()) ) {
17731781 status = Ydb::Monitoring::StatusFlag::ORANGE;
1774- } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME ) {
1782+ } else if (timeDifferenceDuration > TDuration::MicroSeconds (HealthCheckConfig. GetThresholds (). GetNodesTimeDifferenceYellow ()) ) {
17751783 status = Ydb::Monitoring::StatusFlag::YELLOW;
17761784 } else {
17771785 status = Ydb::Monitoring::StatusFlag::GREEN;
@@ -2343,6 +2351,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23432351
23442352 class TGroupChecker {
23452353 TString ErasureSpecies;
2354+ bool LayoutCorrect;
23462355 int FailedDisks = 0 ;
23472356 std::array<int , Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE> DisksColors = {};
23482357 TStackVec<std::pair<ui32, int >> FailedRealms;
@@ -2359,7 +2368,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23592368 }
23602369
23612370 public:
// Constructor of TGroupChecker.
// In this diff view the "-" line is the previous single-argument form; the "+" lines
// are its replacement.
//   erasure       - copied into ErasureSpecies.
//   layoutCorrect - copied into LayoutCorrect; defaults to true, so existing
//                   one-argument call sites keep compiling.
2362- TGroupChecker (const TString& erasure) : ErasureSpecies(erasure) {}
2371+ TGroupChecker (const TString& erasure, const bool layoutCorrect = true )
2372+ : ErasureSpecies(erasure)
2373+ , LayoutCorrect(layoutCorrect)
2374+ {}
23632375
23642376 void AddVDiskStatus (Ydb::Monitoring::StatusFlag::Status status, ui32 realm) {
23652377 ++DisksColors[status];
@@ -2378,6 +2390,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23782390
23792391 void ReportStatus (TSelfCheckContext& context) const {
23802392 context.OverallStatus = Ydb::Monitoring::StatusFlag::GREEN;
2393+ if (!LayoutCorrect) {
2394+ context.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Group layout is incorrect" , ETags::GroupState);
2395+ }
23812396 if (ErasureSpecies == NONE) {
23822397 if (FailedDisks > 0 ) {
23832398 context.ReportStatus (Ydb::Monitoring::StatusFlag::RED, " Group failed" , ETags::GroupState, {ETags::VDiskState});
@@ -2727,7 +2742,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
27272742 return ;
27282743 }
27292744
2730- TGroupChecker checker (itGroup->second .ErasureSpecies );
2745+ TGroupChecker checker (itGroup->second .ErasureSpecies , itGroup-> second . LayoutCorrect );
27312746 const auto & slots = itGroup->second .VSlots ;
27322747 for (const auto * slot : slots) {
27332748 const auto & slotInfo = slot->GetInfo ();
@@ -2921,9 +2936,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
29212936 }
29222937 }
29232938
2924- const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000 );
2925- const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000 );
2926-
29272939 void FillResult (TOverallStateContext context) {
29282940 if (IsSpecificDatabaseFilter ()) {
29292941 FillDatabaseResult (context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -3252,12 +3264,16 @@ void TNodeCheckRequest<NMon::TEvHttpInfo>::Bootstrap() {
32523264class THealthCheckService : public TActorBootstrapped <THealthCheckService> {
32533265public:
32543266 static constexpr NKikimrServices::TActivity::EType ActorActivityType () { return NKikimrServices::TActivity::MONITORING_SERVICE; }
3267+ NKikimrConfig::THealthCheckConfig HealthCheckConfig;
32553268
32563269 THealthCheckService ()
32573270 {
32583271 }
32593272
32603273 void Bootstrap () {
3274+ HealthCheckConfig.CopyFrom (AppData ()->HealthCheckConfig );
3275+ Send (NConsole::MakeConfigsDispatcherID (SelfId ().NodeId ()),
3276+ new NConsole::TEvConfigsDispatcher::TEvSetConfigSubscriptionRequest ({NKikimrConsole::TConfigItem::HealthCheckConfigItem}));
32613277 TMon* mon = AppData ()->Mon ;
32623278 if (mon) {
32633279 mon->RegisterActorPage ({
@@ -3270,8 +3286,16 @@ class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
32703286 Become (&THealthCheckService::StateWork);
32713287 }
32723288
3289+ void Handle (NConsole::TEvConsole::TEvConfigNotificationRequest::TPtr& ev) {
3290+ const auto & record = ev->Get ()->Record ;
3291+ if (record.GetConfig ().HasHealthCheckConfig ()) {
3292+ HealthCheckConfig.CopyFrom (record.GetConfig ().GetHealthCheckConfig ());
3293+ }
3294+ Send (ev->Sender , new NConsole::TEvConsole::TEvConfigNotificationResponse (record), 0 , ev->Cookie );
3295+ }
3296+
// Handles an incoming self-check request by registering a new TSelfCheckRequest actor.
// The actor receives the original sender, ownership of the released request event, the
// cookie and the moved trace id. In this diff view the "-" line is the previous call;
// the "+" line additionally passes the service's HealthCheckConfig.
32733297 void Handle (TEvSelfCheckRequest::TPtr& ev) {
3274- Register (new TSelfCheckRequest (ev->Sender, ev.Get ()->Release (), ev->Cookie , std::move (ev->TraceId )));
3298+ Register (new TSelfCheckRequest (ev->Sender, ev.Get ()->Release (), ev->Cookie , std::move (ev->TraceId ), HealthCheckConfig ));
32753299 }
32763300
32773301 std::shared_ptr<NYdbGrpc::TGRpcClientLow> GRpcClientLow;
@@ -3299,6 +3323,7 @@ class THealthCheckService : public TActorBootstrapped<THealthCheckService> {
32993323 hFunc (TEvSelfCheckRequest, Handle);
33003324 hFunc (TEvNodeCheckRequest, Handle);
33013325 hFunc (NMon::TEvHttpInfo, Handle);
3326+ hFunc (NConsole::TEvConsole::TEvConfigNotificationRequest, Handle);
33023327 cFunc (TEvents::TSystem::PoisonPill, PassAway);
33033328 }
33043329 }
0 commit comments