diff --git a/cloud/blockstore/config/diagnostics.proto b/cloud/blockstore/config/diagnostics.proto index 882e98f13fc..48e232cd06d 100644 --- a/cloud/blockstore/config/diagnostics.proto +++ b/cloud/blockstore/config/diagnostics.proto @@ -230,5 +230,10 @@ message TDiagnosticsConfig // Skip reporting ZeroBlocks metrics as part of WriteBlocks metrics optional bool SkipReportingZeroBlocksMetricsForYDBBasedDisks = 53; + // Opentelemetry trace config optional NCloud.NProto.TOpentelemetryTraceConfig OpentelemetryTraceConfig = 54; + + // Report time histograms with milliseconds buckets. Percentiles are still + // reported in microseconds. + optional bool UseMsUnitsForTimeHistogram = 56; } diff --git a/cloud/blockstore/libs/.gitignore b/cloud/blockstore/libs/.gitignore index 255c48b48f9..92826532bf7 100644 --- a/cloud/blockstore/libs/.gitignore +++ b/cloud/blockstore/libs/.gitignore @@ -4,6 +4,7 @@ client/ut/cloud-blockstore-libs-client-ut common/benchmark/benchmark common/ut/cloud-blockstore-libs-common-ut daemon/ydb/ut/cloud-blockstore-libs-daemon-ydb-ut +diagnostics/gtest/cloud-blockstore-libs-diagnostics-gtest diagnostics/ut/cloud-blockstore-libs-diagnostics-ut discovery/ut/cloud-blockstore-libs-discovery-ut disk_agent/ut/cloud-blockstore-libs-disk_agent-ut diff --git a/cloud/blockstore/libs/diagnostics/config.cpp b/cloud/blockstore/libs/diagnostics/config.cpp index 12dc54bf4ff..424bb579b4b 100644 --- a/cloud/blockstore/libs/diagnostics/config.cpp +++ b/cloud/blockstore/libs/diagnostics/config.cpp @@ -56,7 +56,8 @@ namespace { xxx(LocalHDDDowntimeThreshold, TDuration, TDuration::Seconds(15) )\ xxx(ReportHistogramAsMultipleCounters, bool, true )\ xxx(ReportHistogramAsSingleCounter, bool, false )\ - xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\ + xxx(UseMsUnitsForTimeHistogram, bool, false )\ + xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\ \ xxx(SkipReportingZeroBlocksMetricsForYDBBasedDisks, bool, false )\ xxx(OpentelemetryTraceConfig, ::NCloud::NProto::TOpentelemetryTraceConfig, {} )\ @@ -173,6 +174,10 @@ EHistogramCounterOptions TDiagnosticsConfig::GetHistogramCounterOptions() const if (GetReportHistogramAsSingleCounter()) { histogramCounterOptions |= EHistogramCounterOption::ReportSingleCounter; } + if (GetUseMsUnitsForTimeHistogram()) { + histogramCounterOptions |= + EHistogramCounterOption::UseMsUnitsForTimeHistogram; + } return histogramCounterOptions; } diff --git a/cloud/blockstore/libs/diagnostics/config.h b/cloud/blockstore/libs/diagnostics/config.h index 49e80c12ce7..3d8eb2ba375 100644 --- a/cloud/blockstore/libs/diagnostics/config.h +++ b/cloud/blockstore/libs/diagnostics/config.h @@ -158,6 +158,7 @@ class TDiagnosticsConfig TRequestThresholds GetRequestThresholds() const; EHistogramCounterOptions GetHistogramCounterOptions() const; + bool GetUseMsUnitsForTimeHistogram() const; NCloud::NProto::EStatsFetcherType GetStatsFetcherType() const; diff --git a/cloud/blockstore/libs/diagnostics/ut/res/user_server_volume_instance_skip_zero_blocks_test.json b/cloud/blockstore/libs/diagnostics/gtest/res/user_server_volume_instance_skip_zero_blocks_test.json similarity index 100% rename from cloud/blockstore/libs/diagnostics/ut/res/user_server_volume_instance_skip_zero_blocks_test.json rename to cloud/blockstore/libs/diagnostics/gtest/res/user_server_volume_instance_skip_zero_blocks_test.json diff --git a/cloud/blockstore/libs/diagnostics/ut/res/user_server_volume_instance_test.json b/cloud/blockstore/libs/diagnostics/gtest/res/user_server_volume_instance_test.json similarity index 100% rename from cloud/blockstore/libs/diagnostics/ut/res/user_server_volume_instance_test.json rename to cloud/blockstore/libs/diagnostics/gtest/res/user_server_volume_instance_test.json diff --git a/cloud/blockstore/libs/diagnostics/ut/res/user_service_volume_instance_test.json b/cloud/blockstore/libs/diagnostics/gtest/res/user_service_volume_instance_test.json similarity index 100% rename from cloud/blockstore/libs/diagnostics/ut/res/user_service_volume_instance_test.json rename to cloud/blockstore/libs/diagnostics/gtest/res/user_service_volume_instance_test.json diff --git a/cloud/blockstore/libs/diagnostics/gtest/ya.make b/cloud/blockstore/libs/diagnostics/gtest/ya.make new file mode 100644 index 00000000000..0deb01111ce --- /dev/null +++ b/cloud/blockstore/libs/diagnostics/gtest/ya.make @@ -0,0 +1,30 @@ +GTEST() + +INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/small.inc) + +PEERDIR( + cloud/blockstore/libs/service + cloud/storage/core/libs/diagnostics + cloud/storage/core/libs/user_stats/counter + + library/cpp/eventlog/dumper + library/cpp/json + library/cpp/monlib/encode/json + library/cpp/monlib/encode/spack + library/cpp/monlib/encode/text + library/cpp/resource +) + +RESOURCE( + res/user_server_volume_instance_test.json user_server_volume_instance_test + res/user_service_volume_instance_test.json user_service_volume_instance_test + res/user_server_volume_instance_skip_zero_blocks_test.json user_server_volume_instance_skip_zero_blocks_test +) + +SRCS( + ../user_counter.cpp + + ../user_counter_ut.cpp +) + +END() diff --git a/cloud/blockstore/libs/diagnostics/user_counter.cpp b/cloud/blockstore/libs/diagnostics/user_counter.cpp index 2b94d05f910..fc2762844df 100644 --- a/cloud/blockstore/libs/diagnostics/user_counter.cpp +++ b/cloud/blockstore/libs/diagnostics/user_counter.cpp @@ -76,6 +76,7 @@ void RegisterServiceVolume( const TString& cloudId, const TString& folderId, const TString& diskId, + EHistogramCounterOptions histogramCounterOptions, TDynamicCounterPtr src) { const auto commonLabels = @@ -94,7 +95,10 @@ void RegisterServiceVolume( auto readSub = src->FindSubgroup("request", "ReadBlocks"); AddHistogramUserMetric( - GetUsBuckets(), + histogramCounterOptions.HasFlag( + EHistogramCounterOption::UseMsUnitsForTimeHistogram) + ? GetMsBuckets() + : GetUsBuckets(), dsc, commonLabels, {{readSub, "ThrottlerDelay"}}, @@ -103,7 +107,10 @@ void RegisterServiceVolume( auto writeSub = src->FindSubgroup("request", "WriteBlocks"); auto zeroSub = src->FindSubgroup("request", "ZeroBlocks"); AddHistogramUserMetric( - GetUsBuckets(), + histogramCounterOptions.HasFlag( + EHistogramCounterOption::UseMsUnitsForTimeHistogram) + ? GetMsBuckets() + : GetUsBuckets(), dsc, commonLabels, {{writeSub, "ThrottlerDelay"}, {zeroSub, "ThrottlerDelay"}}, @@ -132,6 +139,7 @@ void RegisterServerVolumeInstance( const TString& diskId, const TString& instanceId, const bool reportZeroBlocksMetrics, + EHistogramCounterOptions histogramCounterOptions, TDynamicCounterPtr src) { if (instanceId.empty()) { @@ -184,7 +192,10 @@ void RegisterServerVolumeInstance( {{readSub, "MaxInProgressBytes"}}, DISK_READ_BYTES_IN_FLIGHT_BURST); AddHistogramUserMetric( - GetUsBuckets(), + histogramCounterOptions.HasFlag( + EHistogramCounterOption::UseMsUnitsForTimeHistogram) + ? GetMsBuckets() + : GetUsBuckets(), dsc, commonLabels, {{readSub, "Time"}}, @@ -242,7 +253,10 @@ void RegisterServerVolumeInstance( getWriteCounters("MaxInProgressBytes"), DISK_WRITE_BYTES_IN_FLIGHT_BURST); AddHistogramUserMetric( - GetUsBuckets(), + histogramCounterOptions.HasFlag( + EHistogramCounterOption::UseMsUnitsForTimeHistogram) + ? GetMsBuckets() + : GetUsBuckets(), dsc, commonLabels, getWriteCounters("Time"), diff --git a/cloud/blockstore/libs/diagnostics/user_counter.h b/cloud/blockstore/libs/diagnostics/user_counter.h index 1128b655e4f..44ff3c3fa9d 100644 --- a/cloud/blockstore/libs/diagnostics/user_counter.h +++ b/cloud/blockstore/libs/diagnostics/user_counter.h @@ -1,10 +1,11 @@ #pragma once +#include +#include + #include #include -#include - #include namespace NCloud::NBlockStore::NUserCounter { @@ -18,6 +19,7 @@ void RegisterServiceVolume( const TString& cloudId, const TString& folderId, const TString& diskId, + EHistogramCounterOptions histogramCounterOptions, NMonitoring::TDynamicCounterPtr src); void UnregisterServiceVolume( @@ -33,6 +35,7 @@ void RegisterServerVolumeInstance( const TString& diskId, const TString& instanceId, const bool reportZeroBlocksMetrics, + EHistogramCounterOptions histogramCounterOptions, NMonitoring::TDynamicCounterPtr src); void UnregisterServerVolumeInstance( diff --git a/cloud/blockstore/libs/diagnostics/user_counter_ut.cpp b/cloud/blockstore/libs/diagnostics/user_counter_ut.cpp index 4d395982b99..2c976cefe9a 100644 --- a/cloud/blockstore/libs/diagnostics/user_counter_ut.cpp +++ b/cloud/blockstore/libs/diagnostics/user_counter_ut.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace NCloud::NBlockStore::NUserCounter { @@ -27,12 +27,12 @@ NJson::TJsonValue GetValue(const auto& object, const auto& name) } } } - UNIT_ASSERT_C(false, "Value not found " + name); + EXPECT_TRUE(false) << "Value not found " << name; return NJson::TJsonValue{}; -}; +} -NJson::TJsonValue GetHist( - const auto& object, const auto& name, const auto& valueName) +NJson::TJsonValue +GetHist(const auto& object, const auto& name, const auto& valueName) { for (const auto& data: object["sensors"].GetArray()) { if (data["labels"]["name"] == name) { @@ -41,31 +41,30 @@ NJson::TJsonValue GetHist( } } } - UNIT_ASSERT_C(false, "Value not found " + name + "/" + valueName); + + EXPECT_TRUE(false) << "Value not found " << name << "/" << valueName; return NJson::TJsonValue{}; -}; +} void ValidateJsons( const NJson::TJsonValue& expectedJson, const NJson::TJsonValue& actualJson) { - for(const auto& jsonValue: expectedJson["sensors"].GetArray()) { + for (const auto& jsonValue: expectedJson["sensors"].GetArray()) { const TString name = jsonValue["labels"]["name"].GetString(); if (jsonValue.Has("hist")) { for (const auto* valueName: {"bounds", "buckets", "inf"}) { - UNIT_ASSERT_STRINGS_EQUAL_C( + EXPECT_EQ( NJson::WriteJson(GetHist(expectedJson, name, valueName)), - NJson::WriteJson(GetHist(actualJson, name, valueName)), - name - ); + NJson::WriteJson(GetHist(actualJson, name, valueName))) + << " for " << name << "/" << valueName; } } else { - UNIT_ASSERT_STRINGS_EQUAL_C( + EXPECT_EQ( NJson::WriteJson(GetValue(expectedJson, name)), - NJson::WriteJson(GetValue(actualJson, name)), - name - ); + NJson::WriteJson(GetValue(actualJson, name))) + << " for " << name; } } } @@ -85,23 +84,13 @@ void ValidateTestResult(IUserCounterSupplier* supplier, const TString& resource) } //////////////////////////////////////////////////////////////////////////////// + struct THistogramTestConfiguration { bool SetUnits; bool SetHistogramSingleCounter; bool SetHistogramMultipleCounter; - - static TVector All() - { - // all combinations except (*, false, false) - return { - {false, true, false}, - {false, false, true}, - {false, true, true}, - {true, true, false}, - {true, false, true}, - {true, true, true}}; - } + bool UseMsUnitsForTimeHistogram; }; void SetTimeHistogramCountersUs( @@ -144,7 +133,7 @@ void SetTimeHistogramCountersUs( histogram->Collect(5000000., 12); histogram->Collect(10000000., 13); histogram->Collect(35000000., 14); - histogram->Collect(100000000., 15); // Inf + histogram->Collect(100000000., 15); // Inf } if (config.SetHistogramMultipleCounter) { @@ -176,98 +165,230 @@ void SetTimeHistogramCountersUs( } } +void SetTimeHistogramCountersMs( + const TIntrusivePtr& counters, + const TString& histName, + const THistogramTestConfiguration& config) +{ + auto subgroup = counters->GetSubgroup("histogram", histName); + if (config.SetUnits) { + subgroup = subgroup->GetSubgroup("units", "msec"); + } + + if (config.SetHistogramSingleCounter) { + const auto& buckets = TRequestMsTimeBuckets::Buckets; + const auto& bounds = ConvertToHistBounds(buckets); + auto histogram = subgroup->GetHistogram( + histName, + NMonitoring::ExplicitHistogram(bounds)); + histogram->Collect(0.001, 1); + histogram->Collect(0.1, 2); + histogram->Collect(0.2, 3); + histogram->Collect(0.3, 4); + histogram->Collect(0.4, 5); + histogram->Collect(0.5, 6); + histogram->Collect(0.6, 7); + histogram->Collect(0.7, 8); + histogram->Collect(0.8, 9); + histogram->Collect(0.9, 0); + histogram->Collect(1., 1); + histogram->Collect(2., 2); + histogram->Collect(5., 3); + histogram->Collect(10., 4); + histogram->Collect(20., 5); + histogram->Collect(50., 6); + histogram->Collect(100., 7); + histogram->Collect(200., 8); + histogram->Collect(500., 9); + histogram->Collect(1000., 10); + histogram->Collect(2000., 11); + histogram->Collect(5000., 12); + histogram->Collect(10000., 13); + histogram->Collect(35000., 14); + histogram->Collect(100000., 15); // Inf + } + + if (config.SetHistogramMultipleCounter) { + subgroup->GetCounter("0.001ms")->Set(1); + subgroup->GetCounter("0.1ms")->Set(2); + subgroup->GetCounter("0.2ms")->Set(3); + subgroup->GetCounter("0.3ms")->Set(4); + subgroup->GetCounter("0.4ms")->Set(5); + subgroup->GetCounter("0.5ms")->Set(6); + subgroup->GetCounter("0.6ms")->Set(7); + subgroup->GetCounter("0.7ms")->Set(8); + subgroup->GetCounter("0.8ms")->Set(9); + subgroup->GetCounter("0.9ms")->Set(0); + subgroup->GetCounter("1ms")->Set(1); + subgroup->GetCounter("2ms")->Set(2); + subgroup->GetCounter("5ms")->Set(3); + subgroup->GetCounter("10ms")->Set(4); + subgroup->GetCounter("20ms")->Set(5); + subgroup->GetCounter("50ms")->Set(6); + subgroup->GetCounter("100ms")->Set(7); + subgroup->GetCounter("200ms")->Set(8); + subgroup->GetCounter("500ms")->Set(9); + subgroup->GetCounter("1000ms")->Set(10); + subgroup->GetCounter("2000ms")->Set(11); + subgroup->GetCounter("5000ms")->Set(12); + subgroup->GetCounter("10000ms")->Set(13); + subgroup->GetCounter("35000ms")->Set(14); + subgroup->GetCounter("Inf")->Set(15); + } +} + +class TUserWrapperTest + : public testing::TestWithParam> +{ +public: + TUserWrapperTest() = default; + ~TUserWrapperTest() override = default; + + static THistogramTestConfiguration GetHistogramTestConfiguration() + { + return THistogramTestConfiguration{ + .SetUnits = std::get<0>(GetParam()), + .SetHistogramSingleCounter = std::get<1>(GetParam()), + .SetHistogramMultipleCounter = std::get<2>(GetParam()), + .UseMsUnitsForTimeHistogram = std::get<3>(GetParam())}; + } +}; + } // namespace //////////////////////////////////////////////////////////////////////////////// using namespace NMonitoring; -Y_UNIT_TEST_SUITE(TUserWrapperTest) +TEST_P(TUserWrapperTest, UserServerVolumeInstanceTests) { - Y_UNIT_TEST(UserServerVolumeInstanceTests) + const auto histConfig = GetHistogramTestConfiguration(); + if (histConfig.SetHistogramSingleCounter == false && + histConfig.SetHistogramMultipleCounter == false) { - auto setCounters = [](const NMonitoring::TDynamicCounterPtr& stats, - const TString& name, - const THistogramTestConfiguration& histConfig) - { - auto request = stats->GetSubgroup("request", name); - request->GetCounter("Count")->Set(1); - request->GetCounter("MaxCount")->Set(10); - request->GetCounter("Errors/Fatal")->Set(2); - request->GetCounter("RequestBytes")->Set(3); - request->GetCounter("MaxRequestBytes")->Set(30); - request->GetCounter("InProgress")->Set(4); - request->GetCounter("MaxInProgress")->Set(40); - request->GetCounter("InProgressBytes")->Set(5); - request->GetCounter("MaxInProgressBytes")->Set(50); + return; + } + auto setCounters = [](const NMonitoring::TDynamicCounterPtr& stats, + const TString& name, + const THistogramTestConfiguration& histConfig) + { + auto request = stats->GetSubgroup("request", name); + request->GetCounter("Count")->Set(1); + request->GetCounter("MaxCount")->Set(10); + request->GetCounter("Errors/Fatal")->Set(2); + request->GetCounter("RequestBytes")->Set(3); + request->GetCounter("MaxRequestBytes")->Set(30); + request->GetCounter("InProgress")->Set(4); + request->GetCounter("MaxInProgress")->Set(40); + request->GetCounter("InProgressBytes")->Set(5); + request->GetCounter("MaxInProgressBytes")->Set(50); + + if (histConfig.UseMsUnitsForTimeHistogram) { + SetTimeHistogramCountersMs(request, "Time", histConfig); + } else { SetTimeHistogramCountersUs(request, "Time", histConfig); - }; - - struct TTestConfiguration - { - bool ReportZeroBlocksMetrics; - TString Resource; - }; - - std::vector testConfigurations = { - {true, "user_server_volume_instance_test"}, - {false, "user_server_volume_instance_skip_zero_blocks_test"}}; - - for (const auto& config: testConfigurations) { - for (const auto& histConfig: THistogramTestConfiguration::All()) { - auto stats = MakeIntrusive(); - setCounters(stats, "ReadBlocks", histConfig); - setCounters(stats, "WriteBlocks", histConfig); - setCounters(stats, "ZeroBlocks", histConfig); - - auto supplier = CreateUserCounterSupplier(); - RegisterServerVolumeInstance( - *supplier, - "cloudId", - "folderId", - "diskId", - "instanceId", - config.ReportZeroBlocksMetrics, - stats); - - ValidateTestResult(supplier.get(), config.Resource); - } } + }; + + struct TTestConfiguration + { + bool ReportZeroBlocksMetrics; + TString Resource; + }; + + std::vector testConfigurations = { + {true, "user_server_volume_instance_test"}, + {false, "user_server_volume_instance_skip_zero_blocks_test"}}; + + for (const auto& config: testConfigurations) { + auto stats = MakeIntrusive(); + setCounters(stats, "ReadBlocks", histConfig); + setCounters(stats, "WriteBlocks", histConfig); + setCounters(stats, "ZeroBlocks", histConfig); + + auto supplier = CreateUserCounterSupplier(); + EHistogramCounterOptions histogramCounterOptions; + if (histConfig.UseMsUnitsForTimeHistogram) { + histogramCounterOptions |= + EHistogramCounterOption::UseMsUnitsForTimeHistogram; + } + RegisterServerVolumeInstance( + *supplier, + "cloudId", + "folderId", + "diskId", + "instanceId", + config.ReportZeroBlocksMetrics, + histogramCounterOptions, + stats); + + ValidateTestResult(supplier.get(), config.Resource); } +} - Y_UNIT_TEST(UserServiceVolumeInstanceTests) +TEST_P(TUserWrapperTest, UserServiceVolumeInstanceTests) +{ + const auto histConfig = GetHistogramTestConfiguration(); + if (histConfig.SetHistogramSingleCounter == false && + histConfig.SetHistogramMultipleCounter == false) { - auto makeCounters = [](const NMonitoring::TDynamicCounterPtr& stats, - const TString& name, - const THistogramTestConfiguration& histConfig) - { - stats->GetCounter("UsedQuota")->Set(1); - stats->GetCounter("MaxUsedQuota")->Set(10); - - auto request = stats->GetSubgroup("request", name); + return; + } + + auto makeCounters = [](const NMonitoring::TDynamicCounterPtr& stats, + const TString& name, + const THistogramTestConfiguration& histConfig) + { + stats->GetCounter("UsedQuota")->Set(1); + stats->GetCounter("MaxUsedQuota")->Set(10); + + auto request = stats->GetSubgroup("request", name); + if (histConfig.UseMsUnitsForTimeHistogram) { + SetTimeHistogramCountersMs(request, "ThrottlerDelay", histConfig); + } else { SetTimeHistogramCountersUs(request, "ThrottlerDelay", histConfig); - }; - - for (const auto& histConfig: THistogramTestConfiguration::All()) { - auto stats = MakeIntrusive(); - - makeCounters(stats, "ReadBlocks", histConfig); - makeCounters(stats, "WriteBlocks", histConfig); - makeCounters(stats, "ZeroBlocks", histConfig); - - auto supplier = CreateUserCounterSupplier(); - RegisterServiceVolume( - *supplier, - "cloudId", - "folderId", - "diskId", - stats); - ValidateTestResult( - supplier.get(), - "user_service_volume_instance_test"); } + }; + + auto stats = MakeIntrusive(); + + makeCounters(stats, "ReadBlocks", histConfig); + makeCounters(stats, "WriteBlocks", histConfig); + makeCounters(stats, "ZeroBlocks", histConfig); + + auto supplier = CreateUserCounterSupplier(); + EHistogramCounterOptions histogramCounterOptions; + if (histConfig.UseMsUnitsForTimeHistogram) { + histogramCounterOptions |= + EHistogramCounterOption::UseMsUnitsForTimeHistogram; } + RegisterServiceVolume( + *supplier, + "cloudId", + "folderId", + "diskId", + histogramCounterOptions, + stats); + ValidateTestResult(supplier.get(), "user_service_volume_instance_test"); } +INSTANTIATE_TEST_SUITE_P( + , + TUserWrapperTest, + testing::Combine( + testing::Bool(), + testing::Bool(), + testing::Bool(), + testing::Bool()), + [](const testing::TestParamInfo& info) + -> std::string + { + const auto name = TStringBuilder() << std::get<0>(info.param) << "_" + << std::get<1>(info.param) << "_" + << std::get<2>(info.param) << "_" + << std::get<3>(info.param); + return name; + }); + } // namespace NCloud::NBlockStore::NUserCounter diff --git a/cloud/blockstore/libs/diagnostics/volume_stats.cpp b/cloud/blockstore/libs/diagnostics/volume_stats.cpp index e660ac0140f..b9fad7726a3 100644 --- a/cloud/blockstore/libs/diagnostics/volume_stats.cpp +++ b/cloud/blockstore/libs/diagnostics/volume_stats.cpp @@ -898,6 +898,7 @@ class TVolumeStats final volumeConfig.GetDiskId(), realInstanceId.GetInstanceId(), reportZeroBlocksMetrics, + DiagnosticsConfig->GetHistogramCounterOptions(), countersGroup); return info; diff --git a/cloud/blockstore/libs/diagnostics/ya.make b/cloud/blockstore/libs/diagnostics/ya.make index be2464782b9..e1c64299829 100644 --- a/cloud/blockstore/libs/diagnostics/ya.make +++ b/cloud/blockstore/libs/diagnostics/ya.make @@ -33,12 +33,12 @@ PEERDIR( cloud/blockstore/libs/diagnostics/events cloud/blockstore/libs/service cloud/blockstore/public/api/protos - + cloud/storage/core/libs/common cloud/storage/core/libs/diagnostics cloud/storage/core/libs/throttling cloud/storage/core/libs/user_stats/counter - + library/cpp/deprecated/atomic library/cpp/digest/crc32c library/cpp/eventlog @@ -57,5 +57,6 @@ PEERDIR( END() RECURSE_FOR_TESTS( + gtest ut ) diff --git a/cloud/blockstore/libs/storage/stats_service/stats_service_actor_solomon.cpp b/cloud/blockstore/libs/storage/stats_service/stats_service_actor_solomon.cpp index a0689c9a7c6..455081b5020 100644 --- a/cloud/blockstore/libs/storage/stats_service/stats_service_actor_solomon.cpp +++ b/cloud/blockstore/libs/storage/stats_service/stats_service_actor_solomon.cpp @@ -95,6 +95,7 @@ void TStatsServiceActor::RegisterVolumeSelfCounters( volume.VolumeInfo.GetCloudId(), volume.VolumeInfo.GetFolderId(), volume.VolumeInfo.GetDiskId(), + DiagnosticsConfig->GetHistogramCounterOptions(), volumeCounters); volume.SelfCountersRegistered = true; diff --git a/cloud/filestore/config/diagnostics.proto b/cloud/filestore/config/diagnostics.proto index 13d560a21fc..048ce3d8d36 100644 --- a/cloud/filestore/config/diagnostics.proto +++ b/cloud/filestore/config/diagnostics.proto @@ -148,4 +148,7 @@ message TDiagnosticsConfig // Limit number of records flushed in each frame during flush iteration optional uint64 ProfileLogMaxFrameFlushRecords = 30; + // Report time histograms with milliseconds buckets. Percentiles are still + // reported in microseconds. + optional bool UseMsUnitsForTimeHistogram = 31; } diff --git a/cloud/filestore/libs/diagnostics/config.cpp b/cloud/filestore/libs/diagnostics/config.cpp index c74cc1857d1..181c735300f 100644 --- a/cloud/filestore/libs/diagnostics/config.cpp +++ b/cloud/filestore/libs/diagnostics/config.cpp @@ -38,11 +38,12 @@ namespace { xxx(MonitoringUrlData, TMonitoringUrlData, {} )\ xxx(ReportHistogramAsMultipleCounters, bool, true )\ xxx(ReportHistogramAsSingleCounter, bool, false )\ + xxx(UseMsUnitsForTimeHistogram, bool, false )\ \ xxx(HDDFileSystemPerformanceProfile, TFileSystemPerformanceProfile, {} )\ xxx(SSDFileSystemPerformanceProfile, TFileSystemPerformanceProfile, {} )\ \ - xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\ + xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\ \ xxx(ProfileLogMaxFlushRecords, ui64, 0 )\ xxx(ProfileLogMaxFrameFlushRecords, ui64, 0 )\ @@ -150,6 +151,10 @@ EHistogramCounterOptions TDiagnosticsConfig::GetHistogramCounterOptions() const if (GetReportHistogramAsSingleCounter()) { histogramCounterOptions |= EHistogramCounterOption::ReportSingleCounter; } + if (GetUseMsUnitsForTimeHistogram()) { + histogramCounterOptions |= + EHistogramCounterOption::UseMsUnitsForTimeHistogram; + } return histogramCounterOptions; } diff --git a/cloud/filestore/libs/diagnostics/config.h b/cloud/filestore/libs/diagnostics/config.h index 23e4f255109..19ccc87973b 100644 --- a/cloud/filestore/libs/diagnostics/config.h +++ b/cloud/filestore/libs/diagnostics/config.h @@ -133,6 +133,7 @@ class TDiagnosticsConfig bool GetReportHistogramAsMultipleCounters() const; bool GetReportHistogramAsSingleCounter() const; EHistogramCounterOptions GetHistogramCounterOptions() const; + bool GetUseMsUnitsForTimeHistogram() const; TFileSystemPerformanceProfile GetHDDFileSystemPerformanceProfile() const; TFileSystemPerformanceProfile GetSSDFileSystemPerformanceProfile() const; diff --git a/cloud/storage/core/libs/diagnostics/histogram_counter_options.h b/cloud/storage/core/libs/diagnostics/histogram_counter_options.h index dc480778227..4ddb0fb1e79 100644 --- a/cloud/storage/core/libs/diagnostics/histogram_counter_options.h +++ b/cloud/storage/core/libs/diagnostics/histogram_counter_options.h @@ -7,10 +7,12 @@ namespace NCloud { enum class EHistogramCounterOption { ReportSingleCounter = (1 << 0), ReportMultipleCounters = (1 << 1), + // Report time histograms with milliseconds buckets. Percentiles are still + // reported in microseconds. + UseMsUnitsForTimeHistogram = (1 << 2), }; Y_DECLARE_FLAGS(EHistogramCounterOptions, EHistogramCounterOption); Y_DECLARE_OPERATORS_FOR_FLAGS(EHistogramCounterOptions); } // namespace NCloud - diff --git a/cloud/storage/core/libs/diagnostics/histogram_types.h b/cloud/storage/core/libs/diagnostics/histogram_types.h index 35251c443d8..05f2b37263b 100644 --- a/cloud/storage/core/libs/diagnostics/histogram_types.h +++ b/cloud/storage/core/libs/diagnostics/histogram_types.h @@ -28,6 +28,7 @@ struct TRequestUsTimeBuckets }}; static constexpr TStringBuf Units = "usec"; + static constexpr double PercentileMultiplier = 1.0; static TVector MakeNames(); @@ -57,6 +58,7 @@ struct TRequestUsTimeBucketsLowResolution }}; static constexpr TStringBuf Units = "usec"; + static constexpr double PercentileMultiplier = 1.0; static TVector MakeNames(); }; @@ -67,13 +69,17 @@ struct TRequestMsTimeBuckets { static constexpr size_t BUCKETS_COUNT = TRequestUsTimeBuckets::BUCKETS_COUNT; + static constexpr TStringBuf Units = ""; + // Uses buckets in milliseconds for the histogram, but reports values in + // microseconds for percentiles. + static constexpr double PercentileMultiplier = 1000.0; - static constexpr auto MakeArray = []( - const std::array& array) + static constexpr auto MakeArray = + [](const std::array& array) { std::array result; for (size_t i = 0; i + 1 < array.size(); ++i) { - result[i] = array[i] / 1000; + result[i] = array[i] / PercentileMultiplier; } result.back() = std::numeric_limits::max(); return result; @@ -82,8 +88,6 @@ struct TRequestMsTimeBuckets static constexpr std::array Buckets = MakeArray(TRequestUsTimeBuckets::Buckets); - static constexpr TStringBuf Units = "msec"; - static TVector MakeNames(); }; @@ -102,6 +106,7 @@ struct TQueueSizeBuckets }}; static constexpr TStringBuf Units = ""; + static constexpr double PercentileMultiplier = 1.0; static TVector MakeNames(); }; @@ -117,6 +122,7 @@ struct TKbSizeBuckets }}; static constexpr TStringBuf Units = "KB"; + static constexpr double PercentileMultiplier = 1.0; // TODO: multiply by 1024? static TVector MakeNames(); }; diff --git a/cloud/storage/core/libs/diagnostics/request_counters.cpp b/cloud/storage/core/libs/diagnostics/request_counters.cpp index 9ee0fa8e175..b1d603e8359 100644 --- a/cloud/storage/core/libs/diagnostics/request_counters.cpp +++ b/cloud/storage/core/libs/diagnostics/request_counters.cpp @@ -32,6 +32,7 @@ struct THistBase { const TString Name; const TString Units; + const double PercentileMultiplier; const TBucketBounds HistBounds; const EHistogramCounterOptions CounterOptions; @@ -41,6 +42,7 @@ struct THistBase THistBase(TString name, EHistogramCounterOptions counterOptions) : Name(std::move(name)) , Units(TDerived::Units) + , PercentileMultiplier(TDerived::PercentileMultiplier) , HistBounds(ConvertToHistBounds(TDerived::Buckets)) , CounterOptions(counterOptions) , Hist( @@ -111,12 +113,16 @@ struct THistBase { return Name; } + + double GetPercentileMultiplier() const + { + return PercentileMultiplier; + } }; //////////////////////////////////////////////////////////////////////////////// -struct TTimeHist - : public THistBase +struct TUsTimeHist: public THistBase { using THistBase::THistBase; @@ -128,6 +134,102 @@ struct TTimeHist //////////////////////////////////////////////////////////////////////////////// +struct TMsTimeHist: public THistBase +{ + using THistBase::THistBase; + + void Increment(TDuration requestTime, ui64 count = 1) + { + THistBase::Increment( + requestTime.MicroSeconds() / GetPercentileMultiplier(), + count); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Allows to use either microseconds or milliseconds for time histograms. +class TAdaptiveTimeHist +{ +private: + using TTimeHistVariant = std::variant; + std::unique_ptr TimeHistPtr; + +public: + TAdaptiveTimeHist(TString name, EHistogramCounterOptions counterOptions) + { + if (counterOptions & + EHistogramCounterOption::UseMsUnitsForTimeHistogram) + { + TimeHistPtr = std::make_unique( + std::in_place_type, + std::move(name), + counterOptions); + } else { + TimeHistPtr = std::make_unique( + std::in_place_type, + std::move(name), + counterOptions); + } + } + ~TAdaptiveTimeHist() = default; + + TAdaptiveTimeHist(const TAdaptiveTimeHist&) = delete; + TAdaptiveTimeHist(TAdaptiveTimeHist&&) = default; + TAdaptiveTimeHist& operator=(const TAdaptiveTimeHist&) = delete; + TAdaptiveTimeHist& operator=(TAdaptiveTimeHist&&) = default; + + void Increment(TDuration requestTime, ui64 count = 1) + { + std::visit( + [&](auto& timeHist) { timeHist.Increment(requestTime, count); }, + *TimeHistPtr); + } + + template + void Register(Args&&... args) + { + std::visit( + [&](auto& timeHist) + { timeHist.Register(std::forward(args)...); }, + *TimeHistPtr); + } + + [[nodiscard]] TVector GetBuckets() const + { + return std::visit( + [&](auto& timeHist) -> TVector + { return timeHist.GetBuckets(); }, + *TimeHistPtr); + } + + [[nodiscard]] const TString& GetUnits() const + { + return std::visit( + [&](auto& timeHist) -> const TString& + { return timeHist.GetUnits(); }, + *TimeHistPtr); + } + + [[nodiscard]] const TString& GetName() const + { + return std::visit( + [&](auto& timeHist) -> const TString& + { return timeHist.GetName(); }, + *TimeHistPtr); + } + + [[nodiscard]] double GetPercentileMultiplier() const + { + return std::visit( + [&](auto& timeHist) -> double + { return timeHist.GetPercentileMultiplier(); }, + *TimeHistPtr); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + struct TSizeHist : public THistBase { @@ -190,7 +292,8 @@ class TRequestPercentiles GetDefaultPercentiles()); for (ui32 i = 0; i < Min(Counters.size(), result.size()); ++i) { - *Counters[i] = std::lround(result[i]); + *Counters[i] = + std::lround(result[i] * SrcHistogram.GetPercentileMultiplier()); } } }; @@ -289,19 +392,19 @@ struct TRequestCounters::TStatCounters TSizeHist SizeHist; TRequestPercentiles SizePercentiles; - TTimeHist TimeHist; - TTimeHist TimeHistUnaligned; - TRequestPercentiles TimePercentiles; + TAdaptiveTimeHist TimeHist; + TAdaptiveTimeHist TimeHistUnaligned; + TRequestPercentiles TimePercentiles; - TTimeHist ExecutionTimeHist; - TTimeHist ExecutionTimeHistUnaligned; - TRequestPercentiles ExecutionTimePercentiles; + TAdaptiveTimeHist ExecutionTimeHist; + TAdaptiveTimeHist ExecutionTimeHistUnaligned; + TRequestPercentiles ExecutionTimePercentiles; - TTimeHist RequestCompletionTimeHist; - TRequestPercentiles RequestCompletionTimePercentiles; + TAdaptiveTimeHist RequestCompletionTimeHist; + TRequestPercentiles RequestCompletionTimePercentiles; - TTimeHist PostponedTimeHist; - TRequestPercentiles PostponedTimePercentiles; + TAdaptiveTimeHist PostponedTimeHist; + TRequestPercentiles PostponedTimePercentiles; TMaxCalculator MaxTimeCalc; TMaxCalculator MaxTotalTimeCalc; diff --git a/cloud/storage/core/libs/diagnostics/request_counters_ut.cpp b/cloud/storage/core/libs/diagnostics/request_counters_ut.cpp index 239ca507fc5..40d7aa7c76c 100644 --- a/cloud/storage/core/libs/diagnostics/request_counters_ut.cpp +++ b/cloud/storage/core/libs/diagnostics/request_counters_ut.cpp @@ -4,7 +4,7 @@ #include #include -#include "cloud/storage/core/libs/diagnostics/histogram_types.h" +#include #include #include @@ -13,6 +13,7 @@ #include #include +#include namespace NCloud { @@ -703,12 +704,177 @@ Y_UNIT_TEST_SUITE(TRequestCountersTest) } } - Y_UNIT_TEST(ShouldReportHistogramAsMultipleSensors) + void ShouldReportCompoundTimeHistogramWithMultipleCounters( + EHistogramCounterOptions options) + { + auto monitoring = CreateMonitoringServiceStub(); + auto counters = MakeRequestCountersPtr( + TRequestCounters::EOption::ReportDataPlaneHistogram, + options | EHistogramCounterOption::ReportMultipleCounters); + counters->Register(*monitoring->GetCounters()); + + AddRequestStats( + *counters, + WriteRequestType, + { + {1_KB, TDuration::Seconds(8), TDuration::Zero()}, + {1_KB, TDuration::Seconds(20), TDuration::Zero()}, + {1_KB, TDuration::Seconds(30), TDuration::Zero()}, + {1_KB, TDuration::Seconds(37), TDuration::Zero()}, + {1_KB, TDuration::Seconds(50), TDuration::Zero()}, + {1_KB, TDuration::Seconds(100), TDuration::Zero()}, + }); + + counters->UpdateStats(); + const auto timeHist = monitoring->GetCounters() + ->GetSubgroup("request", "WriteBlocks") + ->GetSubgroup("histogram", "Time"); + + const auto usGroup = timeHist->GetSubgroup("units", "usec"); + const auto msGroup = timeHist; + + const auto validateCounters = + [](TIntrusivePtr group, + TVector bucketNames, + TStringBuf suffix, + bool shouldExist) + { + TMap expectedHistogramValues; + for (const auto& bucketName: bucketNames) { + expectedHistogramValues[bucketName] = 0; + } + expectedHistogramValues[TString("10000") + suffix] = 1; + expectedHistogramValues[TString("35000") + suffix] = 2; + expectedHistogramValues["Inf"] = 3; + + for (const auto& [name, value]: expectedHistogramValues) { + const auto counter = group->FindCounter(name); + if (shouldExist) { + UNIT_ASSERT_C( + counter, + "Counter " + name.Quote() + " not found"); + UNIT_ASSERT_VALUES_EQUAL(counter->Val(), value); + } else { + UNIT_ASSERT_C( + !counter, + "Counter " + name.Quote() + " should not exist"); + } + } + }; + + validateCounters( + usGroup, + TRequestUsTimeBuckets::MakeNames(), + "000", + !(options & EHistogramCounterOption::UseMsUnitsForTimeHistogram)); + validateCounters( + msGroup, + TRequestMsTimeBuckets::MakeNames(), + "ms", + options & EHistogramCounterOption::UseMsUnitsForTimeHistogram); + } + + Y_UNIT_TEST(ShouldReportCompoundTimeHistogram_UseMsUnitsForTimeHistogram) + { + ShouldReportCompoundTimeHistogramWithMultipleCounters( + EHistogramCounterOption::UseMsUnitsForTimeHistogram); + } + + Y_UNIT_TEST(ShouldReportCompoundTimeHistogram_UseUsUnitsForTimeHistogram) + { + ShouldReportCompoundTimeHistogramWithMultipleCounters( + EHistogramCounterOptions()); + } + + void ShouldReportCompoundTimeHistogramWithSingleCounter( + EHistogramCounterOptions options) { auto monitoring = CreateMonitoringServiceStub(); auto counters = MakeRequestCountersPtr( TRequestCounters::EOption::ReportDataPlaneHistogram, - EHistogramCounterOption::ReportMultipleCounters); + options | EHistogramCounterOption::ReportSingleCounter); + counters->Register(*monitoring->GetCounters()); + + AddRequestStats( + *counters, + WriteRequestType, + { + {1_KB, TDuration::Seconds(8), TDuration::Zero()}, + {1_KB, TDuration::Seconds(20), TDuration::Zero()}, + {1_KB, TDuration::Seconds(30), TDuration::Zero()}, + {1_KB, TDuration::Seconds(37), TDuration::Zero()}, + {1_KB, TDuration::Seconds(50), TDuration::Zero()}, + {1_KB, TDuration::Seconds(100), TDuration::Zero()}, + }); + + const TMap expectedHistogramValues = { + {22, 1}, // 10000ms + {23, 2}, // 35000ms + {24, 3}, // Inf + }; + + counters->UpdateStats(); + + const auto timeHist = monitoring->GetCounters() + ->GetSubgroup("request", "WriteBlocks") + ->GetSubgroup("histogram", "Time"); + + const auto usGroup = timeHist->GetSubgroup("units", "usec"); + const auto msGroup = timeHist; + + const auto validateCounters = + [expectedHistogramValues]( + TIntrusivePtr group, + bool shouldExist) + { + const auto histogram = group->FindHistogram("Time"); + if (!shouldExist) { + UNIT_ASSERT(!histogram); + return; + } + UNIT_ASSERT(histogram); + const auto snapshot = histogram->Snapshot(); + UNIT_ASSERT_VALUES_EQUAL( + snapshot->Count(), + TRequestMsTimeBuckets::Buckets.size()); + for (size_t bucketId = 0; bucketId < snapshot->Count(); bucketId++) + { + auto expectedValue = expectedHistogramValues.contains(bucketId) + ? expectedHistogramValues.at(bucketId) + : 0; + UNIT_ASSERT_VALUES_EQUAL( + snapshot->Value(bucketId), + expectedValue); + } + }; + + validateCounters( + usGroup, + !(options & EHistogramCounterOption::UseMsUnitsForTimeHistogram)); + validateCounters( + msGroup, + options & EHistogramCounterOption::UseMsUnitsForTimeHistogram); + } + + Y_UNIT_TEST( + ShouldReportCompoundTimeHistogramWithSingleCounter_UseMsUnitsForTimeHistogram) + { + ShouldReportCompoundTimeHistogramWithSingleCounter( + EHistogramCounterOption::UseMsUnitsForTimeHistogram); + } + + Y_UNIT_TEST( + ShouldReportCompoundTimeHistogramWithSingleCounter_UseUsUnitsForTimeHistogram) + { + ShouldReportCompoundTimeHistogramWithSingleCounter( + EHistogramCounterOptions()); + } + + Y_UNIT_TEST(ShouldReportHistogramAsMultipleSensors) + { + auto monitoring = CreateMonitoringServiceStub(); + auto counters = MakeRequestCountersPtr( + TRequestCounters::EOption::ReportDataPlaneHistogram); counters->Register(*monitoring->GetCounters()); AddRequestStats(*counters, WriteRequestType, { @@ -788,8 +954,7 @@ Y_UNIT_TEST_SUITE(TRequestCountersTest) { auto monitoring = CreateMonitoringServiceStub(); auto counters = MakeRequestCountersPtr( - TRequestCounters::EOption::ReportDataPlaneHistogram, - {}); + TRequestCounters::EOption::ReportDataPlaneHistogram); counters->Register(*monitoring->GetCounters()); AddRequestStats(*counters, WriteRequestType, { diff --git a/cloud/storage/core/libs/diagnostics/weighted_percentile.cpp b/cloud/storage/core/libs/diagnostics/weighted_percentile.cpp index 163cab88950..a8a0283d2a2 100644 --- a/cloud/storage/core/libs/diagnostics/weighted_percentile.cpp +++ b/cloud/storage/core/libs/diagnostics/weighted_percentile.cpp @@ -70,8 +70,8 @@ TVector CalculateWeightedPercentiles( current = (double)(prevSum + bit->second) / totalSum; } if (current >= pit->first) { - const auto delta = pit->first * totalSum - prevSum; - const auto part = (bit->first - prevLimit) + const double delta = pit->first * totalSum - prevSum; + const double part = (bit->first - prevLimit) * (delta / (bit->second ? bit->second : 1)); result.push_back(std::min(prevLimit + part, greatestFiniteLimit)); ++pit; diff --git a/cloud/storage/core/libs/user_stats/counter/user_counter.cpp b/cloud/storage/core/libs/user_stats/counter/user_counter.cpp index a2d46c68615..beb356bcd34 100644 --- a/cloud/storage/core/libs/user_stats/counter/user_counter.cpp +++ b/cloud/storage/core/libs/user_stats/counter/user_counter.cpp @@ -146,6 +146,13 @@ std::shared_ptr CreateUserCounterSupplierStub() /////////////////////////////////////////////////////////////////////////////// +TBucketsWithUnits GetMsBuckets() +{ + constexpr auto Identity = [](double data) { return data; }; + static const auto Buckets = MakeBuckets(Identity); + return {Buckets, "msec"}; +} + TBucketsWithUnits GetUsBuckets() { constexpr auto UsToMs = [](double data) { diff --git a/cloud/storage/core/libs/user_stats/counter/user_counter.h b/cloud/storage/core/libs/user_stats/counter/user_counter.h index b927dc9f450..33ec871f58f 100644 --- a/cloud/storage/core/libs/user_stats/counter/user_counter.h +++ b/cloud/storage/core/libs/user_stats/counter/user_counter.h @@ -66,6 +66,7 @@ static constexpr size_t BUCKETS_COUNT = 25; using TBuckets = std::array; using TBucketsWithUnits = std::pair; +TBucketsWithUnits GetMsBuckets(); TBucketsWithUnits GetUsBuckets(); //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/storage/core/protos/diagnostics.proto b/cloud/storage/core/protos/diagnostics.proto index 79cfffcc6b5..b0a531d3f95 100644 --- a/cloud/storage/core/protos/diagnostics.proto +++ b/cloud/storage/core/protos/diagnostics.proto @@ -12,4 +12,3 @@ enum EStatsFetcherType CGROUP = 0; TASKSTATS = 1; }; -