Skip to content

Commit b2c042a

Browse files
authored
impl(bigtable): introduce connectivity error count metrics (#15370)
* impl(bigtable): introduce connectivity error count metrics.
1 parent 69f7f8a commit b2c042a

File tree

4 files changed

+379
-14
lines changed

4 files changed

+379
-14
lines changed

google/cloud/bigtable/internal/metrics.cc

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,18 @@ LabelMap IntoLabelMap(ResourceLabels const& r, DataLabels const& d,
7777
return labels;
7878
}
7979

80+
// Reports whether the server's initial metadata for this call contains a
// "server-timing" entry.
bool HasServerTiming(grpc::ClientContext const& client_context) {
  auto const& metadata = client_context.GetServerInitialMetadata();
  return metadata.find("server-timing") != metadata.end();
}
85+
86+
// Classifies an attempt as a connectivity error: its status code is anything
// other than kDeadlineExceeded and the call carries no "server-timing"
// initial metadata.
bool IsConnectivityError(google::cloud::Status const& status,
                         grpc::ClientContext const& client_context) {
  // A deadline expiration is never counted; the metadata is not even
  // inspected in that case.
  if (status.code() == google::cloud::StatusCode::kDeadlineExceeded) {
    return false;
  }
  return !HasServerTiming(client_context);
}
91+
8092
absl::optional<google::bigtable::v2::ResponseParams>
8193
GetResponseParamsFromTrailingMetadata(
8294
grpc::ClientContext const& client_context) {
@@ -344,6 +356,48 @@ std::unique_ptr<Metric> ServerLatency::clone(ResourceLabels resource_labels,
344356
return m;
345357
}
346358

359+
ConnectivityErrorCount::ConnectivityErrorCount(
360+
std::string const& instrumentation_scope,
361+
opentelemetry::nostd::shared_ptr<
362+
opentelemetry::metrics::MeterProvider> const& provider)
363+
: connectivity_error_count_(
364+
provider
365+
->GetMeter(instrumentation_scope,
366+
kMeterInstrumentationScopeVersion)
367+
->CreateUInt64Counter("connectivity_error_count")
368+
.release()) {}
369+
370+
// Processes one finished attempt: refreshes the cluster/zone labels from the
// trailing-metadata response params (when present), records the attempt's
// status code as a label, and bumps the error tally when the attempt looks
// like a connectivity error.
void ConnectivityErrorCount::PostCall(opentelemetry::context::Context const&,
                                      grpc::ClientContext const& client_context,
                                      PostCallParams const& p) {
  if (auto params = GetResponseParamsFromTrailingMetadata(client_context)) {
    resource_labels_.cluster = params->cluster_id();
    resource_labels_.zone = params->zone_id();
  }

  data_labels_.status = StatusCodeToString(p.attempt_status.code());

  // An attempt is counted when either location label is still empty, or when
  // IsConnectivityError() says so. The latter is only consulted when both
  // labels are populated.
  bool const location_unknown =
      resource_labels_.cluster.empty() || resource_labels_.zone.empty();
  if (location_unknown ||
      IsConnectivityError(p.attempt_status, client_context)) {
    ++num_errors_;
  }
}
385+
386+
// Reports the accumulated error tally to the counter, attributed with the
// label map built from the current resource and data labels.
void ConnectivityErrorCount::OnDone(
    opentelemetry::context::Context const& context, OnDoneParams const&) {
  // "streaming" is passed as the filtered data label; presumably it is left
  // out of the resulting map -- confirm against IntoLabelMap().
  std::set<std::string> const filtered_labels{"streaming"};
  auto labels = IntoLabelMap(resource_labels_, data_labels_, filtered_labels);
  connectivity_error_count_->Add(num_errors_, std::move(labels), context);
}
392+
393+
// Returns a copy of this metric whose resource and data labels are replaced
// by the supplied ones. All other state is copied from `*this`.
std::unique_ptr<Metric> ConnectivityErrorCount::clone(
    ResourceLabels resource_labels, DataLabels data_labels) const {
  auto copy = std::make_unique<ConnectivityErrorCount>(*this);
  copy->resource_labels_ = std::move(resource_labels);
  copy->data_labels_ = std::move(data_labels);
  return copy;
}
400+
347401
ApplicationBlockingLatency::ApplicationBlockingLatency(
348402
std::string const& instrumentation_scope,
349403
opentelemetry::nostd::shared_ptr<

google/cloud/bigtable/internal/metrics.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ using LabelMap = std::unordered_map<std::string, std::string>;
5656
LabelMap IntoLabelMap(ResourceLabels const& r, DataLabels const& d,
5757
std::set<std::string> const& filtered_data_labels = {});
5858

59+
bool HasServerTiming(grpc::ClientContext const& client_context);
60+
bool IsConnectivityError(google::cloud::Status const& status,
61+
grpc::ClientContext const& client_context);
5962
absl::optional<google::bigtable::v2::ResponseParams>
6063
GetResponseParamsFromTrailingMetadata(
6164
grpc::ClientContext const& client_context);
@@ -250,6 +253,28 @@ class ApplicationBlockingLatency : public Metric {
250253
std::vector<LatencyDuration> pending_latencies_;
251254
};
252255

256+
class ConnectivityErrorCount : public Metric {
257+
public:
258+
ConnectivityErrorCount(
259+
std::string const& instrumentation_scope,
260+
opentelemetry::nostd::shared_ptr<
261+
opentelemetry::metrics::MeterProvider> const& provider);
262+
void PostCall(opentelemetry::context::Context const&,
263+
grpc::ClientContext const& client_context,
264+
PostCallParams const& p) override;
265+
void OnDone(opentelemetry::context::Context const& context,
266+
OnDoneParams const&) override;
267+
std::unique_ptr<Metric> clone(ResourceLabels resource_labels,
268+
DataLabels data_labels) const override;
269+
270+
private:
271+
ResourceLabels resource_labels_;
272+
DataLabels data_labels_;
273+
std::uint64_t num_errors_ = 0;
274+
opentelemetry::nostd::shared_ptr<
275+
opentelemetry::metrics::Counter<std::uint64_t>>
276+
connectivity_error_count_;
277+
};
253278
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
254279
} // namespace bigtable_internal
255280
} // namespace cloud

0 commit comments

Comments
 (0)