@@ -99,6 +99,7 @@ type ProxyStore struct {
9999
// proxyStoreMetrics groups the Prometheus metrics recorded by the proxy store.
100100type proxyStoreMetrics struct {
// Counter of empty stream responses — presumably the metric registered as
// "thanos_proxy_store_empty_stream_responses_total"; the assignment is not visible here, confirm.
101101 emptyStreamResponses prometheus.Counter
// Counter of store failures, labelled by "group" and "replica". It is incremented with
// empty label values when a warning cannot be attributed to a specific store.
102+ storeFailureCount * prometheus.CounterVec
102103}
103104
104105func newProxyStoreMetrics (reg prometheus.Registerer ) * proxyStoreMetrics {
@@ -108,6 +109,10 @@ func newProxyStoreMetrics(reg prometheus.Registerer) *proxyStoreMetrics {
108109 Name : "thanos_proxy_store_empty_stream_responses_total" ,
109110 Help : "Total number of empty responses received." ,
110111 })
112+ m .storeFailureCount = promauto .With (reg ).NewCounterVec (prometheus.CounterOpts {
113+ Name : "thanos_proxy_store_failure_total" ,
114+ Help : "Total number of store failures." ,
115+ }, []string {"group" , "replica" })
111116
112117 return & m
113118}
@@ -431,7 +436,8 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.
431436 if err != nil {
432437 // NB: respSet is nil in case of error.
433438 level .Error (reqLogger ).Log ("err" , err )
434- level .Warn (s .logger ).Log ("msg" , "Store failure" , "group" , st .GroupKey (), "replica" , st .ReplicaKey ())
439+ level .Warn (s .logger ).Log ("msg" , "Store failure" , "group" , st .GroupKey (), "replica" , st .ReplicaKey (), "err" , err )
440+ s .metrics .storeFailureCount .WithLabelValues (st .GroupKey (), st .ReplicaKey ()).Inc ()
435441 bumpCounter (st .GroupKey (), st .ReplicaKey (), failedStores )
436442 totalFailedStores ++
437443 if r .PartialResponseStrategy == storepb .PartialResponseStrategy_GROUP_REPLICA {
@@ -464,14 +470,18 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.
464470
465471 if resp .GetWarning () != "" {
466472 totalFailedStores ++
467- level .Error (s .logger ).Log ("msg" , "Series: warning from store" , "warning" , resp .GetWarning ())
473+ maxWarningBytes := 2000
474+ warning := resp .GetWarning ()[:min (maxWarningBytes , len (resp .GetWarning ()))]
475+ level .Error (s .logger ).Log ("msg" , "Store failure with warning" , "warning" , warning )
476+ // Don't have group/replica keys here, so we can't attribute the warning to a specific store.
477+ s .metrics .storeFailureCount .WithLabelValues ("" , "" ).Inc ()
468478 if r .PartialResponseStrategy == storepb .PartialResponseStrategy_GROUP_REPLICA {
469479 // TODO: attribute the warning to the store(group key and replica key) that produced it.
470480 // Each client streams a sequence of time series, so it's not trivial to attribute the warning to a specific client.
471481 if totalFailedStores > 1 {
472482 level .Error (reqLogger ).Log ("msg" , "more than one stores have failed" )
473483 // If we don't know which store has failed, we can tolerate at most one failed store.
474- return status .Error (codes .Aborted , resp . GetWarning () )
484+ return status .Error (codes .Aborted , warning )
475485 }
476486 } else if r .PartialResponseDisabled || r .PartialResponseStrategy == storepb .PartialResponseStrategy_ABORT {
477487 return status .Error (codes .Aborted , resp .GetWarning ())
0 commit comments