Skip to content

Commit bc1dae4

Browse files
committed
refactor: replace Gauge with Counter for error tracking in SloContext
1 parent e0889cb commit bc1dae4

File tree

3 files changed

+74
-53
lines changed

3 files changed

+74
-53
lines changed

slo/src/AdoNet/SloContext.cs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ public class SloContext : SloContext<YdbDataSource>
1414
.WaitAndRetryAsync(10, attempt => TimeSpan.FromMilliseconds(attempt * 10),
1515
(e, _, _, context) =>
1616
{
17-
var errorsGauge = (Gauge)context["errorsGauge"];
17+
var errorsTotal = (Counter)context["errorsTotal"];
1818

1919
Logger.LogWarning(e, "Failed read / write operation");
20-
errorsGauge?.WithLabels(((YdbException)e).Code.StatusName(), "retried").Inc();
20+
// sdk_errors_total is declared with a single "error_type" label, so exactly one label value
// may be supplied; passing a second value ("retried") makes WithLabels throw at runtime.
errorsTotal?.WithLabels(((YdbException)e).Code.StatusName()).Inc();
2121
});
2222

2323
protected override string Job => "AdoNet";
@@ -32,12 +32,12 @@ protected override async Task Create(YdbDataSource client, string createTableSql
3232
}
3333

3434
protected override async Task<(int, StatusCode)> Upsert(YdbDataSource dataSource, string upsertSql,
35-
Dictionary<string, YdbValue> parameters, int writeTimeout, Gauge? errorsGauge = null)
35+
Dictionary<string, YdbValue> parameters, int writeTimeout, Counter? errorsTotal = null)
3636
{
3737
var context = new Context();
38-
if (errorsGauge != null)
38+
if (errorsTotal != null)
3939
{
40-
context["errorsGauge"] = errorsGauge;
40+
context["errorsTotal"] = errorsTotal;
4141
}
4242

4343
var policyResult = await _policy.ExecuteAndCaptureAsync(async _ =>
@@ -61,12 +61,12 @@ protected override async Task Create(YdbDataSource client, string createTableSql
6161
}
6262

6363
protected override async Task<(int, StatusCode, object?)> Select(YdbDataSource dataSource, string selectSql,
64-
Dictionary<string, YdbValue> parameters, int readTimeout, Gauge? errorsGauge = null)
64+
Dictionary<string, YdbValue> parameters, int readTimeout, Counter? errorsTotal = null)
6565
{
6666
var context = new Context();
67-
if (errorsGauge != null)
67+
if (errorsTotal != null)
6868
{
69-
context["errorsGauge"] = errorsGauge;
69+
context["errorsTotal"] = errorsTotal;
7070
}
7171

7272
var attempts = 0;
@@ -105,4 +105,4 @@ protected override Task<YdbDataSource> CreateClient(Config config)
105105
return Task.FromResult(new YdbDataSource(new YdbConnectionStringBuilder
106106
{ UseTls = useTls, Host = host, Port = int.Parse(port), Database = config.Db, LoggerFactory = Factory }));
107107
}
108-
}
108+
}

slo/src/Internal/SloContext.cs

Lines changed: 62 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ public async Task Run(RunConfig runConfig)
137137
return;
138138

139139
Task ShootingTask(RateLimiter rateLimitPolicy, string operationType,
140-
Func<T, RunConfig, Gauge?, Task<(int, StatusCode)>> action)
140+
Func<T, RunConfig, Counter?, Task<(int, StatusCode)>> action)
141141
{
142142
var metricFactory = Metrics.WithLabels(new Dictionary<string, string>
143143
{
@@ -149,41 +149,61 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string operationType,
149149
}
150150
);
151151

152-
var okGauge = metricFactory.CreateCounter( // Counter
152+
var operationsTotal = metricFactory.CreateCounter(
153+
"sdk_operations_total",
154+
"Total number of operations performed by the SDK, categorized by type."
155+
);
156+
157+
var operationsSuccessTotal = metricFactory.CreateCounter(
153158
"sdk_operations_success_total",
154159
"Total number of successful operations, categorized by type."
155160
);
156-
var notOkGauge = metricFactory.CreateCounter(
161+
162+
var operationsFailureTotal = metricFactory.CreateCounter(
157163
"sdk_operations_failure_total",
158164
"Total number of failed operations, categorized by type."
159165
);
160-
var latencySummary = metricFactory.CreateSummary(
166+
167+
var operationLatencySeconds = metricFactory.CreateHistogram(
161168
"sdk_operation_latency_seconds",
162169
"Latency of operations performed by the SDK in seconds, categorized by type and status.",
163-
new[] { "status" },
164-
new SummaryConfiguration // Гистограмма
170+
["operation_status"],
171+
new HistogramConfiguration
165172
{
166-
MaxAge = TimeSpan.FromSeconds(15),
167-
Objectives = new QuantileEpsilonPair[]
168-
{
169-
new(0.5, 0.05),
170-
new(0.99, 0.005),
171-
new(0.999, 0.0005)
172-
}
173-
});
174-
var attemptsHistogram = metricFactory.CreateHistogram(
173+
Buckets =
174+
[
175+
0.001, // 1 ms
176+
0.002, // 2 ms
177+
0.003, // 3 ms
178+
0.004, // 4 ms
179+
0.005, // 5 ms
180+
0.0075, // 7.5 ms
181+
0.010, // 10 ms
182+
0.020, // 20 ms
183+
0.050, // 50 ms
184+
0.100, // 100 ms
185+
0.200, // 200 ms
186+
0.500, // 500 ms
187+
1.000 // 1 s
188+
]
189+
}
190+
);
191+
192+
var retryAttempts = metricFactory.CreateGauge(
175193
"sdk_retry_attempts",
176-
"Current retry attempts, categorized by operation type.",
177-
new[] { "status" },
178-
new HistogramConfiguration { Buckets = Histogram.LinearBuckets(1, 1, 10) }
194+
"Current retry attempts, categorized by operation type."
179195
);
180-
var errorsGauge = metricFactory.CreateGauge("errors", "amount of errors", new[] { "class", "in" });
181196

182-
foreach (var statusCode in Enum.GetValues<StatusCode>())
183-
{
184-
errorsGauge.WithLabels(statusCode.StatusName(), "retried").IncTo(0);
185-
errorsGauge.WithLabels(statusCode.StatusName(), "finally").IncTo(0);
186-
}
197+
var pendingOperations = metricFactory.CreateGauge(
198+
"sdk_pending_operations",
199+
"Current number of pending operations, categorized by type."
200+
);
201+
202+
var errorsTotal = metricFactory.CreateCounter(
203+
"sdk_errors_total",
204+
"Total number of errors encountered, categorized by error type.",
205+
["error_type"]
206+
);
187207

188208
// ReSharper disable once MethodSupportsCancellation
189209
return Task.Run(async () =>
@@ -200,25 +220,26 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string operationType,
200220

201221
_ = Task.Run(async () =>
202222
{
223+
pendingOperations.Inc();
203224
var sw = Stopwatch.StartNew();
204-
var (attempts, statusCode) = await action(client, runConfig, errorsGauge);
225+
var (attempts, statusCode) = await action(client, runConfig, errorsTotal);
205226
sw.Stop();
206-
string label;
227+
228+
retryAttempts.Set(attempts);
229+
operationsTotal.Inc();
230+
pendingOperations.Dec();
207231

208232
if (statusCode != StatusCode.Success)
209233
{
210-
notOkGauge.Inc();
211-
label = "err";
212-
errorsGauge.WithLabels(statusCode.StatusName(), "finally").Inc();
234+
errorsTotal.WithLabels(statusCode.StatusName()).Inc();
235+
operationsFailureTotal.Inc();
236+
// Observe fractional seconds: ElapsedMilliseconds / 1000 is integer division and
// would record every sub-second operation as 0, collapsing the histogram buckets.
operationLatencySeconds.WithLabels("err").Observe(sw.Elapsed.TotalSeconds);
213237
}
214238
else
215239
{
216-
okGauge.Inc();
217-
label = "ok";
240+
operationsSuccessTotal.Inc();
241+
// Observe fractional seconds: ElapsedMilliseconds / 1000 is integer division and
// would record every sub-second operation as 0, collapsing the histogram buckets.
operationLatencySeconds.WithLabels("success").Observe(sw.Elapsed.TotalSeconds);
218242
}
219-
220-
attemptsHistogram.WithLabels(label).Observe(attempts);
221-
latencySummary.WithLabels(label).Observe(sw.ElapsedMilliseconds);
222243
}, cancellationTokenSource.Token);
223244
}
224245

@@ -237,12 +258,12 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string operationType,
237258
// return attempt count & StatusCode operation
238259
protected abstract Task<(int, StatusCode)> Upsert(T client, string upsertSql,
239260
Dictionary<string, YdbValue> parameters,
240-
int writeTimeout, Gauge? errorsGauge = null);
261+
int writeTimeout, Counter? errorsTotal = null);
241262

242263
protected abstract Task<(int, StatusCode, object?)> Select(T client, string selectSql,
243-
Dictionary<string, YdbValue> parameters, int readTimeout, Gauge? errorsGauge = null);
264+
Dictionary<string, YdbValue> parameters, int readTimeout, Counter? errorsTotal = null);
244265

245-
private Task<(int, StatusCode)> Upsert(T client, Config config, Gauge? errorsGauge = null)
266+
private Task<(int, StatusCode)> Upsert(T client, Config config, Counter? errorsTotal = null)
246267
{
247268
const int minSizeStr = 20;
248269
const int maxSizeStr = 40;
@@ -265,12 +286,12 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string operationType,
265286
},
266287
{ "$payload_double", YdbValue.MakeDouble(Random.Shared.NextDouble()) },
267288
{ "$payload_timestamp", YdbValue.MakeTimestamp(DateTime.Now) }
268-
}, config.WriteTimeout, errorsGauge);
289+
}, config.WriteTimeout, errorsTotal);
269290
}
270291

271292
protected abstract Task<T> CreateClient(Config config);
272293

273-
private async Task<(int, StatusCode)> Select(T client, RunConfig config, Gauge? errorsGauge = null)
294+
private async Task<(int, StatusCode)> Select(T client, RunConfig config, Counter? errorsTotal = null)
274295
{
275296
var (attempts, code, _) = await Select(client,
276297
$"""
@@ -281,7 +302,7 @@ Task ShootingTask(RateLimiter rateLimitPolicy, string operationType,
281302
new Dictionary<string, YdbValue>
282303
{
283304
{ "$id", YdbValue.MakeInt32(Random.Shared.Next(_maxId)) }
284-
}, config.ReadTimeout, errorsGauge);
305+
}, config.ReadTimeout, errorsTotal);
285306

286307
return (attempts, code);
287308
}
@@ -294,4 +315,4 @@ public static string StatusName(this StatusCode statusCode)
294315
var prefix = statusCode >= StatusCode.ClientTransportResourceExhausted ? "GRPC" : "YDB";
295316
return $"{prefix}_{statusCode}";
296317
}
297-
}
318+
}

slo/src/TableService/SloContext.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ protected override async Task Create(TableClient client, string createTableSql,
2222
}
2323

2424
protected override async Task<(int, StatusCode)> Upsert(TableClient tableClient, string upsertSql,
25-
Dictionary<string, YdbValue> parameters, int writeTimeout, Gauge? errorsGauge = null)
25+
Dictionary<string, YdbValue> parameters, int writeTimeout, Counter? errorsGauge = null)
2626
{
2727
var querySettings = new ExecuteDataQuerySettings
2828
{ OperationTimeout = TimeSpan.FromSeconds(writeTimeout) };
@@ -49,7 +49,7 @@ protected override async Task Create(TableClient client, string createTableSql,
4949
}
5050

5151
protected override async Task<(int, StatusCode, object?)> Select(TableClient tableClient, string selectSql,
52-
Dictionary<string, YdbValue> parameters, int readTimeout, Gauge? errorsGauge = null)
52+
Dictionary<string, YdbValue> parameters, int readTimeout, Counter? errorsGauge = null)
5353
{
5454
var querySettings = new ExecuteDataQuerySettings
5555
{ OperationTimeout = TimeSpan.FromSeconds(readTimeout) };
@@ -81,4 +81,4 @@ protected override async Task<TableClient> CreateClient(Config config)
8181
{
8282
return new TableClient(await Driver.CreateInitialized(new DriverConfig(config.Endpoint, config.Db), Factory));
8383
}
84-
}
84+
}

0 commit comments

Comments
 (0)