77
88environ ["PROMETHEUS_DISABLE_CREATED_SERIES" ] = "True"
99
10- from prometheus_client import CollectorRegistry , Gauge , Histogram , push_to_gateway # noqa: E402
11- from summary import Summary # noqa: E402
10+ from prometheus_client import CollectorRegistry , Counter , Gauge , Histogram , push_to_gateway # noqa: E402
1211
13- JOB_READ_LABEL , JOB_WRITE_LABEL = "read" , "write"
14- JOB_STATUS_OK , JOB_STATUS_ERR = "ok " , "err"
12+ OP_TYPE_READ , OP_TYPE_WRITE = "read" , "write"
13+ OP_STATUS_SUCCESS , OP_STATUS_FAILURE = "success " , "err"
1514
16- SDK_SERVICE_NAME = environ .get ("SDK_SERVICE" , "sync-python-table" )
15+ REF = environ .get ("REF" , "main" )
16+ SDK_SERVICE_NAME = environ .get ("SDK_SERVICE" , "py-sync-table" )
1717
1818
1919class Metrics :
2020 def __init__ (self , push_gateway ):
2121 self ._push_gtw = push_gateway
2222 self ._registry = CollectorRegistry ()
2323 self ._metrics = dict (
24- oks = Gauge (
25- "oks " ,
26- "amount of OK requests " ,
27- labelnames = ("jobName" , ),
24+ errors_total = Counter (
25+ "sdk_errors_total " ,
26+ "Total number of errors encountered, categorized by error type. " ,
27+ labelnames = ("operation_type" , "error_type" ),
2828 registry = self ._registry ,
2929 ),
30- not_oks = Gauge (
31- "not_oks " ,
32- "amount of not OK requests " ,
33- labelnames = ("jobName " ,),
30+ operations_total = Counter (
31+ "sdk_operations_total " ,
32+ "Total number of operations, categorized by type attempted by the SDK. " ,
33+ labelnames = ("operation_type " ,),
3434 registry = self ._registry ,
3535 ),
36- inflight = Gauge (
37- "inflight " ,
38- "amount of requests in flight " ,
39- labelnames = ("jobName " ,),
36+ operations_success_total = Counter (
37+ "sdk_operations_success_total " ,
38+ "Total number of successful operations, categorized by type. " ,
39+ labelnames = ("operation_type " ,),
4040 registry = self ._registry ,
4141 ),
42- latency = Summary (
43- "latency " ,
44- "summary of latencies in ms " ,
45- labelnames = ("jobName" , "status" ),
42+ operations_failure_total = Counter (
43+ "sdk_operations_failure_total " ,
44+ "Total number of failed operations, categorized by type. " ,
45+ labelnames = ("operation_type" , ),
4646 registry = self ._registry ,
47- objectives = (
48- (0.5 , 0.01 ),
49- (0.99 , 0.001 ),
50- (1.0 , 0.0 ),
47+ ),
48+ operation_latency_seconds = Histogram (
49+ "sdk_operation_latency_seconds" ,
50+ "Latency of operations performed by the SDK in seconds, categorized by type and status." ,
51+ labelnames = (
52+ "operation_type" ,
53+ "operation_status" ,
54+ ),
55+ registry = self ._registry ,
56+ buckets = (
57+ 0.001 , # 1 ms
58+ 0.002 , # 2 ms
59+ 0.003 , # 3 ms
60+ 0.004 , # 4 ms
61+ 0.005 , # 5 ms
62+ 0.0075 , # 7.5 ms
63+ 0.010 , # 10 ms
64+ 0.020 , # 20 ms
65+ 0.050 , # 50 ms
66+ 0.100 , # 100 ms
67+ 0.200 , # 200 ms
68+ 0.500 , # 500 ms
69+ 1.000 , # 1 s
5170 ),
5271 ),
53- attempts = Histogram (
54- "attempts" ,
55- "histogram of amount of requests" ,
56- labelnames = ("jobName" , "status" ),
72+ retry_attempts_total = Counter (
73+ "sdk_retry_attempts_total" ,
74+ "Total number of retry attempts, categorized by operation type." ,
75+ labelnames = ("operation_type" ,),
76+ registry = self ._registry ,
77+ ),
78+ retries_success_total = Counter (
79+ "sdk_retries_success_total" ,
80+ "Total number of successful retries, categorized by operation type." ,
81+ labelnames = ("operation_type" ,),
82+ registry = self ._registry ,
83+ ),
84+ retries_failure_total = Counter (
85+ "sdk_retries_failure_total" ,
86+ "Total number of failed retries, categorized by operation type." ,
87+ labelnames = ("operation_type" ,),
88+ registry = self ._registry ,
89+ ),
90+ pending_operations = Gauge (
91+ "sdk_pending_operations" ,
92+ "Current number of pending operations, categorized by type." ,
93+ labelnames = ("operation_type" ,),
5794 registry = self ._registry ,
58- buckets = tuple (range (1 , 11 )),
5995 ),
6096 )
6197 self .reset ()
@@ -81,44 +117,44 @@ def start(self, labels):
81117 if not isinstance (labels , Iterable ):
82118 labels = (labels ,)
83119
84- self .inflight .labels (* labels ).inc ()
120+ self .pending_operations .labels (* labels ).inc ()
85121 return time .time ()
86122
87123 def stop (self , labels , start_time , attempts = 1 , error = None ):
88- runtime_ms = 1000 * ( time .time () - start_time )
124+ duration = time .time () - start_time
89125
90126 if not isinstance (labels , Iterable ):
91127 labels = (labels ,)
92128
93- self .inflight .labels (* labels ).dec ()
129+ self .operations_total .labels (* labels ).inc ()
130+ self .pending_operations .labels (* labels ).dec ()
131+ self .retry_attempts_total .labels (* labels ).inc (attempts )
94132
95133 if error :
96- self .not_oks .labels (* labels ).inc ()
97- self .latency .labels (* labels , JOB_STATUS_ERR ).observe (runtime_ms )
134+ self .errors_total .labels (* labels , type (error ).__name__ ).inc ()
135+ self .retries_failure_total .labels (* labels ).inc (attempts )
136+ self .operations_failure_total .labels (* labels ).inc ()
137+ self .operation_latency_seconds .labels (* labels , OP_STATUS_FAILURE ).observe (duration )
98138 return
99139
100- self .oks .labels (* labels ).inc ()
101- self .latency .labels (* labels , JOB_STATUS_OK ). observe ( runtime_ms )
102- self .attempts .labels (* labels , JOB_STATUS_OK ).observe (attempts )
140+ self .retries_success_total .labels (* labels ).inc (attempts )
141+ self .operations_success_total .labels (* labels ). inc ( )
142+ self .operation_latency_seconds .labels (* labels , OP_STATUS_SUCCESS ).observe (duration )
103143
104144 def push (self ):
105145 push_to_gateway (
106146 self ._push_gtw ,
107147 job = f"workload-{ SDK_SERVICE_NAME } " ,
108148 registry = self ._registry ,
109149 grouping_key = {
150+ "ref" : REF ,
110151 "sdk" : SDK_SERVICE_NAME ,
111- "sdkVersion " : version ("ydb" ),
152+ "sdk_version " : version ("ydb" ),
112153 },
113154 )
114155
115156 def reset (self ):
116- for label in (JOB_READ_LABEL , JOB_WRITE_LABEL ):
117- self .oks .labels (label ).set (0 )
118- self .not_oks .labels (label ).set (0 )
119- self .inflight .labels (label ).set (0 )
120-
121- self .latency .clear ()
122- self .attempts .clear ()
157+ for m in self ._metrics .values ():
158+ m .clear ()
123159
124160 self .push ()
0 commit comments