-
Notifications
You must be signed in to change notification settings - Fork 29
Replace Sentry metrics with Prometheus metrics #918
Changes from 18 commits
c8685db
52dd85c
2909aa6
91a8578
ac58dbf
6d72497
5af7369
af3f0f0
4c79e54
cfbe611
9fe6d4b
66f02a9
3f86b59
e9ef0b4
9ff1ba8
79d6d52
2dfa87b
2e71531
eaf9a41
4d2c20e
61865f0
27f74ad
00bd746
13086fd
7077928
08b9256
44b2c12
3ad8aff
bb7757a
e222298
e052728
2386132
919ee71
c691c6e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,7 +19,6 @@ | |
| ) | ||
| from graphql import DocumentNode | ||
| from sentry_sdk import capture_exception | ||
| from sentry_sdk import metrics as sentry_metrics | ||
| from shared.metrics import Counter, Histogram | ||
|
|
||
| from codecov.commands.exceptions import BaseException | ||
|
|
@@ -51,6 +50,17 @@ | |
| buckets=[0.05, 0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10, 30], | ||
| ) | ||
|
|
||
| GQL_REQUEST_MADE_COUNTER = Counter( | ||
| "api_gql_requests_made", | ||
| "Total API GQL requests made", | ||
| ["path"], | ||
| ) | ||
|
|
||
| GQL_ERROR_TYPE_COUNTER = Counter( | ||
| "api_gql_errors", | ||
| "Number of times API GQL endpoint failed with an exception by type", | ||
| ["error_type", "path"], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we include a comment on when to use existing |
||
| ) | ||
|
|
||
| # covers named and 3 unnamed operations (see graphql_api/types/query/query.py) | ||
| GQL_TYPE_AND_NAME_PATTERN = r"^(query|mutation|subscription)(?:\(\$input:|) (\w+)(?:\(| \(|{| {|!)|^(?:{) (me|owner|config)(?:\(| |{)" | ||
|
|
@@ -226,10 +236,9 @@ async def post(self, request, *args, **kwargs): | |
| "user": request.user, | ||
| } | ||
| log.info("GraphQL Request", extra=log_data) | ||
| sentry_metrics.incr("graphql.info.request_made", tags={"path": req_path}) | ||
|
|
||
| GQL_REQUEST_MADE_COUNTER.labels(path=req_path).inc() | ||
| if self._check_ratelimit(request=request): | ||
| sentry_metrics.incr("graphql.error.rate_limit", tags={"path": req_path}) | ||
| GQL_ERROR_TYPE_COUNTER.labels(error_type="rate_limit", path=req_path).inc() | ||
| return JsonResponse( | ||
| data={ | ||
| "status": 429, | ||
|
|
@@ -245,7 +254,7 @@ async def post(self, request, *args, **kwargs): | |
| data = json.loads(content) | ||
|
|
||
| if "errors" in data: | ||
| sentry_metrics.incr("graphql.error.all", tags={"path": req_path}) | ||
| GQL_ERROR_TYPE_COUNTER.labels(error_type="all", path=req_path).inc() | ||
| try: | ||
| if data["errors"][0]["extensions"]["cost"]: | ||
| costs = data["errors"][0]["extensions"]["cost"] | ||
|
|
@@ -257,10 +266,9 @@ async def post(self, request, *args, **kwargs): | |
| request_body=req_body, | ||
| ), | ||
| ) | ||
| sentry_metrics.incr( | ||
| "graphql.error.query_cost_exceeded", | ||
| tags={"path": req_path}, | ||
| ) | ||
| GQL_ERROR_TYPE_COUNTER.labels( | ||
| error_type="query_cost_exceeded", path=req_path | ||
| ).inc() | ||
| return HttpResponseBadRequest( | ||
| JsonResponse("Your query is too costly.") | ||
| ) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -790,14 +790,15 @@ def get_version_from_headers(headers): | |
| return "unknown-user-agent" | ||
|
|
||
|
|
||
| def generate_upload_sentry_metrics_tags( | ||
| def generate_upload_prometheus_metrics_tags( | ||
| action, | ||
| request, | ||
| is_shelter_request, | ||
| endpoint: Optional[str] = None, | ||
| repository: Optional[Repository] = None, | ||
| position: Optional[str] = None, | ||
| upload_version: Optional[str] = None, | ||
| fill_labels: bool = True, | ||
|
||
| ): | ||
| metrics_tags = dict( | ||
| agent=get_agent_from_headers(request.headers), | ||
|
|
@@ -806,13 +807,23 @@ def generate_upload_sentry_metrics_tags( | |
| endpoint=endpoint, | ||
| is_using_shelter="yes" if is_shelter_request else "no", | ||
| ) | ||
|
|
||
| repo_visibility = None | ||
| if repository: | ||
| metrics_tags["repo_visibility"] = ( | ||
| "private" if repository.private is True else "public" | ||
| ) | ||
| if position: | ||
| metrics_tags["position"] = position | ||
| if upload_version: | ||
| metrics_tags["upload_version"] = upload_version | ||
| repo_visibility = "private" if repository.private else "public" | ||
|
|
||
| optional_fields = { | ||
| "repo_visibility": repo_visibility, | ||
| "position": position, | ||
| "upload_version": upload_version, | ||
| } | ||
|
Comment on lines
+815
to
+819
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code doesn't look like it's doing anything different from the original code, and the original code is a bit simpler. What is the reason to create an
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the original code, the The original code does not create the fields if the corresponding arguments are not provided. However, the prometheus-client in Python raises an error if we initialize a metric with a certain number of labels but then call it with some empty labels. I introduced i.e. old code, in the new code, it will return If we still want the first dict to be returned, we can call generate tags with
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I see. I think that the new code will actually return where the values for the three extra fields are of None type (not string). Is this still fine?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I checked and that's fine, in prometheus it will interpret it as a string |
||
|
|
||
| metrics_tags.update( | ||
| { | ||
| field: value | ||
| for field, value in optional_fields.items() | ||
| if value or fill_labels | ||
| } | ||
| ) | ||
|
|
||
| return metrics_tags | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| from shared.metrics import Counter | ||
|
|
||
| API_UPLOAD_COUNTER = Counter( | ||
| "api_upload", | ||
| "Total API upload endpoint requests", | ||
| [ | ||
| "agent", | ||
| "version", | ||
| "action", | ||
| "endpoint", | ||
| "is_using_shelter", | ||
| "repo_visibility", | ||
| "position", | ||
| "upload_version", | ||
| ], | ||
| ) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, I like this pattern of pulling it out into a separate file! |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couldn't quite tell - what's the intended difference between this one (
GQL_REQUEST_MADE_COUNTER) andGQL_HIT_COUNTERin above (line 34)? Any opportunities to combine?Also a nit but it seems like we use "Total" in the description when it's a
HistogramnotCounterThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added
GQL_REQUEST_MADE_COUNTERso that thepostfunction inAsyncGraphqlViewclass can increment this metric with the labelpath=req_path, and the other metricGQL_HIT_COUNTERis called when the actual gql API is called through therequest_startedextension: https://ariadnegraphql.org/docs/extensions. If we want to combine this, then we would have to givereq_pathto theQueryMetricsExtensionin some way, I was thinking of setting aself.pathattribute to theAsyncGraphqlViewclass but since it's handling many async requests I don't think that would work. Do you know of a way to implement this?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe path is available in the
contextargument likeinfo.context["request"].url.path, if that's what you mean?If this will be a duplicate of the information in
GQL_HIT_COUNTERthen it seems like removing/combining makes sense. But since it sounds like it's meant to be another step in the lifecycle of a request, we can use it for debugging, for example comparing numbers against theGQL_REQUEST_MADE_COUNTER. Actually I like the location of theGQL_REQUEST_MADE_COUNTERbetter than the other one that already existed, so we can just leave it! I saw you were just converting the sentry one to prometheus anyway so we can just proceed.