@@ -7,7 +7,7 @@
 from starlette.responses import PlainTextResponse, Response
 from starlette.routing import Route

-from huggingface_inference_toolkit.async_utils import async_handler_call
+from huggingface_inference_toolkit.async_utils import MAX_CONCURRENT_THREADS, MAX_THREADS_GUARD, async_handler_call
 from huggingface_inference_toolkit.const import (
     HF_FRAMEWORK,
     HF_HUB_TOKEN,
@@ -69,6 +69,18 @@ async def health(request):
     return PlainTextResponse("Ok")


+# Report Prometheus metrics
+# inf_batch_current_size: Current number of requests being processed
+# inf_queue_size: Number of requests waiting in the queue
+async def metrics(request):
+    batch_current_size = MAX_CONCURRENT_THREADS - MAX_THREADS_GUARD.value
+    queue_size = MAX_THREADS_GUARD.statistics().tasks_waiting
+    return PlainTextResponse(
+        f"inf_batch_current_size {batch_current_size}\n" +
+        f"inf_queue_size {queue_size}\n"
+    )
+
+
 async def predict(request):
     try:
         # extracts content from request
@@ -143,6 +155,7 @@ async def predict(request):
         Route("/health", health, methods=["GET"]),
         Route("/", predict, methods=["POST"]),
         Route("/predict", predict, methods=["POST"]),
+        Route("/metrics", metrics, methods=["GET"]),
     ],
     on_startup=[prepare_model_artifacts],
 )
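
For context, a minimal sketch of the two names newly imported from huggingface_inference_toolkit.async_utils. This is an assumption, not the toolkit's actual code: the attribute accesses in metrics() (.value and .statistics().tasks_waiting) are consistent with an anyio.Semaphore, so the sketch models the guard that way.

# Hypothetical sketch of async_utils, NOT the toolkit's actual source.
# Assumes MAX_THREADS_GUARD is an anyio.Semaphore, which matches the
# .value and .statistics().tasks_waiting calls used in metrics().
import functools

import anyio

MAX_CONCURRENT_THREADS = 1  # assumed default; the real value may differ
MAX_THREADS_GUARD = anyio.Semaphore(MAX_CONCURRENT_THREADS)

async def async_handler_call(handler, body):
    # While a request holds the semaphore, .value drops by one, so
    # MAX_CONCURRENT_THREADS - MAX_THREADS_GUARD.value is the number of
    # requests currently being processed. Callers blocked on acquiring
    # the semaphore show up in .statistics().tasks_waiting, which is
    # what the metrics endpoint reports as the queue size.
    async with MAX_THREADS_GUARD:
        return await anyio.to_thread.run_sync(functools.partial(handler, body))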
0 commit comments
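Once the route is registered, the endpoint serves Prometheus-style plain text and can be checked directly. A quick smoke test, assuming the service listens on localhost:5000 (the address is an assumption; use whatever the container exposes):

# Quick check of the new endpoint; localhost:5000 is an assumed address.
import requests

response = requests.get("http://localhost:5000/metrics")
print(response.text)
# Expected output while the service is idle:
# inf_batch_current_size 0
# inf_queue_size 0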