Skip to content

Commit 572751c

Browse files
authored
♻️[Maintenance]: Improve RPC Server Logging and Error Handling (#8346)
1 parent 7913c04 commit 572751c

File tree

5 files changed

+63
-22
lines changed

5 files changed

+63
-22
lines changed

packages/service-library/src/servicelib/rabbitmq/_errors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class RemoteMethodNotRegisteredError(BaseRPCError):
2424
class RPCServerError(BaseRPCError):
2525
msg_template = (
2626
"While running method '{method_name}' raised "
27-
"'{exc_type}': '{exc_message}'\n{traceback}"
27+
"'{exc_type}' [{error_code}]: '{exc_message}'\n{traceback}"
2828
)
2929

3030

packages/service-library/src/servicelib/rabbitmq/_rpc_router.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,20 @@
66
from dataclasses import dataclass, field
77
from typing import Any, TypeVar
88

9+
from common_library.error_codes import create_error_code
910
from models_library.rabbitmq_basic_types import RPCMethodName
11+
from servicelib.logging_errors import create_troubleshootting_log_kwargs
1012

1113
from ..logging_utils import log_context
1214
from ._errors import RPCServerError
1315

1416
DecoratedCallable = TypeVar("DecoratedCallable", bound=Callable[..., Any])
1517

16-
# NOTE: this is equivalent to http access logs
17-
_logger = logging.getLogger("rpc.access")
18+
19+
_logger = logging.getLogger(
20+
# NOTE: this logger is equivalent to http access logs
21+
"rpc.access"
22+
)
1823

1924

2025
def _create_func_msg(func, args: tuple[Any, ...], kwargs: dict[str, Any]) -> str:
@@ -40,7 +45,9 @@ def expose(
4045
*,
4146
reraise_if_error_type: tuple[type[Exception], ...] | None = None,
4247
) -> Callable[[DecoratedCallable], DecoratedCallable]:
48+
4349
def _decorator(func: DecoratedCallable) -> DecoratedCallable:
50+
4451
@functools.wraps(func)
4552
async def _wrapper(*args, **kwargs):
4653
with log_context(
@@ -64,9 +71,19 @@ async def _wrapper(*args, **kwargs):
6471
):
6572
raise
6673

74+
error_code = create_error_code(exc)
6775
_logger.exception(
68-
"Unhandled exception on the rpc-server side. Re-raising as %s.",
69-
RPCServerError.__name__,
76+
# NOTE: equivalent to a 500 http status code error
77+
**create_troubleshootting_log_kwargs(
78+
f"Unhandled exception on the rpc-server side for '{func.__name__}'",
79+
error=exc,
80+
error_code=error_code,
81+
error_context={
82+
"rpc_method": func.__name__,
83+
"args": args,
84+
"kwargs": kwargs,
85+
},
86+
)
7087
)
7188
# NOTE: we do not return internal exceptions over RPC
7289
formatted_traceback = "\n".join(
@@ -77,6 +94,7 @@ async def _wrapper(*args, **kwargs):
7794
exc_type=f"{exc.__class__.__module__}.{exc.__class__.__name__}",
7895
exc_message=f"{exc}",
7996
traceback=f"{formatted_traceback}",
97+
error_code=error_code,
8098
) from None
8199

82100
self.routes[RPCMethodName(func.__name__)] = _wrapper

services/catalog/src/simcore_service_catalog/api/rpc/_services.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from models_library.users import UserID
2121
from pydantic import TypeAdapter, ValidationError, validate_call
2222
from pyinstrument import Profiler
23-
from servicelib.logging_utils import log_decorator
2423
from servicelib.rabbitmq import RPCRouter
2524
from servicelib.rabbitmq.rpc_interfaces.catalog.errors import (
2625
CatalogForbiddenError,
@@ -106,7 +105,6 @@ async def list_services_paginated(
106105
ValidationError,
107106
)
108107
)
109-
@log_decorator(_logger, level=logging.DEBUG)
110108
@_profile_rpc_call
111109
@validate_call(config={"arbitrary_types_allowed": True})
112110
async def get_service(
@@ -141,7 +139,6 @@ async def get_service(
141139
ValidationError,
142140
)
143141
)
144-
@log_decorator(_logger, level=logging.DEBUG)
145142
@validate_call(config={"arbitrary_types_allowed": True})
146143
async def update_service(
147144
app: FastAPI,
@@ -179,7 +176,6 @@ async def update_service(
179176
ValidationError,
180177
)
181178
)
182-
@log_decorator(_logger, level=logging.DEBUG)
183179
@validate_call(config={"arbitrary_types_allowed": True})
184180
async def check_for_service(
185181
app: FastAPI,
@@ -203,7 +199,6 @@ async def check_for_service(
203199

204200

205201
@router.expose(reraise_if_error_type=(CatalogForbiddenError, ValidationError))
206-
@log_decorator(_logger, level=logging.DEBUG)
207202
@validate_call(config={"arbitrary_types_allowed": True})
208203
async def batch_get_my_services(
209204
app: FastAPI,
@@ -233,7 +228,6 @@ async def batch_get_my_services(
233228

234229

235230
@router.expose(reraise_if_error_type=(ValidationError,))
236-
@log_decorator(_logger, level=logging.DEBUG)
237231
@validate_call(config={"arbitrary_types_allowed": True})
238232
async def list_my_service_history_latest_first(
239233
app: FastAPI,
@@ -281,7 +275,6 @@ async def list_my_service_history_latest_first(
281275
ValidationError,
282276
)
283277
)
284-
@log_decorator(_logger, level=logging.DEBUG)
285278
@validate_call(config={"arbitrary_types_allowed": True})
286279
async def get_service_ports(
287280
app: FastAPI,

services/director-v2/src/simcore_service_director_v2/modules/dask_client.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,10 @@ async def _get_task_state(job_id: str) -> RunningState:
450450
parsed_event = TaskLifeCycleState.model_validate(dask_events[-1][1])
451451

452452
if parsed_event.state == RunningState.FAILED:
453+
log_error_context = {
454+
"job_id": job_id,
455+
"dask-scheduler": self.backend.scheduler_id,
456+
}
453457
try:
454458
# find out if this was a cancellation
455459
task_future: distributed.Future = (
@@ -461,10 +465,7 @@ async def _get_task_state(job_id: str) -> RunningState:
461465
timeout=_DASK_DEFAULT_TIMEOUT_S
462466
)
463467
assert isinstance(exception, Exception) # nosec
464-
log_error_context = {
465-
"job_id": job_id,
466-
"dask-scheduler": self.backend.scheduler_id,
467-
}
468+
468469
if isinstance(exception, TaskCancelledError):
469470
_logger.info(
470471
**create_troubleshootting_log_kwargs(

services/web/server/src/simcore_service_webserver/projects/_controller/nodes_rest.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
import logging
33

44
from aiohttp import web
5+
from common_library.error_codes import create_error_code
56
from common_library.json_serialization import json_dumps
7+
from common_library.user_messages import user_message
68
from models_library.api_schemas_catalog.service_access_rights import (
79
ServiceAccessRightsGet,
810
)
@@ -23,9 +25,11 @@
2325
NodeServiceGet,
2426
ProjectNodeServicesGet,
2527
)
28+
from models_library.basic_types import IDStr
2629
from models_library.groups import EVERYONE_GROUP_ID, Group, GroupID, GroupType
2730
from models_library.projects import Project, ProjectID
2831
from models_library.projects_nodes_io import NodeID, NodeIDStr
32+
from models_library.rest_error import ErrorGet
2933
from models_library.services import ServiceKeyVersion
3034
from models_library.services_resources import ServiceResourcesDict
3135
from models_library.services_types import ServiceKey, ServiceVersion
@@ -43,6 +47,7 @@
4347
UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE,
4448
X_SIMCORE_USER_AGENT,
4549
)
50+
from servicelib.logging_errors import create_troubleshootting_log_kwargs
4651
from servicelib.long_running_tasks.models import TaskProgress
4752
from servicelib.long_running_tasks.task import TaskRegistry
4853
from servicelib.mimetype_constants import MIMETYPE_APPLICATION_JSON
@@ -57,6 +62,7 @@
5762
from ..._meta import API_VTAG as VTAG
5863
from ...catalog import catalog_service
5964
from ...dynamic_scheduler import api as dynamic_scheduler_service
65+
from ...exception_handling import create_error_response
6066
from ...groups import api as groups_service
6167
from ...groups.exceptions import GroupNotFoundError
6268
from ...login.decorators import login_required
@@ -304,7 +310,7 @@ async def _stop_dynamic_service_task(
304310
*,
305311
app: web.Application,
306312
dynamic_service_stop: DynamicServiceStop,
307-
):
313+
) -> web.Response:
308314
_ = progress
309315
# NOTE: _handle_project_nodes_exceptions only decorate handlers
310316
try:
@@ -323,18 +329,41 @@ async def _stop_dynamic_service_task(
323329
return web.json_response(status=status.HTTP_204_NO_CONTENT)
324330

325331
except (RPCServerError, ServiceWaitingForManualInterventionError) as exc:
326-
# in case there is an error reply as not found
327-
raise web.HTTPNotFound(text=f"{exc}") from exc
332+
error_code = getattr(exc, "error_code", None) or create_error_code(exc)
333+
user_error_msg = user_message(
334+
f"Could not stop dynamic service {dynamic_service_stop.project_id}.{dynamic_service_stop.node_id}"
335+
)
336+
_logger.debug(
337+
**create_troubleshootting_log_kwargs(
338+
user_error_msg,
339+
error=exc,
340+
error_code=error_code,
341+
error_context={
342+
"project_id": dynamic_service_stop.project_id,
343+
"node_id": dynamic_service_stop.node_id,
344+
"user_id": dynamic_service_stop.user_id,
345+
"save_state": dynamic_service_stop.save_state,
346+
"simcore_user_agent": dynamic_service_stop.simcore_user_agent,
347+
},
348+
)
349+
)
350+
# ANE: in case there is an error reply as not found
351+
return create_error_response(
352+
error=ErrorGet(
353+
message=user_error_msg,
354+
support_id=IDStr(error_code),
355+
status=status.HTTP_404_NOT_FOUND,
356+
),
357+
status_code=status.HTTP_404_NOT_FOUND,
358+
)
328359

329360
except ServiceWasNotFoundError:
330361
# in case the service is not found reply as all OK
331362
return web.json_response(status=status.HTTP_204_NO_CONTENT)
332363

333364

334365
def register_stop_dynamic_service_task(app: web.Application) -> None:
335-
TaskRegistry.register(
336-
_stop_dynamic_service_task, allowed_errors=(web.HTTPNotFound,), app=app
337-
)
366+
TaskRegistry.register(_stop_dynamic_service_task, app=app)
338367

339368

340369
@routes.post(

0 commit comments

Comments
 (0)