Skip to content

Commit 7f8b09c

Browse files
authored
Merge pull request #6886 from chaen/v8.0_FEAT_serviceMonitoring
[8.0 ] Implement actual service monitoring
2 parents 5b7253d + 098c6ba commit 7f8b09c

File tree

3 files changed

+62
-12
lines changed

3 files changed

+62
-12
lines changed

src/DIRAC/Core/DISET/private/Service.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -555,17 +555,28 @@ def _executeAction(self, trid, proposalTuple, handlerObj):
555555
response = handlerObj._rh_executeAction(proposalTuple)
556556
if not response["OK"]:
557557
return response
558+
retVal = response["Value"][0]
558559
if self.activityMonitoring:
560+
_actionType, actionName = proposalTuple[1]
561+
retStatus = "Unknown"
562+
if isReturnStructure(retVal):
563+
if retVal["OK"]:
564+
retStatus = "OK"
565+
else:
566+
retStatus = "ERROR"
559567
self.activityMonitoringReporter.addRecord(
560568
{
561569
"timestamp": int(TimeUtilities.toEpochMilliSeconds()),
562570
"Host": Network.getFQDN(),
563571
"ServiceName": "_".join(self._name.split("/")),
564572
"Location": self._cfg.getURL(),
565573
"ResponseTime": response["Value"][1],
574+
"MethodName": actionName,
575+
"Protocol": "dips",
576+
"Status": retStatus,
566577
}
567578
)
568-
return response["Value"][0]
579+
return retVal
569580
except Exception as e:
570581
gLogger.exception("Exception while executing handler action")
571582
return S_ERROR(f"Server error while executing action: {str(e)}")

src/DIRAC/Core/Tornado/Server/TornadoServer.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,12 @@ def startTornado(self):
176176
Starts the tornado server when ready.
177177
This method never returns.
178178
"""
179+
180+
# If we are running with python3, Tornado will use asyncio,
181+
# and we have to convince it to let us run in a different thread
182+
# This statement must be placed before setting PeriodicCallback
183+
asyncio.set_event_loop_policy(tornado.platform.asyncio.AnyThreadEventLoopPolicy())
184+
179185
# If there is no services loaded:
180186
if not self.__calculateAppSettings():
181187
raise Exception("There is no services loaded, please check your configuration")
@@ -205,13 +211,7 @@ def startTornado(self):
205211
self.__report = self.__startReportToMonitoringLoop()
206212
# Response time
207213
# Starting monitoring, IOLoop waiting time in ms, __monitoringLoopDelay is defined in seconds
208-
tornado.ioloop.PeriodicCallback(
209-
self.__reportToMonitoring(self.__elapsedTime), self.__monitoringLoopDelay * 1000
210-
).start()
211-
212-
# If we are running with python3, Tornado will use asyncio,
213-
# and we have to convince it to let us run in a different thread
214-
asyncio.set_event_loop_policy(tornado.platform.asyncio.AnyThreadEventLoopPolicy())
214+
tornado.ioloop.PeriodicCallback(self.__reportToMonitoring, self.__monitoringLoopDelay * 1000).start()
215215

216216
for port, app in self.__appsSettings.items():
217217
sLog.debug(" - %s" % "\n - ".join([f"{k} = {ssl_options[k]}" for k in ssl_options]))
@@ -233,7 +233,7 @@ def startTornado(self):
233233

234234
tornado.ioloop.IOLoop.current().start()
235235

236-
def __reportToMonitoring(self, responseTime):
236+
def __reportToMonitoring(self):
237237
"""
238238
Periodically reports to Monitoring
239239
"""
@@ -248,13 +248,24 @@ def __reportToMonitoring(self, responseTime):
248248
"ServiceName": "Tornado",
249249
"MemoryUsage": self.__report[2],
250250
"CpuPercentage": percentage,
251-
"ResponseTime": responseTime,
252251
}
253252
)
254253
self.activityMonitoringReporter.commit()
255254
# Save memory usage and save realtime/CPU time for next call
256255
self.__report = self.__startReportToMonitoringLoop()
257256

257+
# For each handler,
258+
for urlSpec in self.handlerManager.getHandlersDict().values():
259+
# If there is more than one URL, it's
260+
# most likely something that inherit from TornadoREST
261+
# so don't even try to monitor...
262+
if len(urlSpec["URLs"]) > 1:
263+
continue
264+
handler = urlSpec["URLs"][0].handler_class
265+
# If there is a Monitoring reporter, call commit on it
266+
if getattr(handler, "activityMonitoringReporter", None):
267+
handler.activityMonitoringReporter.commit()
268+
258269
def __startReportToMonitoringLoop(self):
259270
"""
260271
Snapshot of resources to be taken at the beginning

src/DIRAC/Core/Tornado/Server/private/BaseRequestHandler.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@
2020
import DIRAC
2121

2222
from DIRAC import gConfig, gLogger, S_OK, S_ERROR
23+
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
2324
from DIRAC.Core.Utilities import DErrno
2425
from DIRAC.Core.DISET.AuthManager import AuthManager
2526
from DIRAC.Core.Utilities.JEncode import decode, encode
27+
from DIRAC.Core.Utilities import Network, TimeUtilities
2628
from DIRAC.Core.Utilities.ReturnValues import isReturnStructure
2729
from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error
2830
from DIRAC.Resources.IdProvider.Utilities import getProvidersForInstance
@@ -475,6 +477,12 @@ def __initialize(cls, request):
475477

476478
cls.initializeHandler(cls._componentInfoDict)
477479

480+
cls.activityMonitoringReporter = None
481+
if "Monitoring" in Operations().getMonitoringBackends(monitoringType="ServiceMonitoring"):
482+
from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
483+
484+
cls.activityMonitoringReporter = MonitoringReporter(monitoringType="ServiceMonitoring")
485+
478486
cls.__init_done = True
479487

480488
return S_OK()
@@ -662,26 +670,46 @@ def on_finish(self):
662670
Called after the end of HTTP request.
663671
Log the request duration
664672
"""
665-
elapsedTime = 1000.0 * self.request.request_time()
673+
elapsedTime = self.request.request_time()
674+
666675
credentials = self.srv_getFormattedRemoteCredentials()
667676

668677
argsString = f"OK {self._status_code}"
678+
monitoringRetStatus = "Unknown"
669679
# Finish with DIRAC result
670680
if isReturnStructure(self.__result):
671681
if self.__result["OK"]:
672682
argsString = "OK"
683+
monitoringRetStatus = "OK"
673684
else:
674685
argsString = f"ERROR: {self.__result['Message']}"
686+
monitoringRetStatus = "ERROR"
675687
if callStack := self.__result.pop("CallStack", None):
676688
argsString += "\n" + "".join(callStack)
677689
# If bad HTTP status code
678690
if self._status_code >= 400:
679691
argsString = f"ERROR {self._status_code}: {self._reason}"
680692

681693
self.log.notice(
682-
"Returning response", f"{credentials} {self._fullComponentName} ({elapsedTime:.2f} ms) {argsString}"
694+
"Returning response",
695+
f"{credentials} {self._fullComponentName} ({1000.0 * elapsedTime:.2f} ms) {argsString}",
683696
)
684697

698+
if self.activityMonitoringReporter:
699+
record = {
700+
"timestamp": int(TimeUtilities.toEpochMilliSeconds()),
701+
"Host": Network.getFQDN(),
702+
"ServiceName": "_".join(self._fullComponentName.split("/")),
703+
"Location": self.request.uri,
704+
"ResponseTime": elapsedTime,
705+
# Take the method name from the POST call
706+
"MethodName": self.request.arguments.get("method", ["Unknown"])[0],
707+
"Protocol": "https",
708+
"Status": monitoringRetStatus,
709+
}
710+
711+
self.activityMonitoringReporter.addRecord(record)
712+
685713
def _gatherPeerCredentials(self, grants: list = None) -> dict:
686714
"""Return a dictionary designed to work with the :py:class:`AuthManager <DIRAC.Core.DISET.AuthManager.AuthManager>`,
687715
already written for DISET and re-used for HTTPS.

0 commit comments

Comments
 (0)