Skip to content

Commit 5109e00

Browse files
committed
feat: pilot log download enhancement: Add remote logs download possibility
1 parent df6a89f commit 5109e00

File tree

5 files changed

+181
-9
lines changed

5 files changed

+181
-9
lines changed

src/DIRAC/Interfaces/API/DiracAdmin.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ def getJobPilotOutput(self, jobID, directory=""):
425425
426426
:param job: JobID
427427
:type job: integer or string
428+
:param str directory: a directory to download logs to.
428429
:return: S_OK,S_ERROR
429430
"""
430431
if not directory:
@@ -468,13 +469,13 @@ def getJobPilotOutput(self, jobID, directory=""):
468469

469470
#############################################################################
470471
def getPilotOutput(self, gridReference, directory=""):
471-
"""Retrieve the pilot output (std.out and std.err) for an existing job in the WMS.
472+
"""Retrieve the pilot output (std.out and std.err) for an existing pilot reference.
472473
473474
>>> gLogger.notice(dirac.getPilotOutput(gridReference))
474475
{'OK': True, 'Value': {}}
475476
476-
:param job: JobID
477-
:type job: integer or string
477+
:param str gridReference: pilot reference
478+
:param str directory: a directory to download logs to.
478479
:return: S_OK,S_ERROR
479480
"""
480481
if not isinstance(gridReference, str):
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""
2+
Pilot logging plugin abstract class.
3+
"""
4+
from abc import ABC, abstractmethod
5+
from DIRAC import S_OK, S_ERROR, gLogger
6+
7+
sLog = gLogger.getSubLogger(__name__)
8+
9+
10+
class DownloadPlugin(ABC):
    """
    Abstract base class for remote pilot log retrievers.

    A concrete plugin downloads pilot log files from a remote storage to the server. Every pilot log
    download plugin should derive from this class and implement the methods required by
    :class:`PilotManagerHandler`.
    """

    @abstractmethod
    def getRemotePilotLogs(self, pilotStamp, vo):
        """
        Retrieve the logs of a single pilot, identified by its unique stamp and its VO name.

        This base implementation does nothing; subclasses must override it.

        :param str pilotStamp: pilot stamp.
        :param str vo: VO name of a pilot which generated the logs.
        :return: S_OK or S_ERROR
        :rtype: dict
        """
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
File cache pilot log downloader.
3+
"""
4+
import os
5+
import tempfile
6+
from DIRAC import S_OK, S_ERROR, gLogger, gConfig
7+
from DIRAC.DataManagementSystem.Client.DataManager import DataManager
8+
from DIRAC.ConfigurationSystem.Client.Helpers import Registry
9+
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
10+
from DIRAC.Core.Utilities.Proxy import executeWithUserProxy
11+
from DIRAC.WorkloadManagementSystem.Client.PilotLoggingPlugins.DownloadPlugin import DownloadPlugin
12+
13+
sLog = gLogger.getSubLogger(__name__)
14+
15+
16+
class FileCacheDownloadPlugin(DownloadPlugin):
    """
    Class to handle log file download from an SE
    """

    def __init__(self):
        """
        Nothing to initialise: all settings are read from the Operations section at download time.
        """
        pass

    def getRemotePilotLogs(self, pilotStamp, vo=None):
        """
        Pilot log getter method, carrying the unique pilot identity and a VO name.

        The log is looked up as ``<Pilot/UploadPath>/<pilotStamp>.log``, downloaded into a temporary
        directory using the proxy of the ``Shifter/DataManager`` user/group defined for the VO, read into
        memory, and the temporary directory is removed before returning.

        :param str pilotStamp: pilot stamp.
        :param str vo: VO name of a user/pilot which generated the logs.
        :return: S_OK with a dict holding the log content under the "StdOut" key, or S_ERROR
        :rtype: dict
        """

        opsHelper = Operations(vo=vo)
        uploadPath = opsHelper.getValue("Pilot/UploadPath", "")
        logName = pilotStamp + ".log"
        lfn = os.path.join(uploadPath, logName)
        sLog.info("LFN to download: ", lfn)

        # get pilot credentials which uploaded logs to an external storage
        # (done before touching the filesystem, so a missing shifter leaves nothing behind):
        res = opsHelper.getOptionsDict("Shifter/DataManager")
        if not res["OK"]:
            message = f"No shifter defined for VO: {vo} - needed to retrieve the logs !"
            sLog.error(message)
            return S_ERROR(message)

        proxyUser = res["Value"].get("User")
        proxyGroup = res["Value"].get("Group")

        sLog.info(f"Proxy used for retrieving pilot logs: VO: {vo}, User: {proxyUser}, Group: {proxyGroup}")

        # The context manager guarantees the download directory (and the log copy in it) is deleted
        # on every exit path; the content is returned in memory, so nothing needs to outlive this block.
        with tempfile.TemporaryDirectory() as filepath:
            res = self._downloadLogs(  # pylint: disable=unexpected-keyword-arg
                lfn, filepath, proxyUserName=proxyUser, proxyUserGroup=proxyGroup
            )
            sLog.debug("getFile result:", res)
            if not res["OK"]:
                sLog.error("Failed to contact storage")
                return res
            if lfn in res["Value"]["Failed"]:
                sLog.error("Failed to retrieve a log file:", res["Value"]["Failed"])
                return S_ERROR(f"Failed to retrieve a log file: {res['Value']['Failed']}")

            filename = os.path.join(filepath, logName)
            try:
                with open(filename) as f:
                    stdout = f.read()
            except OSError as err:
                # OSError covers FileNotFoundError as well as e.g. permission problems,
                # so an unreadable file is reported as S_ERROR instead of raising.
                sLog.error(f"Error opening a log file:{filename}", err)
                return S_ERROR(repr(err))

        return S_OK({"StdOut": stdout})

    @executeWithUserProxy
    def _downloadLogs(self, lfn, filepath):
        """
        Download a single LFN into a local directory via the DataManager.

        The method itself takes only ``lfn`` and ``filepath``; getRemotePilotLogs additionally passes
        ``proxyUserName`` and ``proxyUserGroup`` keywords, which are presumably consumed by the
        ``executeWithUserProxy`` decorator - confirm against the decorator's documentation.

        :param str lfn: logical file name of the log file.
        :param str filepath: local destination directory.
        :return: result of ``DataManager().getFile``
        :rtype: dict
        """
        return DataManager().getFile(lfn, destinationDir=filepath)

src/DIRAC/WorkloadManagementSystem/Service/PilotManagerHandler.py

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66

77
from DIRAC import S_OK, S_ERROR
88
import DIRAC.Core.Utilities.TimeUtilities as TimeUtilities
9-
10-
from DIRAC.ConfigurationSystem.Client.Helpers import Registry
119
from DIRAC.Core.Utilities.Decorators import deprecated
10+
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
11+
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getVOForGroup
12+
from DIRAC.Core.DISET.RequestHandler import getServiceOption
1213
from DIRAC.Core.DISET.RequestHandler import RequestHandler
1314
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
14-
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getUsernameForDN, getDNForUsername
1515
from DIRAC.WorkloadManagementSystem.Client import PilotStatus
1616
from DIRAC.WorkloadManagementSystem.Service.WMSUtilities import (
1717
getPilotCE,
@@ -35,6 +35,10 @@ def initializeHandler(cls, serviceInfoDict):
3535
except RuntimeError as excp:
3636
return S_ERROR(f"Can't connect to DB: {excp}")
3737

38+
# prepare remote pilot plugin initialization
39+
defaultOption, defaultClass = "DownloadPlugin", "FileCacheDownloadPlugin"
40+
cls.configValue = getServiceOption(serviceInfoDict, defaultOption, defaultClass)
41+
cls.loggingPlugin = None
3842
return S_OK()
3943

4044
##############################################################################
@@ -92,9 +96,16 @@ def export_addPilotTQRef(cls, pilotRef, taskQueueID, ownerGroup, gridType="DIRAC
9296
types_getPilotOutput = [str]
9397

9498
def export_getPilotOutput(self, pilotReference):
95-
"""Get the pilot job standard output and standard error files for the Grid
96-
job reference
9799
"""
100+
Get the pilot job standard output and standard error files for a pilot reference.
101+
Handles both classic, CE-based logs and remote logs. The type of logs returned is determined
102+
by the server.
103+
104+
:param str pilotReference:
105+
:return: S_OK or S_ERROR Dirac object
106+
:rtype: dict
107+
"""
108+
98109
result = self.pilotAgentsDB.getPilotInfo(pilotReference)
99110
if not result["OK"]:
100111
self.log.error("Failed to get info for pilot", result["Message"])
@@ -104,6 +115,25 @@ def export_getPilotOutput(self, pilotReference):
104115
return S_ERROR("Pilot info is empty")
105116

106117
pilotDict = result["Value"][pilotReference]
118+
vo = getVOForGroup(pilotDict["OwnerGroup"])
119+
opsHelper = Operations(vo=vo)
120+
remote = opsHelper.getValue("Pilot/RemoteLogsPriority", False)
121+
funcs = [self._getRemotePilotOutput, self._getPilotOutput]
122+
if remote:
123+
funcs.reverse()
124+
125+
result = funcs[0](pilotReference, pilotDict)
126+
if not result["OK"]:
127+
self.log.warn("Pilot log retrieval failed (first attempt), remote ?", remote)
128+
result = funcs[1](pilotReference, pilotDict)
129+
return result
130+
else:
131+
return result
132+
133+
def _getPilotOutput(self, pilotReference, pilotDict):
134+
"""Get the pilot job standard output and standard error files for the Grid
135+
job reference
136+
"""
107137

108138
group = pilotDict["OwnerGroup"]
109139

@@ -158,6 +188,39 @@ def export_getPilotOutput(self, pilotReference):
158188
shutil.rmtree(ce.ceParameters["WorkingDirectory"])
159189
return S_OK(resultDict)
160190

191+
def _getRemotePilotOutput(self, pilotReference, pilotDict):
    """
    Get remote pilot log files.

    The download plugin named by ``self.configValue`` is loaded on first use and then used to fetch
    the logs for the pilot stamp found in ``pilotDict``.

    :param str pilotReference: pilot reference (not used by this method)
    :param dict pilotDict: pilot information; the "PilotStamp" and "OwnerGroup" keys are read
    :return: S_OK with the plugin's result dict, extended with "OwnerGroup" and an empty "FileList",
             or S_ERROR if the plugin cannot be loaded or the retrieval fails
    :rtype: dict
    """

    pilotStamp = pilotDict["PilotStamp"]
    group = pilotDict["OwnerGroup"]
    vo = getVOForGroup(group)

    if self.loggingPlugin is None:
        result = ObjectLoader().loadObject(
            f"WorkloadManagementSystem.Client.PilotLoggingPlugins.{self.configValue}", self.configValue
        )
        if not result["OK"]:
            self.log.error("Failed to load LoggingPlugin", f"{self.configValue}: {result['Message']}")
            return result

        componentClass = result["Value"]
        # Store the plugin on the class, not on this instance: assigning through ``self`` would only
        # shadow the class attribute for this one object, and every other handler instance would
        # still see None and load the plugin again.
        type(self).loggingPlugin = componentClass()
        self.log.info("Loaded: PilotLoggingPlugin class", self.configValue)

    res = self.loggingPlugin.getRemotePilotLogs(pilotStamp, vo)

    if res["OK"]:
        res["Value"]["OwnerGroup"] = group
        res["Value"]["FileList"] = []
    # return res, correct or not
    return res
223+
161224
##############################################################################
162225
types_getPilotInfo = [[list, str]]
163226

src/DIRAC/WorkloadManagementSystem/Service/WMSAdministratorHandler.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,6 @@ def export_getJobPilotOutput(self, jobID):
178178
job reference
179179
180180
:param str jobID: job ID
181-
182181
:return: S_OK(dict)/S_ERROR()
183182
"""
184183
pilotReference = ""

0 commit comments

Comments
 (0)