@@ -135,13 +135,22 @@ end
135
135
136
136
Submit a job to the PBS Pro scheduler using qsub, removing unwanted environment variables.
137
137
138
- Unset variables: "PBS_MEM_PER_CPU", "PBS_MEM_PER_GPU", "PBS_MEM_PER_NODE"
138
+ Unset variables: "PBS_MEM_PER_CPU", "PBS_MEM_PER_GPU", "PBS_MEM_PER_NODE", "PYTHONHOME", "PYTHONPATH", "PYTHONUSERBASE"
139
139
"""
140
140
function submit_pbs_job(filepath; debug = false, env = ENV)
    # Runs `qsub filepath` with a sanitized environment and returns what qsub
    # prints on stdout with the trailing newline removed.
    #
    # Keywords:
    #   env   - base environment (a dictionary of name => value pairs) for the
    #           qsub process. It is never modified: a private copy is built
    #           below. The previous default, `deepcopy(ENV)`, returned `ENV`
    #           itself because `ENV` is a field-less singleton, so deleting
    #           keys from it changed the environment of the running process.
    #   debug - accepted for interface compatibility; not used in this function.
    #
    # Throws whatever `readchomp` throws when qsub cannot be started or exits
    # with a non-zero status.

    # Variables removed so user overrides cannot break system PBS utilities
    # (e.g., python wrappers).
    unset_env_vars = (
        "PBS_MEM_PER_CPU",
        "PBS_MEM_PER_GPU",
        "PBS_MEM_PER_NODE",
        "PYTHONHOME",
        "PYTHONPATH",
        "PYTHONUSERBASE",
    )
    clean_env = Dict{String,String}(
        string(k) => string(v) for (k, v) in env if !(k in unset_env_vars)
    )
    clean_env["PYTHONNOUSERSITE"] = "1"
    return readchomp(setenv(`qsub $filepath`, clean_env))
end
@@ -172,17 +181,77 @@ wait_for_jobs(
172
181
reruns,
173
182
)
174
183
184
"""
    _qstat_output(jobid, env; retries=2, delay=0.25)

Best-effort qstat caller. Return the output of `qstat -f jobid -x` as a `String`,
or `nothing` if every attempt fails.

Each attempt first asks for the dsv format (`-F dsv`); if that command throws, the
plain format is tried. An attempt whose output is blank is retried, except on the
final attempt, where the (blank) output is returned as-is.

# Arguments
- `jobid::PBSJobID`: job identifier, interpolated into the qstat command line.
- `env`: environment passed to `setenv` for the qstat process.

# Keywords
- `retries`: number of extra attempts after the first one (negative values are
  treated as `0`).
- `delay`: seconds to sleep between attempts.
"""
function _qstat_output(jobid::PBSJobID, env; retries = 2, delay = 0.25)
    attempts = max(retries, 0) + 1
    commands = (
        setenv(`qstat -f $jobid -x -F dsv`, env),
        setenv(`qstat -f $jobid -x`, env),
    )
    for i in 1:attempts
        out = nothing
        for cmd in commands
            out = try
                readchomp(cmd)
            catch err
                # Deliberately best-effort: record the failure and fall through
                # to the next format / next attempt instead of propagating.
                @debug "qstat invocation failed" jobid attempt = i exception = err
                nothing
            end
            # Only fall back to the plain format when the dsv call threw.
            isnothing(out) || break
        end
        if !isnothing(out) && (i == attempts || !isempty(strip(out)))
            return out
        end
        i < attempts && sleep(delay)
    end
    return nothing
end
216
+
175
217
"""
    job_status(jobid::PBSJobID)

Query qstat for `jobid` and return its status as a `Symbol`.

# Returns
- `:COMPLETED` when the reported `job_state` is `"F"`.
- `:FAILED` when the `job_state` is `"F"` and the reported `substate` is 91 or 93.
- `:RUNNING` for every other state, and also (with a warning) when qstat could not
  be queried or its output contains no `job_state` field.
"""
function job_status(jobid::PBSJobID)
    # Call qstat with a sanitized environment to avoid user Python interfering
    # with PBS wrappers. `copy(ENV)` yields an independent Dict; `deepcopy(ENV)`
    # would return `ENV` itself and the edits below would leak into this process.
    clean_env = copy(ENV)
    for k in ("PYTHONHOME", "PYTHONPATH", "PYTHONUSERBASE")
        delete!(clean_env, k)
    end
    clean_env["PYTHONNOUSERSITE"] = "1"

    status_str = _qstat_output(jobid, clean_env)
    if isnothing(status_str)
        @warn "qstat failed for job $jobid; assuming job is running"
        return :RUNNING
    end

    # Support both dsv (`key=value|...`) and plain (`key = value` per line) formats
    job_state_match = match(r"job_state\s*=\s*([^|\n\r]+)", status_str)
    if isnothing(job_state_match)
        @warn "Job status for $jobid not found in qstat output. Assuming job is running"
        return :RUNNING
    end

    # Only "F" maps to a terminal status; any other state means the job still exists.
    strip(first(job_state_match.captures)) == "F" || return :RUNNING

    # A missing or unparsable substate is treated as 0, i.e. not a failure.
    substate_match = match(r"substate\s*=\s*(\d+)", status_str)
    substate_number = if isnothing(substate_match)
        0
    else
        something(tryparse(Int, first(substate_match.captures)), 0)
    end

    return substate_number in (91, 93) ? :FAILED : :COMPLETED
end
0 commit comments