11import os
22import subprocess
33import sys
4+ import time
45from importlib .resources import files
56from pathlib import Path
67
7- import parsl
88import yaml
9- from parsl import Config , HighThroughputExecutor , python_app
10- from parsl .addresses import address_by_hostname
9+ from jinja2 import Template
1110
12- from access_mopper .executors .pbs_scheduler import SmartPBSProvider
1311from access_mopper .tracking import TaskTracker
1412
1513
@@ -24,58 +22,85 @@ def start_dashboard(dashboard_path: str, db_path: str):
2422 )
2523
2624
def create_job_script(variable, config, db_path, script_dir):
    """Render a PBS job script for CMORising a single variable.

    Parameters
    ----------
    variable : str
        Compound variable name the job will process.
    config : dict
        Parsed experiment configuration, forwarded to the template.
    db_path : str
        Path to the task-tracking database, forwarded to the template.
    script_dir : pathlib.Path
        Directory the rendered script is written into (assumed to exist).

    Returns
    -------
    pathlib.Path
        Path to the rendered, executable job script.
    """
    # Load the Jinja2 template shipped as package data.
    # NOTE: the package name must have no leading whitespace or the
    # resource lookup fails.
    template_path = files("access_mopper.templates").joinpath("cmor_job_script.j2")
    job_template = Template(template_path.read_text())

    # Repository root, so the generated job can sys.path.insert the
    # package at runtime on the compute node.
    package_path = Path(__file__).parent.parent

    script_content = job_template.render(
        variable=variable,
        config=config,
        db_path=db_path,
        script_dir=script_dir,
        package_path=package_path,
    )

    script_path = script_dir / f"cmor_{variable}.sh"
    script_path.write_text(script_content)
    # qsub requires the script to be executable.
    script_path.chmod(0o755)
    return script_path
53+
54+
def submit_job(script_path):
    """Submit a PBS job script via ``qsub``.

    Parameters
    ----------
    script_path : path-like
        Path to the job script to submit.

    Returns
    -------
    str or None
        Job ID printed by ``qsub`` on success, or ``None`` when submission
        fails (non-zero exit status or ``qsub`` not found on PATH).
    """
    try:
        result = subprocess.run(
            ["qsub", str(script_path)], capture_output=True, text=True, check=True
        )
        return result.stdout.strip()
    except FileNotFoundError:
        # qsub is not installed / not on PATH (e.g. not a PBS host);
        # report failure instead of crashing the whole submission loop.
        print(f"Failed to submit job {script_path}: qsub command not found")
        return None
    except subprocess.CalledProcessError as e:
        # Include stderr so the scheduler's rejection reason is visible.
        print(f"Failed to submit job {script_path}: {e}: {e.stderr}")
        return None
66+
67+
68+ def wait_for_jobs (job_ids , poll_interval = 30 ):
69+ """Wait for all jobs to complete and report status."""
70+ print (f"Waiting for { len (job_ids )} jobs to complete..." )
71+
72+ while job_ids :
73+ time .sleep (poll_interval )
6674
67- return f"Completed: { variable } "
68- except Exception as e :
69- # Mark as failed
75+ # Check job status
7076 try :
71- exp = config ["experiment_id" ]
72- tracker = TaskTracker (Path (db_path ))
73- tracker .mark_failed (variable , exp , str (e ))
74- except Exception :
75- pass # Don't let tracker errors mask the original error
77+ result = subprocess .run (
78+ ["qstat" , "-x" ] + job_ids ,
79+ capture_output = True ,
80+ text = True ,
81+ check = False , # qstat returns non-zero when jobs complete
82+ )
83+
84+ # Parse qstat output to see which jobs are still running
85+ still_running = []
86+ for line in result .stdout .split ("\n " ):
87+ for job_id in job_ids :
88+ if job_id in line and any (
89+ status in line for status in ["Q" , "R" , "H" ]
90+ ):
91+ still_running .append (job_id )
92+ break
93+
94+ completed = [job_id for job_id in job_ids if job_id not in still_running ]
95+ if completed :
96+ print (f"Completed jobs: { completed } " )
97+ job_ids = still_running
7698
77- # Re-raise with just the error message to avoid serialization issues
78- raise RuntimeError (f"Failed processing { variable } : { str (e )} " )
99+ except subprocess .CalledProcessError :
100+ # If qstat fails, assume all jobs are done
101+ break
102+
103+ print ("All jobs completed!" )
79104
80105
81106def main ():
@@ -98,48 +123,43 @@ def main():
98123 DASHBOARD_SCRIPT = files ("access_mopper.dashboard" ).joinpath ("cmor_dashboard.py" )
99124 start_dashboard (str (DASHBOARD_SCRIPT ), str (DB_PATH ))
100125
101- # Read resource settings from config_data, with defaults
102- cpus_per_node = config_data .get ("cpus_per_node" , 4 )
103- mem = config_data .get ("mem" , "16GB" )
104- walltime = config_data .get ("walltime" , "01:00:00" )
105- storage = config_data .get ("storage" , None )
106- nodes_per_block = config_data .get ("nodes_per_block" , 1 )
107- init_blocks = config_data .get ("init_blocks" , 1 )
108- max_blocks = config_data .get ("max_blocks" , 10 )
109- queue = config_data .get ("queue" , "normal" )
110- scheduler_options = config_data .get ("scheduler_options" , "#PBS -P your_project" )
111- worker_init = config_data .get ("worker_init" , "module load netcdf-python" )
112-
113- # Configure Parsl
114- parsl_config = Config (
115- executors = [
116- HighThroughputExecutor (
117- label = "htex_pbs" ,
118- address = address_by_hostname (),
119- provider = SmartPBSProvider (
120- queue = queue ,
121- scheduler_options = scheduler_options ,
122- worker_init = worker_init ,
123- nodes_per_block = nodes_per_block ,
124- cpus_per_node = cpus_per_node ,
125- mem = mem ,
126- storage = storage ,
127- walltime = walltime ,
128- init_blocks = init_blocks ,
129- max_blocks = max_blocks ,
130- ),
131- )
132- ],
133- strategy = "simple" ,
134- )
135-
136- parsl .load (parsl_config )
137-
138- futures = [
139- run_cmor (var , config_data , str (DB_PATH )) for var in config_data ["variables" ]
140- ]
141- results = [f .result () for f in futures ]
142- print ("\n " .join (results ))
126+ # Create directory for job scripts
127+ script_dir = Path ("cmor_job_scripts" )
128+ script_dir .mkdir (exist_ok = True )
129+
130+ # Create and submit job scripts for each variable
131+ job_ids = []
132+ variables = config_data ["variables" ]
133+
134+ print (f"Submitting { len (variables )} CMORisation jobs..." )
135+
136+ for variable in variables :
137+ # Create job script
138+ script_path = create_job_script (variable , config_data , str (DB_PATH ), script_dir )
139+ print (f"Created job script: { script_path } " )
140+
141+ # Submit job
142+ job_id = submit_job (script_path )
143+ if job_id :
144+ job_ids .append (job_id )
145+ print (f"Submitted job { job_id } for variable { variable } " )
146+ else :
147+ print (f"Failed to submit job for variable { variable } " )
148+
149+ if job_ids :
150+ print (f"\n Submitted { len (job_ids )} jobs successfully:" )
151+ for i , (var , job_id ) in enumerate (zip (variables [: len (job_ids )], job_ids )):
152+ print (f" { var } : { job_id } " )
153+
154+ print (f"\n Monitor jobs with: qstat { ' ' .join (job_ids )} " )
155+ print ("Dashboard available at: http://localhost:8501" )
156+
157+ # Optionally wait for all jobs to complete
158+ if config_data .get ("wait_for_completion" , False ):
159+ wait_for_jobs (job_ids )
160+ else :
161+ print ("No jobs were submitted successfully" )
162+ sys .exit (1 )
143163
144164
145165if __name__ == "__main__" :
0 commit comments