11import os
22import subprocess
33import sys
4+ import time
45from importlib .resources import files
56from pathlib import Path
67
7- import parsl
88import yaml
9- from parsl import Config , HighThroughputExecutor , python_app
10- from parsl .addresses import address_by_hostname
9+ from jinja2 import Template
1110
12- from access_mopper .executors .pbs_scheduler import SmartPBSProvider
1311from access_mopper .tracking import TaskTracker
1412
1513
@@ -24,58 +22,85 @@ def start_dashboard(dashboard_path: str, db_path: str):
2422 )
2523
2624
def create_job_script(variable, config, db_path, script_dir):
    """Render a PBS job script for CMORising a single variable.

    Parameters
    ----------
    variable : str
        Compound variable name the job will process.
    config : dict
        Parsed experiment configuration, forwarded to the template.
    db_path : str
        Path to the task-tracking database, forwarded to the template.
    script_dir : pathlib.Path
        Directory the rendered script is written into (assumed to exist).

    Returns
    -------
    pathlib.Path
        Path to the rendered, executable job script.
    """
    # Load the Jinja2 template shipped as package data.
    # NOTE: the package name must have no leading whitespace or the
    # resource lookup fails.
    template_path = files("access_mopper.templates").joinpath("cmor_job_script.j2")
    job_template = Template(template_path.read_text())

    # Repository root, so the generated job can sys.path.insert the
    # package at runtime on the compute node.
    package_path = Path(__file__).parent.parent

    script_content = job_template.render(
        variable=variable,
        config=config,
        db_path=db_path,
        script_dir=script_dir,
        package_path=package_path,
    )

    script_path = script_dir / f"cmor_{variable}.sh"
    script_path.write_text(script_content)
    # qsub requires the script to be executable.
    script_path.chmod(0o755)
    return script_path
53+
54+
def submit_job(script_path):
    """Submit a PBS job script via ``qsub``.

    Parameters
    ----------
    script_path : path-like
        Path to the job script to submit.

    Returns
    -------
    str or None
        Job ID printed by ``qsub`` on success, or ``None`` when submission
        fails (non-zero exit status or ``qsub`` not found on PATH).
    """
    try:
        result = subprocess.run(
            ["qsub", str(script_path)], capture_output=True, text=True, check=True
        )
        return result.stdout.strip()
    except FileNotFoundError:
        # qsub is not installed / not on PATH (e.g. not a PBS host);
        # report failure instead of crashing the whole submission loop.
        print(f"Failed to submit job {script_path}: qsub command not found")
        return None
    except subprocess.CalledProcessError as e:
        # Include stderr so the scheduler's rejection reason is visible.
        print(f"Failed to submit job {script_path}: {e}: {e.stderr}")
        return None
66+
67+
68+ def wait_for_jobs (job_ids , poll_interval = 30 ):
69+ """Wait for all jobs to complete and report status."""
70+ print (f"Waiting for { len (job_ids )} jobs to complete..." )
71+
72+ while job_ids :
73+ time .sleep (poll_interval )
6674
67- return f"Completed: { variable } "
68- except Exception as e :
69- # Mark as failed
75+ # Check job status
7076 try :
71- exp = config ["experiment_id" ]
72- tracker = TaskTracker (Path (db_path ))
73- tracker .mark_failed (variable , exp , str (e ))
74- except Exception :
75- pass # Don't let tracker errors mask the original error
77+ result = subprocess .run (
78+ ["qstat" , "-x" ] + job_ids ,
79+ capture_output = True ,
80+ text = True ,
81+ check = False , # qstat returns non-zero when jobs complete
82+ )
83+
84+ # Parse qstat output to see which jobs are still running
85+ still_running = []
86+ for line in result .stdout .split ("\n " ):
87+ for job_id in job_ids :
88+ if job_id in line and any (
89+ status in line for status in ["Q" , "R" , "H" ]
90+ ):
91+ still_running .append (job_id )
92+ break
93+
94+ completed = [job_id for job_id in job_ids if job_id not in still_running ]
95+ if completed :
96+ print (f"Completed jobs: { completed } " )
97+ job_ids = still_running
7698
77- # Re-raise with just the error message to avoid serialization issues
78- raise RuntimeError (f"Failed processing { variable } : { str (e )} " )
99+ except subprocess .CalledProcessError :
100+ # If qstat fails, assume all jobs are done
101+ break
102+
103+ print ("All jobs completed!" )
79104
80105
81106def main ():
@@ -98,48 +123,43 @@ def main():
98123 DASHBOARD_SCRIPT = files ("access_mopper.dashboard" ).joinpath ("cmor_dashboard.py" )
99124 start_dashboard (str (DASHBOARD_SCRIPT ), str (DB_PATH ))
100125
101- # Read resource settings from config_data, with defaults
102- cpus_per_node = config_data .get ("cpus_per_node" , 4 )
103- mem = config_data .get ("mem" , "16GB" )
104- walltime = config_data .get ("walltime" , "01:00:00" )
105- storage = config_data .get ("storage" , None )
106- nodes_per_block = config_data .get ("nodes_per_block" , 1 )
107- init_blocks = config_data .get ("init_blocks" , 1 )
108- max_blocks = config_data .get ("max_blocks" , 10 )
109- queue = config_data .get ("queue" , "normal" )
110- scheduler_options = config_data .get ("scheduler_options" , "#PBS -P your_project" )
111- worker_init = config_data .get ("worker_init" , "module load netcdf-python" )
112-
113- # Configure Parsl
114- parsl_config = Config (
115- executors = [
116- HighThroughputExecutor (
117- label = "htex_pbs" ,
118- address = address_by_hostname (),
119- provider = SmartPBSProvider (
120- queue = queue ,
121- scheduler_options = scheduler_options ,
122- worker_init = worker_init ,
123- nodes_per_block = nodes_per_block ,
124- cpus_per_node = cpus_per_node ,
125- mem = mem ,
126- storage = storage ,
127- walltime = walltime ,
128- init_blocks = init_blocks ,
129- max_blocks = max_blocks ,
130- ),
131- )
132- ],
133- strategy = "simple" ,
134- )
135-
136- parsl .load (parsl_config )
137-
138- futures = [
139- run_cmor (var , config_data , str (DB_PATH )) for var in config_data ["variables" ]
140- ]
141- results = [f .result () for f in futures ]
142- print ("\n " .join (results ))
126+ # Create directory for job scripts
127+ script_dir = Path ("cmor_job_scripts" )
128+ script_dir .mkdir (exist_ok = True )
129+
130+ # Create and submit job scripts for each variable
131+ job_ids = []
132+ variables = config_data ["variables" ]
133+
134+ print (f"Submitting { len (variables )} CMORisation jobs..." )
135+
136+ for variable in variables :
137+ # Create job script
138+ script_path = create_job_script (variable , config_data , str (DB_PATH ), script_dir )
139+ print (f"Created job script: { script_path } " )
140+
141+ # Submit job
142+ job_id = submit_job (script_path )
143+ if job_id :
144+ job_ids .append (job_id )
145+ print (f"Submitted job { job_id } for variable { variable } " )
146+ else :
147+ print (f"Failed to submit job for variable { variable } " )
148+
149+ if job_ids :
150+ print (f"\n Submitted { len (job_ids )} jobs successfully:" )
151+ for i , (var , job_id ) in enumerate (zip (variables [: len (job_ids )], job_ids )):
152+ print (f" { var } : { job_id } " )
153+
154+ print (f"\n Monitor jobs with: qstat { ' ' .join (job_ids )} " )
155+ print ("Dashboard available at: http://localhost:8501" )
156+
157+ # Optionally wait for all jobs to complete
158+ if config_data .get ("wait_for_completion" , False ):
159+ wait_for_jobs (job_ids )
160+ else :
161+ print ("No jobs were submitted successfully" )
162+ sys .exit (1 )
143163
144164
145165if __name__ == "__main__" :
0 commit comments