add run_local_async with desired async behavior (PolusAI#336)

vjaganat90 · web-flow · commit 7d4bb80ee7d2 · 2025-06-16T10:30:56.000-04:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -69,6 +69,7 @@ repos:
         - types-setuptools
         - types-six
         - types-urllib3
+        - types-aiofiles
 -   repo: https://github.com/pre-commit/pygrep-hooks
     rev: v1.10.0
     hooks:
diff --git a/install/system_deps.yml b/install/system_deps.yml
@@ -39,5 +39,3 @@ dependencies:
 # Needs binary PyQt5 dependencies.
   - kubernetes-helm
   - zstandard
-# # Needed for orjson wheels
-#   - orjson
diff --git a/pyproject.toml b/pyproject.toml
@@ -96,6 +96,7 @@ mypy-types = [
     "types-setuptools",
     "types-six",
     "types-urllib3",
+    "types-aiofiles"
 ]
 # NOTE: toil and cwltool have a somewhat continuous release model. However,
 # there can be updates and bugfixes in main that have not yet been released.
diff --git a/src/sophios/run_local.py b/src/sophios/run_local.py
@@ -9,11 +9,8 @@
 import shutil
 import platform
 import traceback
-import yaml
 from typing import Dict, List, Optional
 from datetime import datetime
-import sophios.post_compile as pc
-from sophios.wic_types import Json
 
 try:
     import cwltool.main
@@ -304,7 +301,7 @@ def copy_output_files(yaml_stem: str, basepath: str = '') -> None:
     Args:
         yaml_stem (str): The --yaml filename (without .extension)
     """
-    output_json_file = Path(f'output_{yaml_stem}.json')
+    output_json_file = Path(f'{basepath}/output_{yaml_stem}.json')
     if output_json_file.exists():
         pass  # TODO
 
@@ -386,7 +383,7 @@ def build_cmd(workflow_name: str, basepath: str, cwl_runner: str, container_cmd:
     elif cwl_runner == 'toil-cwl-runner':
         container_pull = []
         now = datetime.now()
-        date_time = now.strftime("%Y%m%d%H%M%S")
+        date_time = now.strftime("%Y_%m_%d_%H.%M.%S")
         cmd = [script] + container_pull + provenance + container_cmd_ + path_check
         cmd += ['--outdir', f'{basepath}/outdir_toil_{date_time}',
                 '--jobStore', f'file:{basepath}/jobStore_{workflow_name}',  # NOTE: This is the equivalent of --cachedir
@@ -401,90 +398,6 @@ def build_cmd(workflow_name: str, basepath: str, cwl_runner: str, container_cmd:
     return cmd
 
 
-def run_cwl_workflow(workflow_name: str, basepath: str, cwl_runner: str, container_cmd: str, use_subprocess: bool, env_commands: List[str] = []) -> int:
-    """Run the CWL workflow in an environment
-
-    Args:
-        workflow_name (str): Name of the .cwl workflow file to be executed
-        basepath (str): The path at which the workflow to be executed
-        cwl_runner (str): The CWL runner used to execute the workflow
-        container_cmd (str): The container engine command
-        use_subprocess (bool): When using cwltool, determines whether to use subprocess.run(...)
-        or use the cwltool python api.
-        env_commands (List[str]): environment variables and commands needed to be run before running the workflow
-    Returns:
-        retval: The return value
-    """
-    cmd = build_cmd(workflow_name, basepath, cwl_runner, container_cmd)
-    cmdline = ' '.join(cmd)
-
-    retval = 1  # overwrite on success
-    print('Running ' + cmdline)
-    if use_subprocess:
-        # To run in parallel (i.e. pytest ... --workers 8 ...), we need to
-        # use separate processes. Otherwise:
-        # "signal only works in main thread or with __pypy__.thread.enable_signals()"
-        proc = sub.run(cmd, check=False)
-        retval = proc.returncode
-    else:
-        print('via cwltool.main.main python API')
-        try:
-            if cwl_runner == 'cwltool':
-                retval = cwltool.main.main(cmd[1:])
-            elif cwl_runner == 'toil-cwl-runner':
-                _ = sub.run(env_commands, shell=True, check=False, executable="/bin/bash")
-                retval = toil.cwl.cwltoil.main(cmd[1:])
-            else:
-                raise Exception("Invalid cwl_runner!")
-
-            print(f'Final output json metadata blob is in output_{workflow_name}.json')
-        except Exception as e:
-            print('Failed to execute', workflow_name)
-            print(f'See error_{workflow_name}.txt for detailed technical information.')
-            # Do not display a nasty stack trace to the user; hide it in a file.
-            with open(f'error_{workflow_name}.txt', mode='w', encoding='utf-8') as f:
-                # https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks
-                if sys.version_info >= (3, 10):
-                    traceback.print_exception(type(e), value=e, tb=None, file=f)
-            print(e)  # we are always running this on CI
-    # only copy output files if using cwltool
-    if cwl_runner == 'cwltool':
-        copy_output_files(workflow_name, basepath=basepath)
-    return retval
-
-
-async def run_cwl_serialized_async(workflow: Json, basepath: str,
-                                   cwl_runner: str, container_cmd: str,
-                                   env_commands: List[str] = []) -> None:
-    """Prepare and run compiled and serialized CWL workflow asynchronously
-
-    Args:
-        workflow_json (Json): Compiled and serialized CWL workflow
-        basepath (str): The path at which the workflow to be executed
-        cwl_runner (str): The CWL runner used to execute the workflow
-        container_cmd (str): The container engine command
-        env_commands (List[str]): environment variables and commands needed to be run before running the workflow
-    """
-    workflow_name = workflow['name']
-    basepath = basepath.rstrip("/") if basepath != "/" else basepath
-    output_dirs = pc.find_output_dirs(workflow)
-    pc.create_output_dirs(output_dirs, basepath)
-    compiled_cwl = workflow_name + '.cwl'
-    inputs_yml = workflow_name + '_inputs.yml'
-    # write _input.yml file
-    with open(Path(basepath) / inputs_yml, 'w', encoding='utf-8') as f:
-        yaml.dump(workflow['yaml_inputs'], f)
-    workflow.pop('retval', None)
-    workflow.pop('yaml_inputs', None)
-    workflow.pop('name', None)
-    # write compiled .cwl file
-    with open(Path(basepath) / compiled_cwl, 'w', encoding='utf-8') as f:
-        yaml.dump(workflow, f)
-    retval = run_cwl_workflow(workflow_name, basepath,
-                              cwl_runner, container_cmd, False, env_commands=env_commands)
-    assert retval == 0
-
-
 def stage_input_files(yml_inputs: Yaml, root_yml_dir_abs: Path,
                       relative_run_path: bool = True, throw: bool = True) -> None:
     """Copies the input files in yml_inputs to the working directory.
diff --git a/src/sophios/run_local_async.py b/src/sophios/run_local_async.py
@@ -0,0 +1,125 @@
+from pathlib import Path
+import traceback
+import os
+from typing import Optional, Dict, Any
+import asyncio
+import aiofiles
+import yaml
+# we are already using fastapi elsewhere in this project
+# so use the run_in_threadpool to run sequential functions
+# without blocking the main event loop
+from fastapi.concurrency import run_in_threadpool
+
+import sophios.post_compile as pc
+from sophios.wic_types import Json
+from .run_local import build_cmd, copy_output_files
+
+
+def create_safe_env(user_env: Dict[str, str]) -> dict:
+    """Return os.environ overlaid with user_env, without modifying os.environ
+
+    Args:
+        user_env (Dict[str, str]): Caller-supplied environment variables
+    Returns:
+        dict: A new mapping containing every os.environ entry, with the
+        entries of user_env taking precedence
+    Raises:
+        ValueError: If a key in user_env starts with one of the blocked prefixes
+    """
+    # str.startswith accepts a tuple, so one call covers every blocked prefix.
+    blocked_prefixes = ("PATH", "LD_", "PYTHON", "SECRET_", "BASH_ENV")
+    for name in user_env:
+        if name.startswith(blocked_prefixes):
+            raise ValueError(f"Prohibited key: {name}")
+    merged = dict(os.environ)
+    merged.update(user_env)
+    return merged
+
+
+async def run_cwl_workflow(workflow_name: str, basepath: str,
+                           cwl_runner: str, container_cmd: str,
+                           user_env: Dict[str, str]) -> Optional[int]:
+    """Run the CWL workflow as a subprocess without blocking the event loop
+
+    The runner's stdout and stderr are streamed to LOGS/stdout.txt and
+    LOGS/stderr.txt under basepath. Exceptions are not propagated: they are
+    printed and written to error_<workflow_name>.txt in the current directory.
+
+    Args:
+        workflow_name (str): Name of the .cwl workflow file to be executed
+        basepath (str): The path at which the workflow to be executed
+        cwl_runner (str): The CWL runner used to execute the workflow;
+        only 'cwltool' and 'toil-cwl-runner' are accepted
+        container_cmd (str): The container engine command
+        user_env (Dict[str, str]): Extra environment variables for the runner
+        process; validated and merged with os.environ by create_safe_env
+    Returns:
+        retval: The runner's exit code, or 1 if the runner was not started
+        or an exception occurred before its exit code was collected
+    """
+    cmd = await run_in_threadpool(build_cmd, workflow_name, basepath, cwl_runner, container_cmd)
+
+    async def stream_to_file(stream: Any, filename: Path) -> None:
+        """Drain stream into filename until EOF"""
+        filename.parent.mkdir(parents=True, exist_ok=True)
+        async with aiofiles.open(filename, mode='wb') as f:
+            while True:
+                data = await stream.read(1024)  # 1KB chunks
+                if not data:
+                    break
+                await f.write(data)
+
+    retval = 1  # overwrite on success
+    print('Running ' + (' '.join(cmd)))
+    print('via command line')
+    runner_cmnds = ['cwltool', 'toil-cwl-runner']
+    proc = None
+    try:
+        if cwl_runner not in runner_cmnds:
+            raise ValueError(
+                f'Invalid or Unsupported cwl_runner command! Only these are the supported runners {runner_cmnds}')
+
+        print(f'Setting env vars : {user_env}')
+        exec_env = create_safe_env(user_env)
+
+        proc = await asyncio.create_subprocess_exec(*cmd,
+                                                    env=exec_env,
+                                                    stdout=asyncio.subprocess.PIPE,
+                                                    stderr=asyncio.subprocess.PIPE)
+
+        # Drain both pipes concurrently so that neither one can fill up
+        # and stall the runner while the other is being read.
+        log_dir = Path(basepath) / 'LOGS'
+        await asyncio.gather(
+            stream_to_file(proc.stdout, log_dir / 'stdout.txt'),
+            stream_to_file(proc.stderr, log_dir / 'stderr.txt')
+        )
+        retval = await proc.wait()
+
+    except Exception as e:
+        print('Failed to execute', workflow_name)
+        print(
+            f'See error_{workflow_name}.txt for detailed technical information.')
+        # Do not display a nasty stack trace to the user; hide it in a file.
+        with open(f'error_{workflow_name}.txt', mode='w', encoding='utf-8') as f:
+            traceback.print_exception(type(e), value=e, tb=None, file=f)
+        print(e)  # we are always running this on CI
+    finally:
+        # If log streaming failed or this task was cancelled, the runner may
+        # still be alive with nobody reading its pipes; do not leave it orphaned.
+        if proc is not None and proc.returncode is None:
+            try:
+                proc.kill()
+            except ProcessLookupError:
+                pass  # the process exited on its own in the meantime
+            await proc.wait()
+    # only copy output files if using cwltool
+    if cwl_runner == 'cwltool':
+        await run_in_threadpool(copy_output_files, workflow_name, basepath=basepath)
+    return retval
+
+
+def _dump_yaml_file(data: Any, path: Path) -> None:
+    """Write data as YAML to path, closing the file before returning"""
+    with open(path, 'w', encoding='utf-8') as f:
+        yaml.dump(data, f)
+
+
+async def run_cwl_serialized(workflow: Json, basepath: str,
+                             cwl_runner: str, container_cmd: str,
+                             user_env: Dict[str, str]) -> None:
+    """Prepare and run compiled and serialized CWL workflow asynchronously
+
+    Writes <name>_inputs.yml and <name>.cwl under basepath, then runs the
+    workflow. The 'retval', 'yaml_inputs' and 'name' keys are removed from
+    the passed-in workflow object as a side effect.
+
+    Args:
+        workflow (Json): Compiled and serialized CWL workflow
+        basepath (str): The path at which the workflow to be executed
+        cwl_runner (str): The CWL runner used to execute the workflow
+        container_cmd (str): The container engine command
+        user_env (Dict[str, str]): Extra environment variables passed
+        through to the workflow runner process
+    Raises:
+        AssertionError: If the workflow runner does not return 0
+    """
+    workflow_name = workflow['name']
+    basepath = basepath.rstrip("/") if basepath != "/" else basepath
+    output_dirs = pc.find_output_dirs(workflow)
+    pc.create_output_dirs(output_dirs, basepath)
+    compiled_cwl = workflow_name + '.cwl'
+    inputs_yml = workflow_name + '_inputs.yml'
+    # write _input.yml file
+    # The file is opened, written and closed inside the worker thread, so the
+    # handle cannot leak and the contents are flushed before the runner starts.
+    await run_in_threadpool(_dump_yaml_file, workflow['yaml_inputs'],
+                            Path(basepath) / inputs_yml)
+
+    # clean up the object of tags and data that we don't need anymore
+    workflow.pop('retval', None)
+    workflow.pop('yaml_inputs', None)
+    workflow.pop('name', None)
+
+    # write compiled .cwl file
+    await run_in_threadpool(_dump_yaml_file, workflow,
+                            Path(basepath) / compiled_cwl)
+
+    retval = await run_cwl_workflow(workflow_name, basepath,
+                                    cwl_runner, container_cmd, user_env=user_env)
+    # Raised explicitly (rather than via `assert`) so that the check
+    # is not stripped when Python runs with -O.
+    if retval != 0:
+        raise AssertionError(f'{cwl_runner} returned {retval} for {workflow_name}')

Original file line number	Diff line number	Diff line change
`@@ -96,6 +96,7 @@ mypy-types = [`
`96`	`96`	`"types-setuptools",`
`97`	`97`	`"types-six",`
`98`	`98`	`"types-urllib3",`
	`99`	`+ "types-aiofiles"`
`99`	`100`	`]`
`100`	`101`	`# NOTE: toil and cwltool have a somewhat continuous release model. However,`
`101`	`102`	`# there can be updates and bugfixes in main that have not yet been released.`