Skip to content

Commit 69b34ca

Browse files
committed
Use prefect task to execute jobs, download files. Add flow and run names
1 parent b9c466b commit 69b34ca

File tree

3 files changed

+23
-32
lines changed

3 files changed

+23
-32
lines changed

jupyter_scheduler/executors.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def on_complete_workflow(self):
188188
class DefaultExecutionManager(ExecutionManager):
189189
"""Default execution manager that executes notebooks"""
190190

191-
@task
191+
@task(name="Execute workflow task")
192192
def execute_task(self, job: Job):
193193
with self.db_session() as session:
194194
staging_paths = Scheduler.get_staging_paths(DescribeJob.from_orm(job))
@@ -206,14 +206,14 @@ def execute_task(self, job: Job):
206206

207207
return job_id
208208

209-
@task
209+
@task(name="Get workflow task records")
210210
def get_tasks_records(self, task_ids: List[str]) -> List[Job]:
211211
with self.db_session() as session:
212212
tasks = session.query(Job).filter(Job.job_id.in_(task_ids)).all()
213213

214214
return tasks
215215

216-
@flow
216+
@flow(name="Execute workflow", flow_run_name="Execute workflow run")
217217
def execute_workflow(self):
218218
tasks_info: List[Job] = self.get_tasks_records(self.model.tasks)
219219
tasks = {task.job_id: task for task in tasks_info}
@@ -232,6 +232,7 @@ def make_task(task_id):
232232
for future in as_completed(final_tasks):
233233
future.result()
234234

235+
@flow(name="Execute job", flow_run_name="Execute job run")
235236
def execute(self):
236237
job = self.model
237238

@@ -254,6 +255,7 @@ def execute(self):
254255
self.add_side_effects_files(staging_dir)
255256
self.create_output_files(job, nb)
256257

258+
@task(name="Check for and add side effect files")
257259
def add_side_effects_files(self, staging_dir: str):
258260
"""Scan for side effect files potentially created after input file execution and update the job's packaged_files with these files"""
259261
input_notebook = os.path.relpath(self.staging_paths["input"])
@@ -276,6 +278,7 @@ def add_side_effects_files(self, staging_dir: str):
276278
)
277279
session.commit()
278280

281+
@task(name="Create output files")
279282
def create_output_files(self, job: DescribeJob, notebook_node):
280283
for output_format in job.output_formats:
281284
cls = nbconvert.get_exporter(output_format)

jupyter_scheduler/job_files_manager.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import fsspec
88
from jupyter_server.utils import ensure_async
9+
from prefect import task
910

1011
from jupyter_scheduler.exceptions import SchedulerError
1112
from jupyter_scheduler.scheduler import BaseScheduler
@@ -23,17 +24,14 @@ async def copy_from_staging(self, job_id: str, redownload: Optional[bool] = Fals
2324
output_filenames = self.scheduler.get_job_filenames(job)
2425
output_dir = self.scheduler.get_local_output_path(model=job, root_dir_relative=True)
2526

26-
p = Process(
27-
target=Downloader(
28-
output_formats=job.output_formats,
29-
output_filenames=output_filenames,
30-
staging_paths=staging_paths,
31-
output_dir=output_dir,
32-
redownload=redownload,
33-
include_staging_files=job.package_input_folder,
34-
).download
35-
)
36-
p.start()
27+
target = Downloader(
28+
output_formats=job.output_formats,
29+
output_filenames=output_filenames,
30+
staging_paths=staging_paths,
31+
output_dir=output_dir,
32+
redownload=redownload,
33+
include_staging_files=job.package_input_folder,
34+
).download
3735

3836

3937
class Downloader:
@@ -77,6 +75,7 @@ def download_tar(self, archive_format: str = "tar"):
7775
with tarfile.open(fileobj=f, mode=read_mode) as tar:
7876
tar.extractall(self.output_dir)
7977

78+
@task(name="Download job files")
8079
def download(self):
8180
# ensure presence of staging paths
8281
if not self.staging_paths:

jupyter_scheduler/scheduler.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import multiprocessing as mp
21
import os
32
import random
43
import shutil
@@ -526,25 +525,15 @@ def create_job(self, model: CreateJob, run: bool = True) -> str:
526525

527526
def run_job(self, job: Job, staging_paths: Dict[str, str]) -> str:
528527
with self.db_session() as session:
529-
# The MP context forces new processes to not be forked on Linux.
530-
# This is necessary because `asyncio.get_event_loop()` is bugged in
531-
# forked processes in Python versions below 3.12. This method is
532-
# called by `jupyter_core` by `nbconvert` in the default executor.
533-
#
534-
# See: https://github.com/python/cpython/issues/66285
535-
# See also: https://github.com/jupyter/jupyter_core/pull/362
536-
mp_ctx = mp.get_context("spawn")
537-
p = mp_ctx.Process(
538-
target=self.execution_manager_class(
539-
job_id=job.job_id,
540-
staging_paths=staging_paths,
541-
root_dir=self.root_dir,
542-
db_url=self.db_url,
543-
).process
528+
execution_manager = self.execution_manager_class(
529+
job_id=job.job_id,
530+
staging_paths=staging_paths,
531+
root_dir=self.root_dir,
532+
db_url=self.db_url,
544533
)
545-
p.start()
534+
execution_manager.process()
546535

547-
job.pid = p.pid
536+
job.pid = 1 # TODO: fix pid hardcode
548537
session.commit()
549538

550539
job_id = job.job_id

0 commit comments

Comments (0)