refactor and update in line with comments

andrii-i · andrii-i · commit f8a97c3b050a · 2024-05-14T16:47:39.000-07:00
diff --git a/jupyter_scheduler/executors.py b/jupyter_scheduler/executors.py
@@ -1,3 +1,4 @@
+import base64
 import io
 import os
 import shutil
@@ -138,28 +139,15 @@ def execute(self):
             kernel_name=nb.metadata.kernelspec["name"], store_widget_state=True, cwd=staging_dir
         )
 
-        mlflow.set_tracking_uri(MLFLOW_SERVER_URI)
-        with mlflow.start_run(run_id=job.mlflow_run_id):
-            try:
-                ep.preprocess(nb, {"metadata": {"path": staging_dir}})
-                if job.parameters:
-                    mlflow.log_params(job.parameters)
-
-                for idx, cell in enumerate(nb.cells):
-                    if "tags" in cell.metadata and "mlflow_log" in cell.metadata["tags"]:
-                        mlflow.log_text(cell.source, f"source_cell_{idx}.txt")
-                        if cell.cell_type == "code" and cell.outputs:
-                            for output in cell.outputs:
-                                if "text/plain" in output.data:
-                                    mlflow.log_text(
-                                        output.data["text/plain"], f"output_cell_{idx}.txt"
-                                    )
-
-            except CellExecutionError as e:
-                raise e
-            finally:
-                self.add_side_effects_files(staging_dir)
-                self.create_output_files(job, nb)
+        try:
+            ep.preprocess(nb, {"metadata": {"path": staging_dir}})
+        except CellExecutionError as e:
+            raise e
+        finally:
+            self.add_side_effects_files(staging_dir)
+            self.create_output_files(job, nb)
+            if getattr(job, "mlflow_logging", False):
+                self.log_to_mlflow(job, nb)
 
     def add_side_effects_files(self, staging_dir: str):
         """Scan for side effect files potentially created after input file execution and update the job's packaged_files with these files"""
@@ -187,10 +175,105 @@ def create_output_files(self, job: DescribeJob, notebook_node):
         for output_format in job.output_formats:
             cls = nbconvert.get_exporter(output_format)
             output, _ = cls().from_notebook_node(notebook_node)
-            output_path = self.staging_paths[output_format]
-            with fsspec.open(output_path, "w", encoding="utf-8") as f:
+            with fsspec.open(self.staging_paths[output_format], "w", encoding="utf-8") as f:
                 f.write(output)
-            mlflow.log_artifact(output_path)
+
+    def log_to_mlflow(self, job, nb):
+        """Log the job's parameters, tagged cells and output files to its MLflow run.
+
+        Resumes the run ``job.mlflow_run_id`` on the server at ``MLFLOW_SERVER_URI``
+        and records, in this order:
+
+        1. ``job.parameters`` as run parameters, when there are any;
+        2. every cell tagged ``mlflow_log`` (input and output), ``mlflow_log_input``
+           (input only) or ``mlflow_log_output`` (output only); when a cell carries
+           several of these tags, the first one in that order wins;
+        3. one artifact per entry of ``job.output_formats``, stored under the staged
+           file name with its trailing timestamp removed, plus that timestamp as the
+           ``job_created`` parameter.
+
+        Args:
+            job: Job description; ``mlflow_run_id``, ``parameters`` and
+                ``output_formats`` are read from it.
+            nb: Executed notebook whose ``cells`` are scanned for the tags above.
+        """
+        # Function-scope import so the method does not depend on a file-level import.
+        import tempfile
+
+        mlflow.set_tracking_uri(MLFLOW_SERVER_URI)
+        with mlflow.start_run(run_id=job.mlflow_run_id):
+            if job.parameters:
+                mlflow.log_params(job.parameters)
+
+            for cell_idx, cell in enumerate(nb.cells):
+                if "tags" not in cell.metadata:
+                    continue
+                tags = cell.metadata["tags"]
+                if "mlflow_log" in tags:
+                    self.mlflow_log(cell, cell_idx)
+                elif "mlflow_log_input" in tags:
+                    self.mlflow_log_input(cell, cell_idx)
+                elif "mlflow_log_output" in tags:
+                    self.mlflow_log_output(cell, cell_idx)
+
+            for output_format in job.output_formats:
+                output_path = self.staging_paths[output_format]
+                file_name, file_extension = os.path.splitext(os.path.basename(output_path))
+                # Assumes the staged file stem ends in seven dash-separated timestamp
+                # fields -- TODO confirm against the code that builds staging_paths.
+                file_name_parts = file_name.split("-")
+                timestamp = "-".join(file_name_parts[-7:]).split(".")[0]
+                mlflow.log_param("job_created", timestamp)
+
+                artifact_name = "-".join(file_name_parts[:-7]) + file_extension
+                # MLflow names the artifact after the local file, so a renamed copy is
+                # needed. It is made in a throwaway directory: a copy placed in the
+                # staging directory could overwrite (and then delete) an existing file
+                # of the same name, and would be left behind if log_artifact raised.
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    renamed_copy = os.path.join(tmp_dir, artifact_name)
+                    shutil.copy(output_path, renamed_copy)
+                    mlflow.log_artifact(renamed_copy, "")
+
+    def mlflow_log(self, cell, cell_idx):
+        """Log both the input (source) and the output of ``cell``, in that order."""
+        for log_part in (self.mlflow_log_input, self.mlflow_log_output):
+            log_part(cell, cell_idx)
+
+    def mlflow_log_input(self, cell, cell_idx):
+        """Log the cell's source text to MLflow as ``cell_<cell_idx>_input.txt``."""
+        source_text = cell.source
+        artifact_file = f"cell_{cell_idx}_input.txt"
+        mlflow.log_text(source_text, artifact_file)
+
+    def mlflow_log_output(self, cell, cell_idx):
+        """Log the output of ``cell`` according to its cell type.
+
+        Markdown cells are handed to ``_log_markdown_output``; code cells that have
+        an ``outputs`` attribute are handed to ``_log_code_output``. Any other cell
+        is ignored.
+        """
+        kind = cell.cell_type
+        if kind == "markdown":
+            self._log_markdown_output(cell, cell_idx)
+            return
+        if kind != "code" or not hasattr(cell, "outputs"):
+            return
+        self._log_code_output(cell_idx, cell.outputs)
+
+    def _log_code_output(self, cell_idx, outputs):
+        """Log every output of a code cell to MLflow.
+
+        Stream outputs are logged as text. For an output that carries a ``data``
+        mapping, each entry whose MIME type is ``text/plain``, ``text/html``,
+        ``application/pdf`` or starts with ``image`` is logged; other MIME types and
+        outputs without ``data`` are skipped.
+
+        Artifacts are named ``cell_<cell_idx>_output_<output_idx>.<ext>`` where
+        ``output_idx`` is the position of the output within the cell. Using the
+        position of the MIME type inside a single output instead would give several
+        outputs of one cell the same name, so they would overwrite each other.
+
+        Args:
+            cell_idx: Index of the cell in the notebook, used in artifact names.
+            outputs: The cell's outputs, iterated in order.
+        """
+        for output_idx, output in enumerate(outputs):
+            if output.output_type == "stream":
+                self._log_stream_output(cell_idx, output_idx, output)
+                continue
+            if not hasattr(output, "data"):
+                continue
+            for mime_type in output.data:
+                if mime_type == "text/plain":
+                    mlflow.log_text(
+                        output.data[mime_type],
+                        f"cell_{cell_idx}_output_{output_idx}.txt",
+                    )
+                elif mime_type == "text/html":
+                    self._log_html_output(output, cell_idx, output_idx)
+                elif mime_type == "application/pdf":
+                    self._log_pdf_output(output, cell_idx, output_idx)
+                elif mime_type.startswith("image"):
+                    self._log_image_output(output, cell_idx, output_idx, mime_type)
+
+    def _log_stream_output(self, cell_idx, output_idx, output):
+        """Log a stream output's text as ``cell_<cell_idx>_output_<output_idx>.txt``."""
+        # join() accepts the text either as one string or as a list of fragments.
+        stream_text = "".join(output.text)
+        artifact_file = f"cell_{cell_idx}_output_{output_idx}.txt"
+        mlflow.log_text(stream_text, artifact_file)
+
+    def _log_html_output(self, output, cell_idx, output_idx):
+        """Log an output's ``text/html`` payload as an ``.html`` text artifact.
+
+        Does nothing when the output has no ``text/html`` entry. A payload given as
+        a list of strings is concatenated before logging.
+        """
+        if "text/html" not in output.data:
+            return
+        markup = output.data["text/html"]
+        markup = "".join(markup) if isinstance(markup, list) else markup
+        mlflow.log_text(markup, f"cell_{cell_idx}_output_{output_idx}.html")
+
+    def _log_pdf_output(self, output, cell_idx, output_idx):
+        """Decode an output's ``application/pdf`` payload and log it as an artifact.
+
+        The payload is base64 text, optionally preceded by a data-URI style prefix
+        ending in ``,`` (handled the same way as in ``_log_image_output``). The
+        decoded bytes are written to ``cell_<cell_idx>_output_<output_idx>.pdf`` in
+        the current working directory, logged, and the file is then removed even if
+        logging fails.
+
+        Raises:
+            binascii.Error: If the payload is not valid base64.
+        """
+        pdf_data_str = output.data["application/pdf"]
+        # Only strip a prefix when one is present; indexing [1] unconditionally
+        # raises IndexError on a bare base64 payload.
+        if "," in pdf_data_str:
+            pdf_data_base64 = pdf_data_str.split(",")[1]
+        else:
+            pdf_data_base64 = pdf_data_str
+        pdf_data = base64.b64decode(pdf_data_base64)
+
+        filename = f"cell_{cell_idx}_output_{output_idx}.pdf"
+        try:
+            with open(filename, "wb") as pdf_file:
+                pdf_file.write(pdf_data)
+            mlflow.log_artifact(filename)
+        finally:
+            # The file only exists to give the artifact its name; never leave it behind.
+            if os.path.exists(filename):
+                os.remove(filename)
+
+    def _log_image_output(self, output, cell_idx, output_idx, mime_type):
+        """Write an image output to a temporary file and log it as an MLflow artifact.
+
+        Best effort: any failure while decoding, writing or logging is printed and
+        swallowed so that one bad image does not abort logging of the rest.
+
+        ``image/svg+xml`` payloads are treated as XML text and saved with an
+        ``.svg`` extension; every other image type is treated as base64 (with an
+        optional prefix ending in ``,``) and saved with the MIME subtype as its
+        extension.
+
+        Args:
+            output: Notebook output whose ``data[mime_type]`` holds the image.
+            cell_idx: Index of the cell, used in the artifact name.
+            output_idx: Index of the output, used in the artifact name.
+            mime_type: MIME type key of the image inside ``output.data``.
+        """
+        image_data_str = output.data[mime_type]
+        is_svg = mime_type == "image/svg+xml"
+        # SVG text may legitimately contain commas, so only strip a prefix from
+        # base64 payloads.
+        if not is_svg and "," in image_data_str:
+            image_data_str = image_data_str.split(",")[1]
+
+        filename = None
+        try:
+            if is_svg:
+                # SVG is XML text (possibly split into a list of lines), not base64.
+                if isinstance(image_data_str, list):
+                    image_data_str = "".join(image_data_str)
+                image_data = image_data_str.encode("utf-8")
+                image_extension = "svg"
+            else:
+                image_data = base64.b64decode(image_data_str)
+                image_extension = mime_type.split("/")[1]
+            filename = f"cell_{cell_idx}_output_{output_idx}.{image_extension}"
+            with open(filename, "wb") as image_file:
+                image_file.write(image_data)
+            mlflow.log_artifact(filename)
+        except Exception as e:
+            print(f"Error logging image output in cell {cell_idx}, output {output_idx}: {e}")
+        finally:
+            # Remove the temporary file even when logging failed part-way through.
+            if filename is not None and os.path.exists(filename):
+                os.remove(filename)
+
+    def _log_markdown_output(self, cell, cell_idx):
+        """Log a markdown cell's source as the artifact ``cell_<cell_idx>_output_0.md``."""
+        markdown_source = cell.source
+        mlflow.log_text(markdown_source, f"cell_{cell_idx}_output_0.md")
 
     def supported_features(cls) -> Dict[JobFeature, bool]:
         return {
diff --git a/jupyter_scheduler/scheduler.py b/jupyter_scheduler/scheduler.py
@@ -3,8 +3,9 @@
 import random
 import shutil
 from typing import Dict, List, Optional, Type, Union
+import signal
 import subprocess
-from typing import Dict, Optional, Type, Union
+import sys
 from uuid import uuid4
 
 import fsspec
@@ -408,17 +409,31 @@ class Scheduler(BaseScheduler):
     task_runner = Instance(allow_none=True, klass="jupyter_scheduler.task_runner.BaseTaskRunner")
 
     def start_mlflow_server(self):
+        """Launch ``mlflow server`` as a child process and point MLflow at it.
+
+        The server is started on ``MLFLOW_SERVER_HOST``:``MLFLOW_SERVER_PORT`` and
+        the MLflow tracking URI is set to ``MLFLOW_SERVER_URI``. The call does not
+        wait for the server to finish starting.
+
+        Returns:
+            The ``subprocess.Popen`` handle of the launched server.
+        """
-        subprocess.Popen(
+        mlflow_process = subprocess.Popen(
             [
                 "mlflow",
                 "server",
                 "--host",
                 MLFLOW_SERVER_HOST,
                 "--port",
                 MLFLOW_SERVER_PORT,
-            ]
+            ],
+            # Run the server in its own session (and thus its own process group)
+            # so stop_mlflow_server can signal the whole group with os.killpg.
+            preexec_fn=os.setsid,
         )
         mlflow.set_tracking_uri(MLFLOW_SERVER_URI)
+        return mlflow_process
+
+    def stop_mlflow_server(self, timeout=10):
+        """Stop the MLflow server's process group, if one is being tracked.
+
+        Sends ``SIGTERM`` to the process group of ``self.mlflow_process`` and waits
+        up to ``timeout`` seconds for the process to exit; if it has not exited by
+        then, the group is sent ``SIGKILL``. A process group that no longer exists
+        is treated as already stopped. Does nothing when ``self.mlflow_process`` is
+        ``None``.
+
+        Args:
+            timeout: Seconds to wait after ``SIGTERM`` before escalating to
+                ``SIGKILL``.
+        """
+        if self.mlflow_process is None:
+            return
+
+        # Clear the attribute first so a repeated call does not signal the same
+        # process again.
+        process, self.mlflow_process = self.mlflow_process, None
+        try:
+            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+            try:
+                process.wait(timeout=timeout)
+            except subprocess.TimeoutExpired:
+                # The server ignored SIGTERM; do not let shutdown hang on it.
+                os.killpg(os.getpgid(process.pid), signal.SIGKILL)
+                process.wait()
+        except ProcessLookupError:
+            # The process (group) is already gone; nothing left to stop.
+            pass
+        print("MLFlow server stopped")
+
+    def mlflow_signal_handler(self, signum, frame):
+        """Signal handler that stops the MLflow server and exits with status 0.
+
+        Args:
+            signum: Number of the received signal (unused).
+            frame: Stack frame that was interrupted by the signal (unused).
+
+        Raises:
+            SystemExit: Always, with exit code 0, after the server has been stopped.
+        """
+        print("Shutting down MLFlow server")
+        self.stop_mlflow_server()
+        raise SystemExit(0)
 
     def __init__(
         self,
@@ -435,7 +450,9 @@ def __init__(
         if self.task_runner_class:
             self.task_runner = self.task_runner_class(scheduler=self, config=config)
 
-        self.start_mlflow_server()
+        self.mlflow_process = self.start_mlflow_server()
+        signal.signal(signal.SIGINT, self.mlflow_signal_handler)
+        signal.signal(signal.SIGTERM, self.mlflow_signal_handler)
 
     @property
     def db_session(self):