Commit f6066bd

Authored by JasonWeill, pre-commit-ci[bot], and dlqqq
[1.x] Archiving scheduler, fix JFM tests (#418)
* Archiving all-files scheduler (#388)
  * Fix typo in comment
  * WIP: Adds new scheduler
  * writes individual files
  * WIP: Write zip file
  * WIP: Trying to get zip file to be written only on scheduled job runs
  * [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
  * WIP: Removes zip type, incremental work for archiving work dir
  * [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
  * Create tar.gz in staging subdir
  * Capture side effect files in staging dir
  * Extracts files
  * Add filter
  * [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
  * Update jupyter_scheduler/job_files_manager.py (Co-authored-by: david qiu <[email protected]>)
  * Simplifies cleanup logic
  * [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
  * Updates docs, deletes old Archiving*, renames AllFilesArchiving
* Avoids option compatible only with Python 3.11
* Fix JFM tests (#424)
  * fix JFM tests
  * pre-commit
  * add minor comment

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: david qiu <[email protected]>
1 parent 3efcbb4 commit f6066bd

File tree

5 files changed: +79, -45 lines changed

docs/operators/index.md

Lines changed: 14 additions & 0 deletions
@@ -86,6 +86,20 @@ jupyter lab --SchedulerApp.job_files_manager_class=jupyter_scheduler.job_files_m

 For more information on writing a custom implementation, please see the {doc}`developer's guide </developers/index>`.

+### Example: Capturing side effect files
+
+The default scheduler and execution manager classes do not capture
+**side effect files**, files that are created as a side effect of executing
+cells in a notebook. The `ArchivingScheduler` and `ArchivingExecutionManager`
+classes do capture side effect files. If you intend to run notebooks that produce
+side effect files, you can use these classes by running:
+
+```
+jupyter lab \
+--SchedulerApp.scheduler_class=jupyter_scheduler.scheduler.ArchivingScheduler \
+--Scheduler.execution_manager_class=jupyter_scheduler.executors.ArchivingExecutionManager
+```
+
 ## UI configuration

 You can configure the Jupyter Scheduler UI by installing a lab extension that both:
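The CLI flags in the new docs example map directly onto traitlets configuration, so the same setup can also live in a Jupyter Server config file. A minimal sketch, assuming a standard `jupyter_server_config.py` picked up by Jupyter Server's usual config loading (the file name and location are conventions, not part of this change):

```
# jupyter_server_config.py -- sketch of the equivalent traitlets configuration
c = get_config()  # noqa -- provided by Jupyter's config loader
c.SchedulerApp.scheduler_class = "jupyter_scheduler.scheduler.ArchivingScheduler"
c.Scheduler.execution_manager_class = "jupyter_scheduler.executors.ArchivingExecutionManager"
```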

jupyter_scheduler/executors.py

Lines changed: 33 additions & 17 deletions
@@ -1,4 +1,6 @@
 import io
+import os
+import shutil
 import tarfile
 import traceback
 from abc import ABC, abstractmethod
@@ -174,12 +176,12 @@ def validate(cls, input_path: str) -> bool:


 class ArchivingExecutionManager(DefaultExecutionManager):
-    """Execution manager that archives the output
-    files to a compressed tar file.
+    """Execution manager that archives all output files in and under the
+    output directory into a single archive file

     Notes
     -----
-    Should be used along with :class:`~jupyter_scheduler.scheduler.ArchiveDownloadingScheduler`
+    Should be used along with :class:`~jupyter_scheduler.scheduler.ArchivingScheduler`
     as the `scheduler_class` during jupyter server start.
     """

@@ -197,27 +199,41 @@ def execute(self):
             store_widget_state=True,
         )

+        # Get the directory of the input file
+        local_staging_dir = os.path.dirname(self.staging_paths["input"])
+        # Directory where side-effect files are written
+        run_dir = os.path.join(local_staging_dir, "files")
+        os.mkdir(run_dir)
+
         try:
-            ep.preprocess(nb)
+            ep.preprocess(nb, {"metadata": {"path": run_dir}})
         except CellExecutionError as e:
             pass
         finally:
+            # Create all desired output files, other than "input" and "tar.gz"
+            for output_format in job.output_formats:
+                if output_format == "input" or output_format == "tar.gz":
+                    pass
+                else:
+                    cls = nbconvert.get_exporter(output_format)
+                    output, resources = cls().from_notebook_node(nb)
+                    f = open(self.staging_paths[output_format], "wb")
+                    f.write(bytes(output, "utf-8"))
+                    f.close()
+
+            # Create an archive file of the staging directory for this run
+            # and everything under it
             fh = io.BytesIO()
             with tarfile.open(fileobj=fh, mode="w:gz") as tar:
-                output_formats = job.output_formats + ["input"]
-                for output_format in output_formats:
-                    if output_format == "input":
-                        with open(self.staging_paths["input"]) as f:
-                            output = f.read()
-                    else:
-                        cls = nbconvert.get_exporter(output_format)
-                        output, resources = cls().from_notebook_node(nb)
-                    data = bytes(output, "utf-8")
-                    source_f = io.BytesIO(initial_bytes=data)
-                    info = tarfile.TarInfo(self.staging_paths[output_format])
-                    info.size = len(data)
-                    tar.addfile(info, source_f)
+                for root, dirs, files in os.walk(local_staging_dir):
+                    for file in files:
+                        # This flattens the directory structure, so that in the tar
+                        # file, output files and side-effect files are side-by-side
+                        tar.add(os.path.join(root, file), file)

             archive_filepath = self.staging_paths["tar.gz"]
             with fsspec.open(archive_filepath, "wb") as f:
                 f.write(fh.getvalue())
+
+            # Clean up the side-effect files in the run directory
+            shutil.rmtree(run_dir)
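The new archive loop in `execute()` walks the per-job staging directory and adds each file under its bare filename, so the directory layout is flattened inside the tar. A self-contained sketch of that flattening behavior, using hypothetical file names rather than anything produced by the scheduler:

```
import io
import os
import shutil
import tarfile
import tempfile

# Hypothetical staging layout: one top-level output plus one side-effect file
# nested under a "files/" subdirectory, mirroring the run_dir created above.
staging_dir = tempfile.mkdtemp()
os.mkdir(os.path.join(staging_dir, "files"))
for relpath in ("report.html", os.path.join("files", "plot.png")):
    with open(os.path.join(staging_dir, relpath), "w") as f:
        f.write("placeholder")

fh = io.BytesIO()
with tarfile.open(fileobj=fh, mode="w:gz") as tar:
    for root, dirs, files in os.walk(staging_dir):
        for file in files:
            # The second argument is the archive name, so nesting is dropped.
            tar.add(os.path.join(root, file), file)

fh.seek(0)
with tarfile.open(fileobj=fh, mode="r:gz") as tar:
    print(tar.getnames())  # ['report.html', 'plot.png'] -- order may vary

shutil.rmtree(staging_dir)
```

One consequence of flattening is that two files with the same basename in different subdirectories would end up with colliding member names in the archive.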

jupyter_scheduler/job_files_manager.py

Lines changed: 9 additions & 14 deletions
@@ -52,37 +52,32 @@ def __init__(

     def generate_filepaths(self):
         """A generator that produces filepaths"""
-        output_dir = self.output_dir
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-
         output_formats = self.output_formats + ["input"]

         for output_format in output_formats:
             input_filepath = self.staging_paths[output_format]
-            output_filename = self.output_filenames[output_format]
-            output_filepath = os.path.join(output_dir, output_filename)
+            output_filepath = os.path.join(self.output_dir, self.output_filenames[output_format])
             if not os.path.exists(output_filepath) or self.redownload:
                 yield input_filepath, output_filepath

     def download_tar(self, archive_format: str = "tar"):
         archive_filepath = self.staging_paths[archive_format]
         read_mode = "r:gz" if archive_format == "tar.gz" else "tar"
+
         with fsspec.open(archive_filepath) as f:
             with tarfile.open(fileobj=f, mode=read_mode) as tar:
-                filepaths = self.generate_filepaths()
-                for input_filepath, output_filepath in filepaths:
-                    try:
-                        input_file = tar.extractfile(member=input_filepath)
-                        with fsspec.open(output_filepath, mode="wb") as output_file:
-                            output_file.write(input_file.read())
-                    except Exception as e:
-                        pass
+                tar.extractall(self.output_dir)

     def download(self):
+        # ensure presence of staging paths
         if not self.staging_paths:
             return

+        # ensure presence of output dir
+        output_dir = self.output_dir
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
         if "tar" in self.staging_paths:
             self.download_tar()
         elif "tar.gz" in self.staging_paths:
jupyter_scheduler/scheduler.py

Lines changed: 10 additions & 6 deletions
@@ -686,7 +686,7 @@ def get_staging_paths(self, model: Union[DescribeJob, DescribeJobDefinition]) ->


 class ArchivingScheduler(Scheduler):
-    """Scheduler that adds archive path to staging paths."""
+    """Scheduler that captures all files in output directory in an archive."""

     execution_manager_class = TType(
         klass="jupyter_scheduler.executors.ExecutionManager",
@@ -705,12 +705,16 @@ def get_staging_paths(self, model: Union[DescribeJob, DescribeJobDefinition]) ->
             filename = create_output_filename(
                 model.input_filename, model.create_time, output_format
             )
-            staging_paths[output_format] = filename
+            # Use the staging directory to capture output files
+            staging_paths[output_format] = os.path.join(self.staging_path, id, filename)

-        output_format = "tar.gz"
-        filename = create_output_filename(model.input_filename, model.create_time, output_format)
-        staging_paths[output_format] = os.path.join(self.staging_path, model.job_id, filename)
-        staging_paths["input"] = os.path.join(self.staging_path, model.job_id, model.input_filename)
+        # Create an output archive file
+        staging_paths["tar.gz"] = os.path.join(
+            self.staging_path,
+            id,
+            create_output_filename(model.input_filename, model.create_time, "tar.gz"),
+        )
+        staging_paths["input"] = os.path.join(self.staging_path, id, model.input_filename)

         return staging_paths
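For concreteness, this is roughly the dict shape `ArchivingScheduler.get_staging_paths()` now returns for a job with `ipynb` and `html` output formats; the staging root, job id, and filenames below are illustrative, with the real filenames coming from `create_output_filename()`:

```
# Illustrative values only: every entry is rooted in a per-job staging
# subdirectory, and "tar.gz" points at the archive the execution manager writes.
staging_paths = {
    "ipynb": "/staging/1234-abcd/helloworld-2023-10-05.ipynb",
    "html": "/staging/1234-abcd/helloworld-2023-10-05.html",
    "tar.gz": "/staging/1234-abcd/helloworld-2023-10-05.tar.gz",
    "input": "/staging/1234-abcd/helloworld.ipynb",
}
```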

jupyter_scheduler/tests/test_job_files_manager.py

Lines changed: 13 additions & 8 deletions
@@ -2,15 +2,14 @@
 import os
 import shutil
 import tarfile
-import tempfile
+import time
 from pathlib import Path
 from unittest.mock import patch

 import pytest

 from jupyter_scheduler.job_files_manager import Downloader, JobFilesManager
 from jupyter_scheduler.models import DescribeJob, JobFile
-from jupyter_scheduler.scheduler import BaseScheduler


 async def test_copy_from_staging():
@@ -68,6 +67,9 @@ async def test_copy_from_staging():
 def clear_outputs_dir():
     yield
     shutil.rmtree(OUTPUTS_DIR)
+    # rmtree() is not synchronous; wait until it has finished running
+    while os.path.isdir(OUTPUTS_DIR):
+        time.sleep(0.01)


 @pytest.mark.parametrize(
@@ -76,9 +78,9 @@ def clear_outputs_dir():
         (
             ["ipynb", "html"],
             {
-                "ipynb": "helloworld-out.ipynb",
-                "html": "helloworld-out.html",
-                "input": "helloworld-input.ipynb",
+                "ipynb": "job-1/helloworld-out.ipynb",
+                "html": "job-1/helloworld-out.html",
+                "input": "job-1/helloworld-input.ipynb",
             },
             {
                 "ipynb": os.path.join(HERE, "test_staging_dir", "job-1", "helloworld-1.ipynb"),
@@ -91,9 +93,9 @@ def clear_outputs_dir():
         (
             ["ipynb", "html"],
             {
-                "ipynb": "helloworld-out.ipynb",
-                "html": "helloworld-out.html",
-                "input": "helloworld-input.ipynb",
+                "ipynb": "job-2/helloworld-1.ipynb",
+                "html": "job-2/helloworld-1.html",
+                "input": "job-2/helloworld.ipynb",
             },
             {
                 "tar.gz": os.path.join(HERE, "test_staging_dir", "job-2", "helloworld.tar.gz"),
@@ -120,10 +122,13 @@ def test_downloader_download(

     assert os.path.exists(output_dir)
     for format in output_formats:
+        # get path to output file corresponding to this format
         out_filepath = os.path.join(output_dir, output_filenames[format])

+        # assert each output file exists
         assert os.path.exists(out_filepath)

+        # assert integrity of each output file
         if "tar.gz" in staging_paths:
             with tarfile.open(staging_paths["tar.gz"]) as tar:
                 input_file = tar.extractfile(member=staging_paths[format])
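The hunk is truncated above, so the remainder of the integrity check is not shown. As a rough, hypothetical sketch of the kind of comparison such a check performs for the archive case (the helper name and paths below are made up, not taken from the test):

```
import tarfile

def assert_matches_archive(archive_path: str, member: str, downloaded_path: str) -> None:
    # Compare a downloaded output file against the matching member of the staged archive.
    with tarfile.open(archive_path) as tar:
        member_file = tar.extractfile(member=member)
        assert member_file is not None
        with open(downloaded_path, "rb") as downloaded:
            assert member_file.read() == downloaded.read()
```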
