
Commit afe60d6

docs: add LSF and Slurm HPC guides (#33)
1 parent c2300e7 commit afe60d6

6 files changed (+804, −43 lines)


.vitepress/config.mts

Lines changed: 12 additions & 5 deletions
````diff
@@ -34,10 +34,16 @@ export default defineConfig({
       { text: "Guided Tour", link: "/guided-tour", docFooterText: "Getting Started > Guided Tour" },
     ],
   },
+  {
+    text: "Concepts",
+    items: [
+      { text: "Provenance Tracking", link: "/concepts/provenance", docFooterText: "Concepts > Provenance Tracking" },
+    ],
+  },
   {
     text: "Configuration",
     items: [
-      { text: "Overview", link: "/configuration/overview", docFooterText: "Configuration > Configuration" },
+      { text: "Overview", link: "/configuration/overview", docFooterText: "Configuration > Overview" },
       {
         text: "Execution Backends",
         collapsed: true,
@@ -64,9 +70,10 @@ export default defineConfig({
     ],
   },
   {
-    text: "Concepts",
+    text: "End-to-end Guides",
     items: [
-      { text: "Provenance Tracking", link: "/concepts/provenance", docFooterText: "Concepts > Provenance Tracking" },
+      { text: "LSF + Apptainer", link: "/guides/lsf", docFooterText: "End-to-end Guides > LSF + Apptainer" },
+      { text: "Slurm + Apptainer", link: "/guides/slurm", docFooterText: "End-to-end Guides > Slurm + Apptainer" },
     ],
   },
   {
@@ -90,8 +97,8 @@ export default defineConfig({
     ]
   },
   {
-    text: "Visual Studio Code Extension",
-    items: [{ text: "Getting Started", link: "/vscode/getting-started", docFooterText: "Extension > Getting Started" }],
+    text: "Editor Integrations",
+    items: [{ text: "Visual Studio Code", link: "/vscode/getting-started", docFooterText: "Editor Integrations > Visual Studio Code" }],
   },
 ],
 socialLinks: [
````

concepts/provenance.md

Lines changed: 5 additions & 2 deletions
````diff
@@ -80,12 +80,13 @@ the run directory.
 ```
 ./out/
 ├── sprocket.db                   # SQLite provenance database
-├── output.log                    # Execution log
 ├── runs/
 │   └── <target>/
 │       ├── <timestamp>/          # Individual run (YYYY-MM-DD_HHMMSSffffff)
+│       │   ├── output.log        # Execution log
 │       │   ├── inputs.json       # Serialized inputs for the run
 │       │   ├── outputs.json      # Serialized outputs from the run
+│       │   ├── apptainer-images/ # Cached SIF images (Apptainer backends only)
 │       │   ├── tmp/              # Temporary localization files
 │       │   └── attempts/
 │       │       └── <n>/          # Attempt number (0, 1, 2, ...)
@@ -108,12 +109,13 @@ subdirectory under `calls/`. Each call directory then contains the same
 ```
 ./out/
 ├── sprocket.db
-├── output.log
 ├── runs/
 │   └── <target>/
 │       ├── <timestamp>/
+│       │   ├── output.log
 │       │   ├── inputs.json
 │       │   ├── outputs.json
+│       │   ├── apptainer-images/   # Cached SIF images (Apptainer backends only)
 │       │   ├── tmp/                # Workflow-level temporary files
 │       │   └── calls/              # Task execution directories
 │       │       └── <task_call_id>/ # One per task call in the workflow
@@ -165,6 +167,7 @@ the following:
 | `output.log` | Log of all messages emitted during the run |
 | `inputs.json` | Serialized inputs provided for the run |
 | `outputs.json` | Serialized outputs produced by the run |
+| `apptainer-images/` | Cached SIF container images pulled during the run (Apptainer backends only) |
 | `tmp/` | Temporary files used during input localization |
 | `attempts/` | Directory containing attempt subdirectories (task runs) |
 | `calls/` | Directory containing per-task-call subdirectories (workflow runs) |
````

configuration/backends/lsf.md

Lines changed: 30 additions & 18 deletions
````diff
@@ -1,15 +1,19 @@
 # LSF + Apptainer backend
 
 Sprocket contains an experimental High-Performance Computing (HPC) backend
-targeting environments that use [LSF
-10.1.0](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0) for job scheduling and
-[Apptainer 1.3.6](https://apptainer.org/docs/user/1.3/) as a container runtime.
+targeting environments that use [LSF 10.1.0 or
+later](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0) for job scheduling and
+[Apptainer 1.3.6 or later](https://apptainer.org/docs/user/1.3/) as a container
+runtime.
 
 > [!WARNING]
 >
 > This backend is experimental, and its behavior and configuration may change
 > substantially between Sprocket releases.
 
+For a step-by-step walkthrough of setting up Sprocket on an LSF cluster, see the
+[LSF + Apptainer guide](/guides/lsf).
+
 To execute WDL workflows and tasks using the LSF + Apptainer backend, you must
 be running Sprocket on a Linux system with the LSF command-line tools available
 locally. The nodes where LSF dispatches jobs must have the Apptainer
@@ -21,56 +25,64 @@ using the HPC:
 ```toml
 # The LSF + Apptainer backend requires explicitly opting into experimental
 # features.
-run.experimental_features_enabled = true
+[run]
+experimental_features_enabled = true
 
 # Set the default backend to LSF + Apptainer.
-run.backends.default.type = "lsf_apptainer"
+[run.backends.default]
+type = "lsf_apptainer"
 
 # The LSF queue used by default for task execution.
 #
 # This parameter is optional. If it's absent and no other applicable queues
 # are specified, jobs will be submitted to your LSF cluster's default queue.
-# run.backends.default.default_lsf_queue.name = "standard"
+# default_lsf_queue.name = "standard"
 # The largest number of CPUs and memory that can be reserved for a single job
 # on this queue.
 #
 # These parameters are optional, and should be set according to site-specific
 # information about the hosts available to dispatch work from the queue. They
 # can also be set for the other types of queues, but this example leaves them
 # unconstrained by default.
-# run.backends.default.default_lsf_queue.max_cpu_per_task = 64
-# run.backends.default.default_lsf_queue.max_memory_per_task = "96 GB"
+# default_lsf_queue.max_cpu_per_task = 64
+# default_lsf_queue.max_memory_per_task = "96 GB"
 
 # The LSF queue used for short tasks.
 #
 # This parameter is optional, and overrides `default_lsf_queue`.
-# run.backends.default.short_task_lsf_queue.name = "short"
+# short_task_lsf_queue.name = "short"
 
 # The LSF queue used for GPU tasks.
 #
 # This parameter is optional, and overrides `default_lsf_queue` and
 # `short_task_lsf_queue`.
-# run.backends.default.gpu_lsf_queue.name = "gpu"
+# gpu_lsf_queue.name = "gpu"
 
 # The LSF queue used for FPGA tasks.
 #
 # This parameter is optional, and overrides `default_lsf_queue` and
 # `short_task_lsf_queue`.
-# run.backends.default.fpga_lsf_queue.name = "fpga"
+# fpga_lsf_queue.name = "fpga"
 
 # Additional command-line arguments to pass to `bsub` when submitting jobs to
 # LSF.
-# run.backends.default.extra_bsub_args = ["-app", "my_app_profile"]
+# extra_bsub_args = ["-app", "my_app_profile"]
 
-# The maximum number of subtasks each `scatter` will try executing at once.
-#
-# This is *not* a direct limit on the total number of concurrent tasks, but
-# can affect the number of jobs that get queued at one time.
-# run.backends.default.max_scatter_concurrency = 100
+# The maximum number of concurrent `bsub` processes the backend will spawn to
+# queue tasks. Defaults to `10`. Consider raising this for large-scale
+# workflow execution.
+# max_concurrency = 10
+
+# Prefix added to every LSF job name. Useful for identifying Sprocket jobs
+# in `bjobs` output (e.g., `bjobs -J "sprocket*"`).
+# job_name_prefix = "sprocket"
+
+# Task monitor polling interval in seconds. Defaults to `30`.
+# interval = 30
 
 # Additional command-line arguments to pass to `apptainer exec` when executing
 # tasks.
-# run.backends.default.extra_apptainer_exec_args = ["--hostname=\"my_host\""]
+# extra_apptainer_exec_args = ["--hostname=\"my_host\""]
 ```
 
 If you run into problems or have other feedback, please reach out to us in the
````
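
Taken together, the LSF options added above reduce to a small opt-in block. Below is a minimal sketch of a Sprocket configuration file using the table-style layout introduced in this commit; the queue name and job-name prefix are illustrative values, not defaults, and should be adjusted to queues that actually exist on your cluster:

```toml
# Minimal sketch assembled from the options shown in the diff above.
# The LSF + Apptainer backend is experimental and must be opted into.
[run]
experimental_features_enabled = true

[run.backends.default]
type = "lsf_apptainer"

# Illustrative queue name; use a queue that exists on your cluster.
default_lsf_queue.name = "standard"

# Optional: makes Sprocket jobs easy to find with `bjobs -J "sprocket*"`.
job_name_prefix = "sprocket"
```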

configuration/backends/slurm.md

Lines changed: 18 additions & 18 deletions
````diff
@@ -1,15 +1,19 @@
 # Slurm + Apptainer backend
 
 Sprocket contains an experimental High-Performance Computing (HPC) backend
-targeting environments that use [Slurm
-25.05.0](https://slurm.schedmd.com/archive/slurm-25.05.0/) for job scheduling and
-[Apptainer 1.3.6](https://apptainer.org/docs/user/1.3/) as a container runtime.
+targeting environments that use [Slurm 25.05.0 or
+later](https://slurm.schedmd.com/archive/slurm-25.05.0/) for job scheduling and
+[Apptainer 1.3.6 or later](https://apptainer.org/docs/user/1.3/) as a container
+runtime.
 
 > [!WARNING]
 >
 > This backend is experimental, and its behavior and configuration may change
 > substantially between Sprocket releases.
 
+For a step-by-step walkthrough of setting up Sprocket on a Slurm cluster, see the
+[Slurm + Apptainer guide](/guides/slurm).
+
 To execute WDL workflows and tasks using the Slurm + Apptainer backend, you must
 be running Sprocket on a Linux system with the Slurm command-line tools available
 locally. The nodes where Slurm dispatches jobs must have the Apptainer
@@ -21,53 +25,49 @@ using the HPC:
 ```toml
 # The Slurm + Apptainer backend requires explicitly opting into experimental
 # features.
-run.experimental_features_enabled = true
+[run]
+experimental_features_enabled = true
 
 # Set the default backend to Slurm + Apptainer.
-run.backends.default.type = "slurm_apptainer"
+[run.backends.default]
+type = "slurm_apptainer"
 
 # The Slurm partition used by default for task execution.
 #
 # This parameter is optional. If it's absent and no other applicable
 # partitions are specified, jobs will be submitted to your Slurm cluster's
 # default partition.
-run.backends.default.default_slurm_partition.name = "gpu"
+default_slurm_partition.name = "gpu"
 # The largest number of CPUs and memory that can be reserved for a single job
 # on this partition.
 #
 # These parameters are optional, and should be set according to site-specific
 # information about the hosts available to dispatch work from the partition.
 # They can also be set for the other types of partitions, but this example
 # leaves them unconstrained by default.
-run.backends.default.default_slurm_partition.max_cpu_per_task = 64
-run.backends.default.default_slurm_partition.max_memory_per_task = "96 GB"
+default_slurm_partition.max_cpu_per_task = 64
+default_slurm_partition.max_memory_per_task = "96 GB"
 
 # The Slurm partition used for short tasks.
 #
 # This parameter is optional, and overrides `default_slurm_partition`.
-run.backends.default.short_task_slurm_partition.name = "short"
+short_task_slurm_partition.name = "short"
 
 # The Slurm partition used for GPU tasks.
 #
 # This parameter is optional, and overrides `default_slurm_partition` and
 # `short_task_slurm_partition`.
-run.backends.default.gpu_slurm_partition.name = "gpu"
+gpu_slurm_partition.name = "gpu"
 
 # The Slurm partition used for FPGA tasks.
 #
 # This parameter is optional, and overrides `default_slurm_partition` and
 # `short_task_slurm_partition`.
-run.backends.default.fpga_slurm_partition.name = "fpga"
+fpga_slurm_partition.name = "fpga"
 
 # Additional command-line arguments to pass to `sbatch` when submitting jobs
 # to Slurm.
-run.backends.default.extra_sbatch_args = ["--time=60"]
-
-# The maximum number of subtasks each `scatter` will try executing at once.
-#
-# This is *not* a direct limit on the total number of concurrent tasks, but
-# can affect the number of jobs that get queued at one time.
-run.backends.default.max_scatter_concurrency = 200
+extra_sbatch_args = ["--time=60"]
 ```
 
 If you run into problems or have other feedback, please reach out to us in the
````
0 commit comments

Comments
 (0)