
Commit afe60d6

docs: add LSF and Slurm HPC guides (#33)
1 parent c2300e7 commit afe60d6

6 files changed (+804, −43 lines)


.vitepress/config.mts

Lines changed: 12 additions & 5 deletions
````diff
@@ -34,10 +34,16 @@ export default defineConfig({
       { text: "Guided Tour", link: "/guided-tour", docFooterText: "Getting Started > Guided Tour" },
     ],
   },
+  {
+    text: "Concepts",
+    items: [
+      { text: "Provenance Tracking", link: "/concepts/provenance", docFooterText: "Concepts > Provenance Tracking" },
+    ],
+  },
   {
     text: "Configuration",
     items: [
-      { text: "Overview", link: "/configuration/overview", docFooterText: "Configuration > Configuration" },
+      { text: "Overview", link: "/configuration/overview", docFooterText: "Configuration > Overview" },
       {
         text: "Execution Backends",
         collapsed: true,
@@ -64,9 +70,10 @@ export default defineConfig({
     ],
   },
   {
-    text: "Concepts",
+    text: "End-to-end Guides",
     items: [
-      { text: "Provenance Tracking", link: "/concepts/provenance", docFooterText: "Concepts > Provenance Tracking" },
+      { text: "LSF + Apptainer", link: "/guides/lsf", docFooterText: "End-to-end Guides > LSF + Apptainer" },
+      { text: "Slurm + Apptainer", link: "/guides/slurm", docFooterText: "End-to-end Guides > Slurm + Apptainer" },
     ],
   },
   {
@@ -90,8 +97,8 @@ export default defineConfig({
     ]
   },
   {
-    text: "Visual Studio Code Extension",
-    items: [{ text: "Getting Started", link: "/vscode/getting-started", docFooterText: "Extension > Getting Started" }],
+    text: "Editor Integrations",
+    items: [{ text: "Visual Studio Code", link: "/vscode/getting-started", docFooterText: "Editor Integrations > Visual Studio Code" }],
   },
 ],
 socialLinks: [
````

concepts/provenance.md

Lines changed: 5 additions & 2 deletions
````diff
@@ -80,12 +80,13 @@ the run directory.
 ```
 ./out/
 ├── sprocket.db                   # SQLite provenance database
-├── output.log                    # Execution log
 ├── runs/
 │   └── <target>/
 │       ├── <timestamp>/          # Individual run (YYYY-MM-DD_HHMMSSffffff)
+│       │   ├── output.log        # Execution log
 │       │   ├── inputs.json       # Serialized inputs for the run
 │       │   ├── outputs.json      # Serialized outputs from the run
+│       │   ├── apptainer-images/ # Cached SIF images (Apptainer backends only)
 │       │   ├── tmp/              # Temporary localization files
 │       │   └── attempts/
 │       │       └── <n>/          # Attempt number (0, 1, 2, ...)
@@ -108,12 +109,13 @@ subdirectory under `calls/`. Each call directory then contains the same
 ```
 ./out/
 ├── sprocket.db
-├── output.log
 ├── runs/
 │   └── <target>/
 │       ├── <timestamp>/
+│       │   ├── output.log
 │       │   ├── inputs.json
 │       │   ├── outputs.json
+│       │   ├── apptainer-images/   # Cached SIF images (Apptainer backends only)
 │       │   ├── tmp/                # Workflow-level temporary files
 │       │   └── calls/              # Task execution directories
 │       │       └── <task_call_id>/ # One per task call in the workflow
@@ -165,6 +167,7 @@ the following:
 | `output.log` | Log of all messages emitted during the run |
 | `inputs.json` | Serialized inputs provided for the run |
 | `outputs.json` | Serialized outputs produced by the run |
+| `apptainer-images/` | Cached SIF container images pulled during the run (Apptainer backends only) |
 | `tmp/` | Temporary files used during input localization |
 | `attempts/` | Directory containing attempt subdirectories (task runs) |
 | `calls/` | Directory containing per-task-call subdirectories (workflow runs) |
````

configuration/backends/lsf.md

Lines changed: 30 additions & 18 deletions
````diff
@@ -1,15 +1,19 @@
 # LSF + Apptainer backend
 
 Sprocket contains an experimental High-Performance Computing (HPC) backend
-targeting environments that use [LSF
-10.1.0](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0) for job scheduling and
-[Apptainer 1.3.6](https://apptainer.org/docs/user/1.3/) as a container runtime.
+targeting environments that use [LSF 10.1.0 or
+later](https://www.ibm.com/docs/en/spectrum-lsf/10.1.0) for job scheduling and
+[Apptainer 1.3.6 or later](https://apptainer.org/docs/user/1.3/) as a container
+runtime.
 
 > [!WARNING]
 >
 > This backend is experimental, and its behavior and configuration may change
 > substantially between Sprocket releases.
 
+For a step-by-step walkthrough of setting up Sprocket on an LSF cluster, see the
+[LSF + Apptainer guide](/guides/lsf).
+
 To execute WDL workflows and tasks using the LSF + Apptainer backend, you must
 be running Sprocket on a Linux system with the LSF command-line tools available
 locally. The nodes where LSF dispatches jobs must have the Apptainer
@@ -21,56 +25,64 @@ using the HPC:
 ```toml
 # The LSF + Apptainer backend requires explicitly opting into experimental
 # features.
-run.experimental_features_enabled = true
+[run]
+experimental_features_enabled = true
 
 # Set the default backend to LSF + Apptainer.
-run.backends.default.type = "lsf_apptainer"
+[run.backends.default]
+type = "lsf_apptainer"
 
 # The LSF queue used by default for task execution.
 #
 # This parameter is optional. If it's absent and no other applicable queues
 # are specified, jobs will be submitted to your LSF cluster's default queue.
-# run.backends.default.default_lsf_queue.name = "standard"
+# default_lsf_queue.name = "standard"
 # The largest number of CPUs and memory that can be reserved for a single job
 # on this queue.
 #
 # These parameters are optional, and should be set according to site-specific
 # information about the hosts available to dispatch work from the queue. They
 # can also be set for the other types of queues, but this example leaves them
 # unconstrained by default.
-# run.backends.default.default_lsf_queue.max_cpu_per_task = 64
-# run.backends.default.default_lsf_queue.max_memory_per_task = "96 GB"
+# default_lsf_queue.max_cpu_per_task = 64
+# default_lsf_queue.max_memory_per_task = "96 GB"
 
 # The LSF queue used for short tasks.
 #
 # This parameter is optional, and overrides `default_lsf_queue`.
-# run.backends.default.short_task_lsf_queue.name = "short"
+# short_task_lsf_queue.name = "short"
 
 # The LSF queue used for GPU tasks.
 #
 # This parameter is optional, and overrides `default_lsf_queue` and
 # `short_task_lsf_queue`.
-# run.backends.default.gpu_lsf_queue.name = "gpu"
+# gpu_lsf_queue.name = "gpu"
 
 # The LSF queue used for FPGA tasks.
 #
 # This parameter is optional, and overrides `default_lsf_queue` and
 # `short_task_lsf_queue`.
-# run.backends.default.fpga_lsf_queue.name = "fpga"
+# fpga_lsf_queue.name = "fpga"
 
 # Additional command-line arguments to pass to `bsub` when submitting jobs to
 # LSF.
-# run.backends.default.extra_bsub_args = ["-app", "my_app_profile"]
+# extra_bsub_args = ["-app", "my_app_profile"]
 
-# The maximum number of subtasks each `scatter` will try executing at once.
-#
-# This is *not* a direct limit on the total number of concurrent tasks, but
-# can affect the number of jobs that get queued at one time.
-# run.backends.default.max_scatter_concurrency = 100
+# The maximum number of concurrent `bsub` processes the backend will spawn to
+# queue tasks. Defaults to `10`. Consider raising this for large-scale
+# workflow execution.
+# max_concurrency = 10
+
+# Prefix added to every LSF job name. Useful for identifying Sprocket jobs
+# in `bjobs` output (e.g., `bjobs -J "sprocket*"`).
+# job_name_prefix = "sprocket"
+
+# Task monitor polling interval in seconds. Defaults to `30`.
+# interval = 30
 
 # Additional command-line arguments to pass to `apptainer exec` when executing
 # tasks.
-# run.backends.default.extra_apptainer_exec_args = ["--hostname=\"my_host\""]
+# extra_apptainer_exec_args = ["--hostname=\"my_host\""]
 ```
 
 If you run into problems or have other feedback, please reach out to us in the
````
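
Taken together, the LSF options added above reduce to a small opt-in block. Below is a minimal sketch of a Sprocket configuration file using the table-style layout introduced in this commit; the queue name and job-name prefix are illustrative values, not defaults, and should be adjusted to queues that actually exist on your cluster:

```toml
# Minimal sketch assembled from the options shown in the diff above.
# The LSF + Apptainer backend is experimental and must be opted into.
[run]
experimental_features_enabled = true

[run.backends.default]
type = "lsf_apptainer"

# Illustrative queue name; use a queue that exists on your cluster.
default_lsf_queue.name = "standard"

# Optional: makes Sprocket jobs easy to find with `bjobs -J "sprocket*"`.
job_name_prefix = "sprocket"
```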

configuration/backends/slurm.md

Lines changed: 18 additions & 18 deletions
````diff
@@ -1,15 +1,19 @@
 # Slurm + Apptainer backend
 
 Sprocket contains an experimental High-Performance Computing (HPC) backend
-targeting environments that use [Slurm
-25.05.0](https://slurm.schedmd.com/archive/slurm-25.05.0/) for job scheduling and
-[Apptainer 1.3.6](https://apptainer.org/docs/user/1.3/) as a container runtime.
+targeting environments that use [Slurm 25.05.0 or
+later](https://slurm.schedmd.com/archive/slurm-25.05.0/) for job scheduling and
+[Apptainer 1.3.6 or later](https://apptainer.org/docs/user/1.3/) as a container
+runtime.
 
 > [!WARNING]
 >
 > This backend is experimental, and its behavior and configuration may change
 > substantially between Sprocket releases.
 
+For a step-by-step walkthrough of setting up Sprocket on a Slurm cluster, see the
+[Slurm + Apptainer guide](/guides/slurm).
+
 To execute WDL workflows and tasks using the Slurm + Apptainer backend, you must
 be running Sprocket on a Linux system with the Slurm command-line tools available
 locally. The nodes where Slurm dispatches jobs must have the Apptainer
@@ -21,53 +25,49 @@ using the HPC:
 ```toml
 # The Slurm + Apptainer backend requires explicitly opting into experimental
 # features.
-run.experimental_features_enabled = true
+[run]
+experimental_features_enabled = true
 
 # Set the default backend to Slurm + Apptainer.
-run.backends.default.type = "slurm_apptainer"
+[run.backends.default]
+type = "slurm_apptainer"
 
 # The Slurm partition used by default for task execution.
 #
 # This parameter is optional. If it's absent and no other applicable
 # partitions are specified, jobs will be submitted to your Slurm cluster's
 # default partition.
-run.backends.default.default_slurm_partition.name = "gpu"
+default_slurm_partition.name = "gpu"
 # The largest number of CPUs and memory that can be reserved for a single job
 # on this partition.
 #
 # These parameters are optional, and should be set according to site-specific
 # information about the hosts available to dispatch work from the partition.
 # They can also be set for the other types of partitions, but this example
 # leaves them unconstrained by default.
-run.backends.default.default_slurm_partition.max_cpu_per_task = 64
-run.backends.default.default_slurm_partition.max_memory_per_task = "96 GB"
+default_slurm_partition.max_cpu_per_task = 64
+default_slurm_partition.max_memory_per_task = "96 GB"
 
 # The Slurm partition used for short tasks.
 #
 # This parameter is optional, and overrides `default_slurm_partition`.
-run.backends.default.short_task_slurm_partition.name = "short"
+short_task_slurm_partition.name = "short"
 
 # The Slurm partition used for GPU tasks.
 #
 # This parameter is optional, and overrides `default_slurm_partition` and
 # `short_task_slurm_partition`.
-run.backends.default.gpu_slurm_partition.name = "gpu"
+gpu_slurm_partition.name = "gpu"
 
 # The Slurm partition used for FPGA tasks.
 #
 # This parameter is optional, and overrides `default_slurm_partition` and
 # `short_task_slurm_partition`.
-run.backends.default.fpga_slurm_partition.name = "fpga"
+fpga_slurm_partition.name = "fpga"
 
 # Additional command-line arguments to pass to `sbatch` when submitting jobs
 # to Slurm.
-run.backends.default.extra_sbatch_args = ["--time=60"]
-
-# The maximum number of subtasks each `scatter` will try executing at once.
-#
-# This is *not* a direct limit on the total number of concurrent tasks, but
-# can affect the number of jobs that get queued at one time.
-run.backends.default.max_scatter_concurrency = 200
+extra_sbatch_args = ["--time=60"]
 ```
 
 If you run into problems or have other feedback, please reach out to us in the
````
0 commit comments

Comments
 (0)